# Combining two Excel sheets of data

## Combining chronostratigraphic data with hydrocarbon source well data

###### Import modules and Excel file with multiple sheets

In [1]:
import pandas as pd

In [2]:
# Workbook that holds both input sheets; a relative path, so it is looked up
# in the kernel's current working directory.
file = 'BB_20190227_RE_data_chronostrat_combine_test.xlsx'
# Open the workbook once so the individual sheets can be parsed from the same handle.
# NOTE(review): none of the cells shown here closes this handle (data.close()) -
# consider closing it once both sheets have been parsed.
data = pd.ExcelFile(file)

###### Extract sheets into separate DataFrames

In [3]:
# Worksheet names of the workbook; the sheets are picked by position from this list.
sheets = data.sheet_names

In [4]:
# First sheet -> chronostratigraphic table, second sheet -> sample data table.
chrono, redata = (data.parse(sheets[position]) for position in (0, 1))

###### Look for missing data in rows and columns

In [5]:
# Helper used with DataFrame.apply to count missing entries per column or per row.
def num_missing(x):
    """Return the number of missing (null/NaN) entries in a pandas Series.

    Parameters
    ----------
    x : pandas.Series
        One column (``axis=0``) or one row (``axis=1``) handed over by
        ``DataFrame.apply``.

    Returns
    -------
    int
        Count of entries for which ``isnull()`` is true; 0 for an empty Series.
    """
    # Vectorised count: the built-in sum() would walk the Series element by
    # element in Python. int() keeps the plain-int return type of the original.
    return int(x.isnull().sum())

In [6]:
# Print the missing-value counts for both tables, chrono first and redata second:
# one count per column, then the counts for the first rows only.
for frame in (chrono, redata):
    print("Missing values per column:")
    print(frame.apply(num_missing, axis=0))  # axis=0 -> one count per column

    print("\nMissing values per row:")
    print(frame.apply(num_missing, axis=1).head())  # axis=1 -> one count per row

Missing values per column:
Well             0
Period           0
Top_depth_m      0
Base_depth_m     0
Thickness_m      0
Top_age_Ma       0
Bottom_age_Ma    0
dtype: int64

Missing values per row:
0    0
1    0
2    0
3    0
4    0
dtype: int64
Missing values per column:
Well                    0
Depth m RKB             0
Formation (NPD)         0
Type of measurement     0
samplith                0
tmax                    7
s1                      0
s2                      0
s3                     31
toc                     0
HI                      0
pi                      0
company_name            0
dtype: int64

Missing values per row:
0    0
1    0
2    0
3    0
4    0
dtype: int64


In [7]:
# (rows, columns) of each table, chrono first and redata second.
for frame in (chrono, redata):
    print(frame.shape)

(105, 7)
(303, 13)


###### Merge the two DataFrames

In [8]:
# Pair every chrono row with every redata row of the same well (inner join).
# The join key is spelled out so that a column name shared by both sheets in
# the future cannot silently become part of the key. The row count per well is
# therefore the product of the two tables' counts until the depth filter in a
# later cell removes the non-matching pairs.
merge = pd.merge(chrono, redata, on='Well', how='inner')

In [9]:
# Distinct values of the 'Well' column in redata; the bare name on the last
# line displays the result as the cell output.
wells = redata['Well'].unique()
wells

array(['6406/2-1', '6406/2-2', '6406/3-1'], dtype=object)

In [10]:
# Number of distinct wells held in `wells`.
len(wells)

3

In [11]:
# Row count per well, printed for redata first, then chrono, then the merged table.
for table in (redata, chrono, merge):
    for well in wells:
        print('Number of rows of', well, len(table[table['Well'] == well]))

Number of rows of 6406/2-1 137
Number of rows of 6406/2-2 83
Number of rows of 6406/3-1 83
Number of rows of 6406/2-1 39
Number of rows of 6406/2-2 35
Number of rows of 6406/3-1 31
Number of rows of 6406/2-1 5343
Number of rows of 6406/2-2 2905
Number of rows of 6406/3-1 2573


In [12]:
# Keep only the pairs whose 'Depth m RKB' lies inside the interval of the
# chrono row it was paired with: top exclusive, base inclusive, i.e.
# Top_depth_m < Depth m RKB <= Base_depth_m.
merge = merge[
    (merge['Top_depth_m'] < merge['Depth m RKB'])
    & (merge['Base_depth_m'] >= merge['Depth m RKB'])
]

In [13]:
# The interval columns were only needed for the depth filter; remove them from
# the combined table. Rebinding the name (instead of inplace=True) avoids
# mutating a frame that was derived from a boolean filter.
merge = merge.drop(
    columns=['Top_depth_m', 'Base_depth_m', 'Thickness_m', 'Top_age_Ma', 'Bottom_age_Ma']
)

In [14]:
# Renumber the rows 0..n-1; the old index labels are discarded, not kept as a column.
merge = merge.reset_index(drop=True)

In [15]:
# Write the combined table to sheet 'combined' of combine.xlsx.
merge.to_excel('combine.xlsx', sheet_name='combined')