In [2]:
import petroeval as pet

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as colors 
from mpl_toolkits.axes_grid1 import make_axes_locatable

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as sklm

## Reading Data

In [4]:
# Read the LAS file for well "ataga 05" and turn it into a DataFrame.
# This frame is what the rest of the notebook treats as the test data.
# NOTE(review): the test well is read from the train_wells directory - confirm this is intended.
test_well_path = '../data/train_wells/ataga 05.las'
las = pet.read_lasio(test_well_path)
test_df = las.df()

In [5]:
# Preview the first five rows of the raw test-well frame
test_df.head(5)

Unnamed: 0_level_0,CALI,DT,GR,ILD,LLS,MSFL,NPHI,RHOB,SP,DEPT_1,...,PHID_ATAGA,PHIDF_ATAGA,VSHF,PHIE_ATAGA,NTG,F_ATAGA,SWIRR_ATAGA,SW_ATAGA,PERM_ATAGA,PERM_EFF_ATAGA
DEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4347.5,,,77.943199,,,,,,88.919998,1325.118042,...,,,0.227323,,0.772677,,,,,
4348.0,,,77.943199,,,,,,88.925201,1325.270386,...,,,0.227323,,0.772677,,,,,
4348.5,,,77.943199,,,,,,88.930397,1325.422852,...,,,0.227323,,0.772677,,,,,
4349.0,,,77.943199,,,,,,88.969498,1325.575195,...,,,0.227323,,0.772677,,,,,
4349.5,,,77.943199,-0.0569,,,,,89.614197,1325.727539,...,,,0.227323,,0.772677,,,,,


In [16]:
# Keep only the five log curves plus the FLUIDTYPES target in the test frame
selected_columns = ['ILD', 'SP', 'GR', 'NPHI', 'RHOB', 'FLUIDTYPES']
test_df = test_df[selected_columns]

In [7]:
# Read the three training wells (ataga 07, 10 and 11), in that order
las1, las2, las3 = (
    pet.read_lasio(well_path)
    for well_path in (
        '../data/train_wells/ataga 07.las',
        '../data/train_wells/ataga 10.las',
        '../data/train_wells/ataga 11.las',
    )
)

In [8]:
# Convert each training LAS object into its own DataFrame
train_df1, train_df2, train_df3 = (
    well_las.df() for well_las in (las1, las2, las3)
)

In [9]:
# Display all the columns within the first training dataframe
train_df1.columns

Index(['CALI', 'GR', 'ILD', 'NPHI', 'RHOB', 'SP', 'DEPT_1', 'ATAGA_LITHOLOGY',
       'FLUIDTYPES', 'IGR', 'VSH', 'FL', 'PHID_ATAGA', 'PHIDF_ATAGA', 'VSHF',
       'PHIE_ATAGA', 'NTG', 'F_ATAGA', 'SWIRR_ATAGA', 'SW_ATAGA', 'PERM_ATAGA',
       'PERM_EFF_ATAGA'],
      dtype='object')

In [10]:
# Keep only the columns needed: five log curves plus the FLUIDTYPES target.
# The same column list is applied to each training well.
selected_columns = ['ILD', 'SP', 'GR', 'NPHI', 'RHOB', 'FLUIDTYPES']
train_df1 = train_df1[selected_columns]
train_df2 = train_df2[selected_columns]
train_df3 = train_df3[selected_columns]

In [13]:
# Stack the three training wells row-wise into a single frame.
# The original index of each well is kept (no ignore_index).
train_df = pd.concat([train_df1, train_df2, train_df3], axis=0)

In [15]:
# Preview the first five rows of the combined training frame
train_df.head(5)

Unnamed: 0_level_0,ILD,SP,GR,NPHI,RHOB,FLUIDTYPES
DEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3445.5,32.172798,121.334999,,,,
3446.0,30.4552,121.351898,,,,3.0
3446.5,23.1409,120.771004,,,,3.0
3447.0,20.089199,119.5923,,,,3.0
3447.5,18.907801,118.413597,,,,3.0


In [17]:
# (rows, columns) of the combined training frame
train_df.shape

(61997, 6)

In [18]:
# (rows, columns) of the test frame
test_df.shape

(20018, 6)

## CLEANING DATA FOR EDA

In [22]:
# Fraction (0-1) of missing values per column in the train data.
# The mean of the boolean NaN mask equals NaN count / row count.
train_df.isna().mean()

ILD           0.328887
SP            0.059084
GR            0.350275
NPHI          0.547817
RHOB          0.512299
FLUIDTYPES    0.149588
dtype: float64

In [24]:
# Fraction (0-1) of missing values per column in the test data.
# The mean of the boolean NaN mask equals NaN count / row count.
test_df.isna().mean()

ILD           0.051554
SP            0.050904
GR            0.000000
NPHI          0.682086
RHOB          0.669947
FLUIDTYPES    0.001049
dtype: float64

In [37]:
# Row count per FLUIDTYPES class in the test data (NaN labels are not counted)
test_df['FLUIDTYPES'].value_counts()

3.0    13437
2.0     3492
1.0     3068
Name: FLUIDTYPES, dtype: int64

In [38]:
# Row count per FLUIDTYPES class in the train data (NaN labels are not counted)
train_df['FLUIDTYPES'].value_counts()

3.0    23326
1.0    17410
2.0    11552
0.0      435
Name: FLUIDTYPES, dtype: int64

In [39]:
# Number of training rows whose FLUIDTYPES label is missing
train_df['FLUIDTYPES'].isna().sum()

9274

In [40]:
# Number of test rows whose FLUIDTYPES label is missing
test_df['FLUIDTYPES'].isna().sum()

21

In [43]:
# Fraction of all training rows belonging to each fluid type.
# The denominator includes rows with a missing label, so the shares
# do not sum to 1 while unlabeled rows are present.
train_df['FLUIDTYPES'].value_counts() / len(train_df)

3.0    0.376244
1.0    0.280820
2.0    0.186332
0.0    0.007016
Name: FLUIDTYPES, dtype: float64

In [44]:
# Fraction of all test rows belonging to each fluid type.
# The denominator includes rows with a missing label.
test_df['FLUIDTYPES'].value_counts() / len(test_df)

3.0    0.671246
2.0    0.174443
1.0    0.153262
Name: FLUIDTYPES, dtype: float64

In [49]:
# Identify the data type of each column in the train data

train_df.dtypes

ILD           float64
SP            float64
GR            float64
NPHI          float64
RHOB          float64
FLUIDTYPES    float64
dtype: object

In [50]:
# Data type of each column in the test data
test_df.dtypes

ILD           float64
SP            float64
GR            float64
NPHI          float64
RHOB          float64
FLUIDTYPES    float64
dtype: object

In [58]:
# Drop the training rows whose FLUIDTYPES (the target) is missing.
# These rows are removed rather than filled so that no filler number
# ends up in the target values the models are trained with.
# The result is assigned back instead of using inplace=True, which keeps
# the step a plain expression and avoids mutating the frame in place.
train_df = train_df.dropna(subset=['FLUIDTYPES'])


In [59]:
# Re-check the fraction of missing values per column in the train data
train_df.isna().mean()

ILD           0.211027
SP            0.067105
GR            0.236348
NPHI          0.468562
RHOB          0.426816
FLUIDTYPES    0.000000
dtype: float64

In [60]:
# Drop the test rows whose FLUIDTYPES label is missing.
# Assigned back instead of inplace=True so the frame is not mutated in place.
test_df = test_df.dropna(subset=['FLUIDTYPES'])

In [61]:
# Re-check the fraction of missing values per column in the test data
test_df.isna().mean()

ILD           0.051408
SP            0.050958
GR            0.000000
NPHI          0.681752
RHOB          0.669600
FLUIDTYPES    0.000000
dtype: float64

In [63]:
# Fill every remaining missing value in the train data with -999.
# NOTE(review): -999 is presumably an out-of-range sentinel meaning "missing";
# confirm that downstream models are meant to see it as a real value.
# Assigned back instead of inplace=True so the frame is not mutated in place.
train_df = train_df.fillna(-999)

In [64]:
# Fill every remaining missing value in the test data with -999.
# Assigned back instead of inplace=True so the frame is not mutated in place.
test_df = test_df.fillna(-999)

In [65]:
# Confirm that no missing values remain in the train data (all fractions should be 0)
train_df.isna().mean()

ILD           0.0
SP            0.0
GR            0.0
NPHI          0.0
RHOB          0.0
FLUIDTYPES    0.0
dtype: float64

In [66]:
# Confirm that no missing values remain in the test data (all fractions should be 0)
test_df.isna().mean()

ILD           0.0
SP            0.0
GR            0.0
NPHI          0.0
RHOB          0.0
FLUIDTYPES    0.0
dtype: float64