In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [3]:
!pip install lasio
import lasio



# Load and Display the Well-log Datasets

In [17]:
# Load the well-log table from the CSV file (path is relative to the working directory).
well_log_csv = "well_train_missing.csv"
df = pd.read_csv(well_log_csv)

In [18]:
# Preview the first 20 rows to get a feel for the columns and their values.
df.iloc[:20]

Unnamed: 0,WELL,DEPTH,NPHI,RHOB,GR,RT,PEF,CALI,DT,DTS
0,15_9-F-11A,2600.0,0.371,2.356,82.748,1.323,7.126,8.648,104.605,261.584
1,15_9-F-11A,2600.1,0.341,2.338,79.399,1.196,,8.578,103.827,262.161
2,15_9-F-11A,2600.2,0.308,,74.248,1.171,6.105,8.578,102.74,262.73
3,15_9-F-11A,2600.3,0.283,2.291,68.542,,5.613,8.547,100.943,263.018
4,15_9-F-11A,2600.4,0.272,2.269,60.314,1.107,5.281,8.523,98.473,263.037
5,15_9-F-11A,2600.5,0.266,2.253,51.895,1.214,5.151,,97.171,262.396
6,15_9-F-11A,2600.6,,2.244,45.764,1.317,5.205,8.531,95.441,261.65
7,15_9-F-11A,2600.7,0.282,2.243,,1.299,5.38,,93.204,260.792
8,15_9-F-11A,2600.8,0.29,2.249,40.059,1.186,5.602,8.557,92.143,258.749
9,15_9-F-11A,2600.9,0.303,2.258,41.143,1.077,5.829,8.563,92.043,256.926


In [19]:
# Show the dtype pandas inferred for every column.
column_types = df.dtypes
print(column_types)

WELL      object
DEPTH    float64
NPHI     float64
RHOB     float64
GR       float64
RT       float64
PEF      float64
CALI     float64
DT       float64
DTS      float64
dtype: object


In [20]:
# Boolean mask of df: True marks a missing entry, False a present one.
missing_data = df.isna()
# Preview the first 20 rows of the mask.
missing_data.iloc[:20]

Unnamed: 0,WELL,DEPTH,NPHI,RHOB,GR,RT,PEF,CALI,DT,DTS
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False
2,False,False,False,True,False,False,False,False,False,False
3,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,True,False,False
6,False,False,True,False,False,False,False,False,False,False
7,False,False,False,False,True,False,False,True,False,False
8,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False


In [None]:
# Count the missing values in each column

In [21]:
# For every column, print its name followed by how many entries are present
# (False) and how many are missing (True), then a blank separator line.
for col_name in missing_data.columns:
    presence_counts = missing_data[col_name].value_counts()
    print(col_name, presence_counts, "", sep="\n")

WELL
False    24403
Name: WELL, dtype: int64

DEPTH
False    24403
Name: DEPTH, dtype: int64

NPHI
False    24399
True         4
Name: NPHI, dtype: int64

RHOB
False    24402
True         1
Name: RHOB, dtype: int64

GR
False    24401
True         2
Name: GR, dtype: int64

RT
False    24401
True         2
Name: RT, dtype: int64

PEF
False    24402
True         1
Name: PEF, dtype: int64

CALI
False    24400
True         3
Name: CALI, dtype: int64

DT
False    24403
Name: DT, dtype: int64

DTS
False    23628
True       775
Name: DTS, dtype: int64



<h3 id="deal_missing_values">Deal with missing data</h3>
<b>How to deal with missing data?</b>

<ol>
    <li>drop data<br>
        a. drop the whole row<br>
        b. drop the whole column
    </li>
    <li>replace data<br>
        a. replace it with the mean<br>
        b. replace it with the most frequent value (mode)<br>
        c. replace it using another function
    </li>
</ol>

Calculate the mean of the "NPHI" column; it is used below to fill that column's missing values.

In [25]:
# Mean of the NPHI column after casting it to float; Series.mean skips NaN
# entries by default, so missing values do not affect the average.
nphi_as_float = df["NPHI"].astype("float")
avg_NPHI = nphi_as_float.mean(axis=0)
print("Average of NPHI:", avg_NPHI)

Average of NPHI: 0.16995496946596106


Replace the NaN values in the "NPHI" column with the column mean

In [26]:
# Fill the missing NPHI values with the column mean and assign the result back.
# Calling .replace(..., inplace=True) on the df["NPHI"] selection is chained
# assignment with an inplace method: recent pandas 2.x releases warn about it,
# and with Copy-on-Write (the default from pandas 3.0) it never updates df.
# Explicit assignment works on every version and is safe to re-run.
df["NPHI"] = df["NPHI"].fillna(avg_NPHI)

In [27]:
# Re-display the first 20 rows to check the NPHI column after filling.
df.iloc[:20]

Unnamed: 0,WELL,DEPTH,NPHI,RHOB,GR,RT,PEF,CALI,DT,DTS
0,15_9-F-11A,2600.0,0.371,2.356,82.748,1.323,7.126,8.648,104.605,261.584
1,15_9-F-11A,2600.1,0.341,2.338,79.399,1.196,,8.578,103.827,262.161
2,15_9-F-11A,2600.2,0.308,,74.248,1.171,6.105,8.578,102.74,262.73
3,15_9-F-11A,2600.3,0.283,2.291,68.542,,5.613,8.547,100.943,263.018
4,15_9-F-11A,2600.4,0.272,2.269,60.314,1.107,5.281,8.523,98.473,263.037
5,15_9-F-11A,2600.5,0.266,2.253,51.895,1.214,5.151,,97.171,262.396
6,15_9-F-11A,2600.6,0.169955,2.244,45.764,1.317,5.205,8.531,95.441,261.65
7,15_9-F-11A,2600.7,0.282,2.243,,1.299,5.38,,93.204,260.792
8,15_9-F-11A,2600.8,0.29,2.249,40.059,1.186,5.602,8.557,92.143,258.749
9,15_9-F-11A,2600.9,0.303,2.258,41.143,1.077,5.829,8.563,92.043,256.926
