In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import os

In [2]:
# Load the training dataset.
# The CSV location can be overridden through the SPWLA_TRAIN_CSV environment
# variable so the notebook is runnable on machines other than the author's;
# the original absolute path is kept as the default.
train_path = os.environ.get(
    "SPWLA_TRAIN_CSV",
    r"/Users/astromeria/Documents/SPWLA_competition/data/train.csv",
)
train_data = pd.read_csv(train_path)
# NOTE: despite its name, `wells_discarded` lists *column* names to drop,
# not wells. The name is kept because later cells may reference it.
wells_discarded = ["BS", "CALI", "DENC", "ROP"]
# Columns the model is expected to predict; rows lacking any of them are removed below.
targets = ['PHIF', 'VSH', 'SW']
print(train_data.head())

# Drop the columns that are not used.
train_data = train_data.drop(columns=wells_discarded)
print(train_data.head())

# -9999 is used in the file as a "no value" sentinel; convert it to NaN so
# pandas treats those cells as missing.
train_data = train_data.mask(train_data == -9999.0, np.nan)
print(train_data.head())

# Drop every row that has NaN in any of the target columns.
print("shape before dropna on targets:", train_data.shape)
train_data = train_data.dropna(subset=targets)
print(train_data.head())
print("Shape after dropna on targets:", train_data.shape)



   WELLNUM       DEPTH     DTC     DTS      BS    CALI     DEN    DENC  \
0        0  335.160105 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
1        0  335.660105 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
2        0  336.160105 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
3        0  336.660105 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
4        0  337.160105 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   

       GR     NEU     PEF    RDEP    RMED     ROP    PHIF      SW     VSH  
0  5.3274 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0  
1  5.8235 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0  
2  6.5228 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0  
3  7.2285 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0  
4  9.5020 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0  
   WELLNUM       DEPTH     DTC     DTS     DEN      GR     NEU     PEF  \
0        0  335.160105 -9

In [3]:
#function to normalize data
def norm(x, train_stats):
  """Standardize ``x`` with precomputed statistics: ``(x - mean) / std``.

  Args:
    x: Values to normalize. The notebook passes a DataFrame of feature
      columns; anything supporting ``-`` and ``/`` works.
    train_stats: Object indexable by ``'mean'`` and ``'std'``. In this
      notebook it is the transposed ``describe()`` table, so both entries
      are Series indexed by feature name and align with ``x``'s columns.

  Returns:
    The standardized values, same shape as ``x``. A standard deviation of
    exactly zero (constant feature) is treated as 1 so the result is 0
    instead of NaN/inf (or a ZeroDivisionError for plain floats).
  """
  std = train_stats['std']
  if isinstance(std, pd.Series):
    # Keep it a Series so division still aligns on feature names.
    std = std.mask(std == 0, 1.0)
  elif np.isscalar(std) and std == 0:
    std = 1.0
  return (x - train_stats['mean']) / std

# Convert the resistivity columns to log10 of their absolute value.
# NOTE: this cell modifies `train_data` in place, so re-running it without
# re-running the loading cell applies the log transform (and the
# normalization below) a second time.
for resistivity_col in ["RDEP", "RMED"]:
  magnitude = train_data[resistivity_col].abs()
  # log10(0) is -inf, which would make the column mean -inf and its std NaN
  # in describe(), turning the whole normalized column into NaN. Treat exact
  # zeros as missing instead (NaN inputs stay NaN, as before).
  train_data[resistivity_col] = np.log10(magnitude.where(magnitude > 0))

# Normalize the input parameters only: the targets, WELLNUM and DEPTH are
# excluded. `drop` returns a new frame, so `train_data` itself is untouched here.
train_data_copy = train_data.drop(columns=["WELLNUM", "DEPTH"] + targets)
print(train_data_copy.head())

# Per-feature statistics: one row per feature after the transpose, with
# 'mean' and 'std' columns that `norm` reads.
train_stats = train_data_copy.describe(include="all").transpose()
print("train_stats", train_stats)

train_data_copy = norm(train_data_copy, train_stats)

# Write the normalized feature columns back into the full dataset.
columns = train_data_copy.columns
train_data[columns] = train_data_copy[columns]
print(train_data.head())

#train_data.to_csv("train_dataset_clean.csv", index = False)




            DTC  DTS     DEN        GR       NEU  PEF      RDEP      RMED
27573   89.6461  NaN  2.5777   88.8573  0.277790  NaN  0.420121  0.396670
27574   96.0152  NaN  2.4748  120.8949  0.294806  NaN  0.393751  0.392433
27575  104.0457  NaN  2.3338  153.5195  0.447361  NaN  0.386659  0.375517
27576  113.5010  NaN  2.2358  182.8531  0.579637  NaN  0.381386  0.382773
27577  117.7817  NaN  2.2278  256.1960  0.556317  NaN  0.368566  0.381602
train_stats         count        mean        std        min         25%         50%  \
DTC   19314.0   76.710857  12.312749   1.025100   68.909410   74.998850   
DTS    7198.0  131.021627  14.551402  74.822400  123.151225  131.898800   
DEN   42309.0    2.411064   0.160632   1.626600    2.264400    2.419800   
GR    42309.0   39.888234  24.822777   4.588597   20.069900   35.799400   
NEU   42306.0    0.181612   0.066775  -0.003400    0.144110    0.176400   
PEF   40429.0    5.185693   1.715554  -0.013100    4.583000    5.414100   
RDEP  42075.0    0.