In [1]:
# !pip install ipywidgets --upgrade
# !pip install chart-studio --upgrade
# !pip install pyarrow --upgrade
# !pip install cufflinks --upgrade

In [2]:
# Enable the widgetsnbextension front-end extension so that ipywidgets controls
# (used by the @interact cells in this notebook) can render.
# NOTE(review): `jupyter nbextension` is a classic-Notebook command; presumably it is
# not needed (or not available) on JupyterLab / Notebook 7 -- confirm for your environment.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [3]:
import numpy as np
import pandas as pd
import scipy

# Instantiate the Plotly charting library.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
# We use plotly.offline so that figures are rendered inside the notebook
# rather than being uploaded to a Chart Studio account.
# NOTE: connected=True makes the notebook load plotly.js from a CDN, so
# rendering still needs network access; connected=False embeds the library.
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# The Cufflinks library allows us to directly bind
# Pandas dataframes to Plotly charts (it adds DataFrame.iplot).
import cufflinks as cf
# Once again we use the Cufflinks library in offline mode.
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Display helpers: HTML rendering, plus limits on how many rows and
# columns pandas prints before truncating a frame.
from IPython.core.display import HTML
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25

# Display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

PROJ: proj_create_from_database: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name


In [4]:
from ipywidgets import interact, interact_manual, widgets

In [5]:
# Load the well-log table from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/well_4_ML.csv')

In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the raw frame.
df.describe()

Unnamed: 0,DEPT,DEN,DT,GR,NEUT,RES_DEP,PHI,PERM,velocity
count,7586.0,7586.0,7586.0,7352.0,7352.0,7350.0,7586.0,7586.0,7586.0
mean,9460.75,2.326999,118.441103,108.189529,0.423099,18.933387,0.20839,9338.403,8527.682849
std,1095.016952,0.081725,11.92426,23.541196,0.066108,176.943528,0.052728,232813.7,844.76001
min,7564.5,1.8631,84.8112,32.5939,0.0995,0.0812,0.0551,0.01076985,6307.62011
25%,8512.625,2.2726,108.6035,101.771775,0.385175,0.87835,0.1673,1.887876,7762.524203
50%,9460.75,2.3425,115.4427,112.4631,0.4222,1.0256,0.1984,7.905111,8662.306062
75%,10408.875,2.3907,128.824075,120.21925,0.4614,1.3684,0.2435,63.06856,9207.806378
max,11357.0,2.5646,158.5384,219.338,0.6569,1950.0,0.5077,12113130.0,11790.895542


In [7]:
# Count the missing values in each column of the raw frame.
df.isnull().sum()

DEPT          0
DEN           0
DT            0
GR          234
NEUT        234
RES_DEP     236
PHI           0
PERM          0
Facies        0
velocity      0
dtype: int64

In [8]:
# Keep only the rows that have a value in every column. The explicit .copy()
# makes df1 an independent frame, so assigning columns to it later does not
# raise pandas' SettingWithCopyWarning.
df1 = df.dropna().copy()
# Confirm that no missing values remain.
df1.isnull().sum()

DEPT        0
DEN         0
DT          0
GR          0
NEUT        0
RES_DEP     0
PHI         0
PERM        0
Facies      0
velocity    0
dtype: int64

In [9]:
# Preview the first rows of the frame after incomplete rows were dropped.
df1.head()

Unnamed: 0,DEPT,DEN,DT,GR,NEUT,RES_DEP,PHI,PERM,Facies,velocity
236,7682.5,2.2382,137.507,39.0321,0.5983,0.9332,0.2657,175.29364,sand,7272.357044
237,7683.0,2.2382,137.507,39.0321,0.5983,0.9332,0.2657,175.29364,sand,7272.357044
238,7683.5,2.2382,137.507,39.0321,0.5983,0.9332,0.2657,175.29364,sand,7272.357044
239,7684.0,2.2382,137.507,39.0321,0.5983,0.9332,0.2657,175.29364,sand,7272.357044
240,7684.5,2.2382,137.507,39.0321,0.5983,0.9332,0.2657,175.29364,sand,7272.357044


In [10]:
# Summary statistics of the numeric columns after incomplete rows were dropped.
df1.describe()

Unnamed: 0,DEPT,DEN,DT,GR,NEUT,RES_DEP,PHI,PERM,velocity
count,7350.0,7350.0,7350.0,7350.0,7350.0,7350.0,7350.0,7350.0,7350.0
mean,9519.75,2.32985,117.828919,108.208348,0.423052,18.933387,0.20655,9632.62,8567.989909
std,1060.953286,0.081438,11.606285,23.516734,0.066053,176.943528,0.052542,236516.5,827.227799
min,7682.5,1.8631,84.8112,32.5939,0.0995,0.0812,0.0551,0.01076985,6307.62011
25%,8601.125,2.281925,108.4123,101.781475,0.385125,0.87835,0.1666,1.827995,7816.641905
50%,9519.75,2.34695,114.9414,112.4679,0.42215,1.0256,0.1955,6.916941,8700.085435
75%,10438.375,2.3917,127.932175,120.22035,0.461375,1.3684,0.237475,47.78884,9224.045613
max,11357.0,2.5646,158.5384,219.338,0.6569,1950.0,0.5077,12113130.0,11790.895542


In [11]:
from sklearn import preprocessing
from sklearn import utils
# Encoder used to convert the Facies column into integer class codes.
lab_enc = preprocessing.LabelEncoder()

In [12]:
# Number of non-null Facies entries in the cleaned frame.
df1['Facies'].count()

7350

In [13]:
# Encode the facies names as integer codes.
# The encoder is always fitted on the original labels taken from `df`
# (restricted to the rows kept in df1), so re-running this cell yields the
# same codes and lab_enc.classes_ keeps the facies names instead of being
# refitted on already-encoded integers.
# .assign() returns a new frame rather than writing into df1 in place.
df1 = df1.assign(Facies=lab_enc.fit_transform(df.loc[df1.index, 'Facies']))

In [14]:
# Distinct integer codes present in the encoded Facies column.
df1['Facies'].unique()

array([1, 3, 2, 0])

In [15]:
@interact
def correlations(column1=list(df1.select_dtypes('number').columns), 
                 column2=list(df1.select_dtypes('number').columns)):
    """Print the correlation between two numeric columns of ``df1``.

    ``interact`` turns each list default into a dropdown, so both
    arguments arrive as column names chosen by the user.

    Args:
        column1: Name of the first numeric column.
        column2: Name of the second numeric column.
    """
    first, second = df1[column1], df1[column2]
    coefficient = first.corr(second)
    print(f"Correlation: {coefficient}")

interactive(children=(Dropdown(description='column1', options=('DEPT', 'DEN', 'DT', 'GR', 'NEUT', 'RES_DEP', '…

In [16]:
@interact
def scatter_plot(x=list(df1.select_dtypes('number').columns), 
                 y=list(df1.select_dtypes('number').columns)[1:]):
    """Draw an interactive marker scatter of one ``df1`` column against another.

    Both arguments are column names picked from dropdowns; the y choices
    omit the first numeric column.

    Args:
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
    """
    # Plotting a column against itself is not useful: ask for a new selection.
    if x == y:
        print(f"Please select seperate variables for X and Y")
        return

    x_label, y_label = x.title(), y.title()
    df1.iplot(kind='scatter', x=x, y=y, mode='markers',
              xTitle=x_label, yTitle=y_label,
              title=f'{y_label} vs {x_label}')
    ## if you are using Google Colab, comment out the iplot call above and uncomment the lines below
    #fig = px.scatter(df1, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

interactive(children=(Dropdown(description='x', options=('DEPT', 'DEN', 'DT', 'GR', 'NEUT', 'RES_DEP', 'PHI', …

In [18]:
# Colour-scale names offered by the heatmap's dropdown.
cscales = ['Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
            'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
            'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis']

# We use the Figure Factory module of Plotly, which
# provides ready-made figure builders such as the
# annotated heatmap used for the correlation matrix.
# For more info, see: https://plot.ly/python/figure-factory-subplots/
import plotly.figure_factory as ff

# Pairwise correlation matrix of df1's columns, computed once and
# reused every time the heatmap is redrawn.
corrs = df1.corr()

@interact
def plot_corrs(colorscale=cscales):
    """Render the correlation matrix ``corrs`` as an annotated heatmap.

    Args:
        colorscale: Name of the colour scale, chosen from ``cscales``
            through the dropdown that ``interact`` creates.
    """
    # Coefficients rounded to two decimals serve as both cell values and cell labels.
    rounded = corrs.round(2).values
    figure = ff.create_annotated_heatmap(
        z=rounded,
        x=list(corrs.columns),
        y=list(corrs.index),
        colorscale=colorscale,
        annotation_text=rounded,
    )
    iplot(figure)
    ## if you are using Google Colab, comment out the iplot call above and uncomment the line below
    #figure.show(renderer="colab")

interactive(children=(Dropdown(description='colorscale', options=('Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Blue…

In [19]:
@interact_manual
def scatter_plot(x=list(df1.select_dtypes('number').columns), 
                 y=list(df1.select_dtypes('number').columns)[1:],
                 theme=list(cf.themes.THEMES.keys()), 
                 colorscale=list(cf.colors._scales_names.keys())):
    """Draw a themed marker scatter of one ``df1`` column against another.

    Decorated with ``interact_manual``, so the plot is redrawn only on request
    rather than on every dropdown change. Note that this definition reuses
    the name ``scatter_plot`` from an earlier cell and therefore rebinds it.

    Args:
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis (choices omit the first numeric column).
        theme: Cufflinks theme name.
        colorscale: Cufflinks colour-scale name.
    """
    # Plotting a column against itself is not useful: ask for a new selection.
    if x == y:
        print(f"Please select seperate variables for X and Y")
        return

    x_label, y_label = x.title(), y.title()
    # text='DEPT' is forwarded to cufflinks (presumably the per-point text is
    # taken from the DEPT column -- confirm against the cufflinks docs).
    df1.iplot(kind='scatter', x=x, y=y, mode='markers',
              xTitle=x_label, yTitle=y_label,
              text='DEPT',
              title=f'{y_label} vs {x_label}',
              theme=theme, colorscale=colorscale)
    ## if you are using Google Colab, comment out the iplot call above and uncomment the lines below
    #fig = px.scatter(df1, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

interactive(children=(Dropdown(description='x', options=('DEPT', 'DEN', 'DT', 'GR', 'NEUT', 'RES_DEP', 'PHI', …

In [None]:
# Modelling table: df1 without the NEUT and DT columns.
final = df1.drop(columns=['NEUT', 'DT'])
final

In [None]:
# Summary statistics of the modelling table.
final.describe()

In [None]:
# Check that the modelling table contains no missing values.
final.isnull().sum()

In [None]:
# Column dtypes of the modelling table.
final.dtypes

In [None]:
# Feature matrix: every column of the modelling table except PERM.
X = final.drop(columns='PERM')
# Target vector: the PERM column.
y = final['PERM']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible across kernel restarts (same seed as the random-forest cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import linear_model
# Fit the linear baseline on the training split only, so the held-out
# X_test / y_test rows stay unseen by the model and remain usable for evaluation.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [None]:
# Coefficients learned by the linear model, one per feature column, in column order.
reg.coef_

In [None]:
# Colour palette for the facies classes; colours are assigned to classes in order.
facies_colors = ['#F4D03F', '#F5B041', '#DC7633', '#6E2C00',
                 '#1B4F72', '#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

# Facies names in code order. The Facies column was integer-encoded with
# `lab_enc`, so code i corresponds to lab_enc.classes_[i] (codes start at 0).
facies_labels = [str(name) for name in lab_enc.classes_]

# facies_color_map is a dictionary that maps facies labels to their
# respective colors. zip stops at the shorter sequence, so only as many
# palette entries as there are classes are used.
facies_color_map = dict(zip(facies_labels, facies_colors))

def label_facies(row, labels):
    """Return the facies label for one row of the table.

    Args:
        row: A row (Series) that contains an integer-valued 'Facies' code.
        labels: Sequence of labels indexed by the 0-based facies code.

    Returns:
        The element of ``labels`` at the row's facies code.
    """
    # Row-wise apply on an all-numeric frame yields float values, so cast the
    # code back to int before using it as a list index.
    return labels[int(row['Facies'])]

# Label a copy so that `final` keeps only the numeric model inputs.
training_data = final.copy()
training_data.loc[:, 'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# The well log data is already loaded: this cell reuses the `final` table.

# Define input features and target variable
X = final[['RES_DEP', 'DEN', 'velocity', 'GR', 'Facies', 'DEPT']]
y = final['PERM']

# Split data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest regressor model
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on test data
y_pred = rf.predict(X_test)

# Calculate model performance metrics on the held-out rows
residuals = y_test - y_pred
mae = np.mean(np.abs(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
r2 = rf.score(X_test, y_test)

# Report the metrics so that the evaluation is visible in the notebook output
print(f"MAE: {mae:.4g}  MSE: {mse:.4g}  RMSE: {rmse:.4g}  R^2: {r2:.4f}")

# Make a prediction for new well log data
# NOTE(review): velocity=3.5 and DEPT=2000 lie far outside the ranges seen in
# the training table (velocity roughly 6308-11791, DEPT 7682.5-11357), so this
# is an extrapolation -- presumably a unit mismatch; confirm the intended values.
new_data = pd.DataFrame({'RES_DEP': [2.5], 'DEN': [2.3], 'velocity': [3.5], 'GR': [40], 'Facies': [1], 'DEPT': [2000]})
new_pp = rf.predict(new_data)

print(new_pp)