In [1]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [2]:
import numpy as np
import pandas as pd
import scipy

# Instantiate the Plotly charting library.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px

# plotly.offline renders charts inside the notebook instead of uploading
# them to the Chart Studio cloud service.
# NOTE(review): connected=True appears to make the notebook load plotly.js
# from an online CDN, so an internet connection would still be needed to
# display the charts; connected=False embeds plotly.js instead. Confirm
# which behaviour is wanted for distribution.
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# The Cufflinks library allows us to directly bind
# Pandas dataframes to Plotly charts (used below via DataFrame.iplot).
import cufflinks as cf
# Once again we use the Cufflinks library in offline mode.
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options: cap how many rows / columns pandas shows when a
# DataFrame is displayed.
from IPython.core.display import HTML
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25

# Show the output of every expression in a code cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# interact / interact_manual are used below as decorators that build widget
# controls from a function's default arguments.
from ipywidgets import interact, interact_manual, widgets

PROJ: proj_create_from_database: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name


In [3]:
# Load the dataset from a CSV path relative to the notebook's working
# directory. `df` is read as a global by the interactive widget callbacks
# defined in the cells below.
df = pd.read_csv('data/freeman_well_4_eng.csv')

In [4]:
df.head()

Unnamed: 0,Depth,GR,Log_ILD,DT,RHOB,NPHI,PHI,PERM,Facies,velocity,Facies_code
0,7682.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,1071582.0,0
1,7683.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,1071582.0,0
2,7683.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,1071582.0,0
3,7684.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,1071582.0,0
4,7684.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,1071582.0,0


In [5]:
df.describe()

Unnamed: 0,Depth,GR,Log_ILD,DT,RHOB,NPHI,PHI,PERM,Facies,velocity,Facies_code
count,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0,7286.0
mean,9517.276352,108.163436,16.647403,117.850574,2.330201,0.423016,0.206324,9691.201,2.820615,944788.0,1.820615
std,1056.427949,22.303899,167.04614,11.625088,0.081099,0.065935,0.052322,237549.3,0.564634,405070.3,0.564634
min,7682.5,37.8108,0.0812,84.8112,1.8631,0.0995,0.0551,0.01076985,1.0,512.8205,0.0
25%,8608.125,101.914675,0.8779,108.4025,2.282625,0.3849,0.1667,1.836431,3.0,732292.3,2.0
50%,9518.75,112.48105,1.0259,114.94465,2.3471,0.4221,0.1954,6.885164,3.0,974753.9,2.0
75%,10430.875,120.165375,1.365575,127.9655,2.3916,0.461475,0.237,46.75485,3.0,1139082.0,2.0
max,11357.0,178.3208,1950.0,158.5384,2.5646,0.6569,0.5077,12113130.0,3.0,12315270.0,2.0


In [6]:
df.isnull().sum()

Depth          0
GR             0
Log_ILD        0
DT             0
RHOB           0
NPHI           0
PHI            0
PERM           0
Facies         0
velocity       0
Facies_code    0
dtype: int64

In [7]:
@interact
def correlations(column1=list(df.select_dtypes('number').columns), 
                 column2=list(df.select_dtypes('number').columns)):
    """Print the correlation between two numeric columns of ``df``.

    Both parameters default to the list of numeric column names, which
    ``interact`` turns into one drop-down per parameter.

    Args:
        column1: Name of the first column.
        column2: Name of the second column.
    """
    first_series = df[column1]
    second_series = df[column2]
    # Series.corr is called with its default settings.
    coefficient = first_series.corr(second_series)
    print(f"Correlation: {coefficient}")

interactive(children=(Dropdown(description='column1', options=('Depth', 'GR', 'Log_ILD', 'DT', 'RHOB', 'NPHI',…

In [8]:
@interact
def scatter_plot(x=list(df.select_dtypes('number').columns), 
                 y=list(df.select_dtypes('number').columns)[1:]):
    """Draw an interactive scatter plot of column ``y`` against column ``x``.

    The ``y`` choices omit the first numeric column, so the two drop-downs
    start on different columns.

    Args:
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis.
    """
    # Plotting a column against itself is not useful; ask for another choice.
    if x == y:
        print("Please select seperate variables for X and Y")
        return

    x_label = x.title()
    y_label = y.title()
    df.iplot(kind='scatter', x=x, y=y, mode='markers',
             xTitle=x_label, yTitle=y_label,
             title=f'{y_label} vs {x_label}')
    ## if you are using Google Colab, comment out the above line of code and uncomment the lines below
    #fig = px.scatter(df, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

interactive(children=(Dropdown(description='x', options=('Depth', 'GR', 'Log_ILD', 'DT', 'RHOB', 'NPHI', 'PHI'…

In [None]:
# Named colour scales offered in the heatmap's colorscale drop-down.
cscales = [
    'Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
    'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
    'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis',
]

# We use the Figure Factory module of Plotly, which
# defines many unique and powerful plots to be used
# in Python.
# For more info, see: https://plot.ly/python/figure-factory-subplots/
import plotly.figure_factory as ff

# Pairwise correlation matrix, restricted to numeric columns like every
# other cell in this notebook. On recent pandas versions DataFrame.corr()
# raises if a non-numeric column is present, so select explicitly.
corrs = df.select_dtypes('number').corr()

@interact
def plot_corrs(colorscale=cscales):
    """Render the correlation matrix ``corrs`` as an annotated heatmap.

    Args:
        colorscale: Name of the colour scale to use; ``interact`` builds a
            drop-down from the ``cscales`` list.
    """
    # Round once and use the same values for both cell colours and labels.
    rounded = corrs.round(2).values
    column_labels = list(corrs.columns)
    row_labels = list(corrs.index)

    heatmap = ff.create_annotated_heatmap(
        z=rounded,
        x=column_labels,
        y=row_labels,
        colorscale=colorscale,
        annotation_text=rounded,
    )
    iplot(heatmap)
    ## if you are using Google Colab, comment out the above line of code and uncomment the line below
    #figure.show(renderer="colab")

In [None]:
@interact_manual
def scatter_plot(x=list(df.select_dtypes('number').columns), 
                 y=list(df.select_dtypes('number').columns)[1:],
                 theme=list(cf.themes.THEMES.keys()), 
                 colorscale=list(cf.colors._scales_names.keys())):
    """Scatter plot of ``y`` against ``x`` with a selectable theme and colour scale.

    NOTE(review): this reuses the name ``scatter_plot`` from an earlier cell,
    so once this cell runs the earlier function object is shadowed.

    Args:
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis (choices omit the
            first numeric column).
        theme: Cufflinks theme name, taken from ``cf.themes.THEMES``.
        colorscale: Cufflinks colour scale name, taken from
            ``cf.colors._scales_names``.
    """
    # Plotting a column against itself is not useful; ask for another choice.
    if x == y:
        print("Please select seperate variables for X and Y")
        return

    x_label = x.title()
    y_label = y.title()
    # text='Depth' is forwarded to cufflinks; presumably it labels each
    # point with its Depth value on hover (confirm against cufflinks docs).
    df.iplot(kind='scatter', x=x, y=y, mode='markers',
             xTitle=x_label, yTitle=y_label,
             text='Depth',
             title=f'{y_label} vs {x_label}',
             theme=theme, colorscale=colorscale)
    ## if you are using Google Colab, comment out the above line of code and uncomment the line below
    #fig = px.scatter(df, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [None]:
# One seed shared by the split and every stochastic estimator, so that a
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# Split data into training and testing sets (80% / 20%).
# NOTE(review): X and y are not defined in any cell visible in this notebook.
# They must be built from the dataset (feature matrix / target vector) before
# this cell runs, otherwise this line raises NameError on a fresh kernel.
# TODO: add the cell that defines them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Define dictionary of models and their hyperparameters.
# Each entry maps a short name to:
#   'model'  - the estimator (or pipeline) to tune
#   'params' - the hyperparameter grid searched for that estimator
# For pipelines built with make_pipeline, grid keys are prefixed with the
# lower-cased step class name, e.g. 'svr__C'.
models = {
    'svm': {
        'model': make_pipeline(StandardScaler(), SVR()),
        'params': {
            'svr__kernel': ['linear', 'rbf', 'poly'],
            'svr__C': [0.1, 1, 10],
            'svr__gamma': ['scale', 'auto']
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'neural_network': {
        # The MLP is sensitive to feature scale, so it is standardised in a
        # pipeline exactly like the SVR entry above.
        'model': make_pipeline(StandardScaler(),
                               MLPRegressor(random_state=RANDOM_STATE)),
        'params': {
            'mlpregressor__hidden_layer_sizes': [(10,), (20,), (30,)],
            'mlpregressor__activation': ['relu', 'tanh', 'logistic']
        }
    }
}

In [None]:
scores = []
# Loop over each model and perform grid search with cross-validation to find best hyperparameters 
## scoring='neg_mean_squared_error'
for model_name, model in models.items():
    clf = GridSearchCV(model['model'], model['params'], cv=5, n_jobs=-1, return_train_score=False)
    clf.fit(X_train, y_train)
    best_params = clf.best_params_
    
    # Evaluate best model on test set
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = clf.score(X_test, y_test)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': best_params,
        'RMSE': rmse,
        'R-squared': r2
    })
    
df = pd.DataFrame(scores,columns=['model', 'best_score', 'best_params', 'RMSE', 'R-squared'])
df

In [None]:
import pickle

In [None]:
with open('ppmodel_pickle', 'wb') as file:
    pickle.dump(model,file)