In [1]:
import numpy as np
import pandas as pd
import scipy

In [2]:
df = pd.read_csv('data/freeman_well_4_eng.csv')

In [3]:
df.head()

Unnamed: 0,Depth,GR,Log_ILD,DT,RHOB,NPHI,PHI,PERM,velocity,GRI,vshale,PHIeff,formation_factor,swirr,permeability,Facies_code
0,7682.5,39.0321,0.9332,137.507,2.2382,0.5983,0.0188,0.002024,7272.357044,0.0,0.0,0.0188,3183.84791,1.261715,314.443398,0
1,7683.0,39.0321,0.9332,137.507,2.2382,0.5983,0.0188,0.002024,7272.357044,0.0,0.0,0.0188,3183.84791,1.261715,314.443398,0
2,7683.5,39.0321,0.9332,137.507,2.2382,0.5983,0.0188,0.002024,7272.357044,0.0,0.0,0.0188,3183.84791,1.261715,314.443398,0
3,7684.0,39.0321,0.9332,137.507,2.2382,0.5983,0.0188,0.002024,7272.357044,0.0,0.0,0.0188,3183.84791,1.261715,314.443398,0
4,7684.5,39.0321,0.9332,137.507,2.2382,0.5983,0.0188,0.002024,7272.357044,0.0,0.0,0.0188,3183.84791,1.261715,314.443398,0


In [4]:
from sklearn import preprocessing
from sklearn import utils
lab_enc = preprocessing.LabelEncoder()

In [5]:
df['Facies_code'].count()

7265

In [6]:
df['Facies']= lab_enc.fit_transform(df['Facies_code'])

In [7]:
df['Facies'].unique()

array([0, 1, 2], dtype=int64)

In [8]:
print(df.columns)

Index(['Depth', 'GR', 'Log_ILD', 'DT', 'RHOB', 'NPHI', 'PHI', 'PERM',
       'velocity', 'GRI', 'vshale', 'PHIeff', 'formation_factor', 'swirr',
       'permeability', 'Facies_code', 'Facies'],
      dtype='object')


In [14]:
# Define input features and target variable
X = df[['Depth', 'Log_ILD', 'NPHI','RHOB', 'vshale', 'PHIeff', 'swirr', 'Facies_code']]
y = df['PERM']

In [16]:
from sklearn.feature_selection import SelectKBest, f_regression

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Select top 5 features based on F-value scores
selector = SelectKBest(f_regression, k=5)
X_new = selector.fit_transform(X, y)

# Get the indices of the selected features
selected_features = X.columns[selector.get_support(indices=True)].tolist()

# Split data into training and testing sets using selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [17]:
selected_features

['Depth', 'NPHI', 'RHOB', 'vshale', 'PHIeff']

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib

In [19]:
# Define dictionary of models and their hyperparameters
models = {
    'svm': {
        'model': make_pipeline(StandardScaler(), SVR()),
        'params': {
            'svr__kernel': ['linear', 'rbf', 'poly'],
            'svr__C': [0.1, 1, 10],
            'svr__gamma': ['scale', 'auto']
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'neural_network': {
        'model': MLPRegressor(),
        'params': {
            'hidden_layer_sizes': [(10,), (20,), (30,)],
            'activation': ['relu', 'tanh', 'logistic'],
            'max_iter': [1000, 2000]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7]
        }
    }
}

In [20]:
scores = []
# Loop over each model and perform grid search with cross-validation to find best hyperparameters 
## scoring='neg_mean_squared_error'
for model_name, model in models.items():
    clf = GridSearchCV(model['model'], model['params'], cv=5, n_jobs=-1, return_train_score=False)
    clf.fit(X_train_selected, y_train)
    best_params = clf.best_params_
    
    # Evaluate best model on test set
    y_pred = clf.predict(X_test_selected)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = clf.score(X_test_selected, y_test)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': best_params,
        'RMSE': rmse,
        'R-squared': r2
    })
    
    # Save the best model for each method
    joblib.dump(clf.best_estimator_, f"models/{model_name}_best_model.pkl")



In [21]:
best_model_df = pd.DataFrame(scores, columns=['model', 'best_params', 'best_score', 'RMSE', 'R-squared'])
best_model_df

Unnamed: 0,model,best_params,best_score,RMSE,R-squared
0,svm,"{'svr__C': 10, 'svr__gamma': 'scale', 'svr__ke...",0.176697,1425.542319,0.081623
1,random_forest,"{'max_depth': 10, 'n_estimators': 150}",0.684906,762.484097,0.737262
2,decision_tree,"{'max_depth': 10, 'min_samples_split': 2}",0.70456,383.565752,0.933512
3,neural_network,"{'activation': 'logistic', 'hidden_layer_sizes...",-0.001263,1487.710047,-0.000225
4,gradient_boosting,"{'learning_rate': 0.2, 'max_depth': 3, 'n_esti...",0.721061,375.107152,0.936413


In [22]:
best_model_df.to_csv('data/best_model.csv', index=False)

In [23]:
# Load the models from the model folder
model_rf = joblib.load('models/random_forest_best_model.pkl')
model_dt = joblib.load('models/decision_tree_best_model.pkl')
model_gb = joblib.load('models/gradient_boosting_best_model.pkl')

In [24]:
# Predict y values for x_test
y_pred_rf = model_rf.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_gb = model_gb.predict(X_test)

X_test_df = pd.DataFrame(X_test, columns=['Depth', 'Log_ILD', 'NPHI','RHOB', 'vshale', 'PHIeff', 'swirr', 'Facies_code'])

# Create a dataframe with x_test, y_test, and y_predict
results_df = pd.DataFrame({'Depth': X_test_df['Depth'],
                           'Log_ILD': X_test_df['Log_ILD'],
                           'NPHI': X_test_df['NPHI'],
                           'RHOB': X_test_df['RHOB'],
                           'vshale': X_test_df['vshale'],
                           'swirr': X_test_df['swirr'],
                           'Facies_code': X_test_df['Facies_code'],
                           'effective porosity': X_test_df['PHIeff'],
                           'Actual Permeability': y_test,
                           'rf_Permeability': y_pred_rf,
                           'dt_Permeability': y_pred_dt,
                           'gb_Permeability': y_pred_gb})

# Print the dataframe
results_df.to_csv('results_test.csv', index=False)
results_df.head()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Facies_code
- Log_ILD
- swirr


In [None]:
# Predict y values for x_train
y_pred_rf_train = model_rf.predict(X_train)
y_pred_dt_train = model_dt.predict(X_train)
y_pred_gb_train = model_gb.predict(X_train)

X_train_df = pd.DataFrame(X_train, columns=['Depth', 'Log_ILD', 'NPHI','RHOB', 'vshale', 'PHIeff', 'swirr', 'Facies_code'])

# Create a dataframe with x_train, y_train, and y_predict
train_df = pd.DataFrame({'Depth': X_train_df['Depth'],
                           'Log_ILD': X_train_df['Log_ILD'],
                           'NPHI': X_train_df['NPHI'],
                           'RHOB': X_train_df['RHOB'],
                           'vshale': X_train_df['vshale'],
                           'swirr': X_train_df['swirr'],
                           'Facies_code': X_train_df['Facies_code'],
                           'effective porosity': X_train_df['PHIeff'],
                           'Actual Permeability': y_train,
                           'rf_Permeability': y_pred_rf_train,
                           'dt_Permeability': y_pred_dt_train,
                           'gb_Permeability': y_pred_gb_train})

# Print the dataframe
train_df.to_csv('results.csv', index=False)
train_df.head()

In [None]:
print(train_df.columns)