In [28]:
import numpy as np
import pandas as pd
import scipy

In [29]:
df = pd.read_csv('data/combined_freeman_2_3_4.csv')

In [30]:
df.head()

Unnamed: 0,Depth,GR,Log_ILD,RHOB,NPHI,PHI,PERM,well
0,6308.0,86.73,1.342,2.105,0.396,0.33,0.2324,freeman_2
1,6308.5,79.598,1.257,2.105,0.381,0.33,0.2324,freeman_2
2,6309.0,72.18,1.083,2.092,0.365,0.1448,572.4,freeman_2
3,6309.5,68.811,0.973,2.072,0.373,0.1621,749.911,freeman_2
4,6310.0,71.912,0.953,2.057,0.372,0.175,927.986,freeman_2


In [31]:
print(df.columns)

Index(['Depth', 'GR', 'Log_ILD', 'RHOB', 'NPHI', 'PHI', 'PERM', 'well'], dtype='object')


In [14]:
# Define input features and target variable
X = df[['Depth', 'Log_ILD', 'NPHI','RHOB', 'PHI']]
y = df['PERM']

In [34]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import joblib

In [35]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# Define dictionary of models and their hyperparameters
models = {
    'svm': {
        'model': make_pipeline(StandardScaler(), SVR()),
        'params': {
            'svr__kernel': ['linear', 'rbf', 'poly'],
            'svr__C': [0.1, 1, 10],
            'svr__gamma': ['scale', 'auto']
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'neural_network': {
        'model': MLPRegressor(),
        'params': {
            'hidden_layer_sizes': [(10,), (20,), (30,)],
            'activation': ['relu', 'tanh', 'logistic'],
            'max_iter': [1000, 2000]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7]
        }
    },
    'linear_regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'lasso_regression': {
        'model': Lasso(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10],
            'fit_intercept': [True, False],
            'selection': ['cyclic', 'random'],
            'tol': [0.0001, 0.001, 0.01],
            'max_iter': [1000, 5000, 10000]
        }
    },
    'ridge_regression': {
        'model': Ridge(),
        'params': {
            'alpha': [0.01, 0.1, 1],
            'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
        }
    }
}

In [41]:
scores = []
# Loop over each model and perform grid search with cross-validation to find best hyperparameters 
## scoring='neg_mean_squared_error'
for model_name, model in models.items():
    clf = GridSearchCV(model['model'], model['params'], cv=5, n_jobs=-1, return_train_score=False)
    clf.fit(X_train, y_train)
    best_params = clf.best_params_
    
    # Evaluate best model on test set
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = clf.score(X_test, y_test)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': best_params,
        'RMSE': rmse,
        'R-squared': r2
    })
    
    # Save the best model for each method
    joblib.dump(clf.best_estimator_, f"models/{model_name}_best_model.pkl")



In [42]:
best_model_df = pd.DataFrame(scores, columns=['model', 'best_params', 'best_score', 'RMSE', 'R-squared'])
best_model_df

Unnamed: 0,model,best_params,best_score,RMSE,R-squared
0,svm,"{'svr__C': 10, 'svr__gamma': 'auto', 'svr__ker...",0.355776,23793.780226,0.007277
1,random_forest,"{'max_depth': 20, 'n_estimators': 50}",0.841692,22874.516334,0.082502
2,decision_tree,"{'max_depth': 5, 'min_samples_split': 2}",0.601748,22688.312601,0.097378
3,neural_network,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.298469,23871.1265,0.000812
4,gradient_boosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.887884,22743.854549,0.092954
5,linear_regression,{},-1.455236,23853.883132,0.002255
6,lasso_regression,"{'alpha': 10, 'fit_intercept': False, 'max_ite...",-0.814343,23855.441412,0.002125
7,ridge_regression,"{'alpha': 1, 'solver': 'saga'}",-1.435077,23853.770884,0.002265


In [22]:
best_model_df.to_csv('data/best_model.csv', index=False)

In [23]:
# Load the models from the model folder
model_rf = joblib.load('models/random_forest_best_model.pkl')
model_dt = joblib.load('models/decision_tree_best_model.pkl')
model_gb = joblib.load('models/gradient_boosting_best_model.pkl')

In [27]:
# Predict y values for x_test
y_pred_rf = model_rf.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_gb = model_gb.predict(X_test)

X_test_df = pd.DataFrame(X_test_selected, columns=['Depth', 'NPHI', 'RHOB', 'vshale', 'PHIeff'])

# Create a dataframe with x_test, y_test, and y_predict
results_df = pd.DataFrame({'Depth': X_test_df['Depth'],
                           'NPHI': X_test_df['NPHI'],
                           'RHOB': X_test_df['RHOB'],
                           'vshale': X_test_df['vshale'],
                           'effective porosity': X_test_df['PHIeff'],
                           'Actual Permeability': y_test,
                           'rf_Permeability': y_pred_rf,
                           'dt_Permeability': y_pred_dt,
                           'gb_Permeability': y_pred_gb})

# Print the dataframe
results_df.to_csv('results_test.csv', index=False)
results_df.head()

Unnamed: 0,Depth,NPHI,RHOB,vshale,effective porosity,Actual Permeability,rf_Permeability,dt_Permeability,gb_Permeability
5935,10674.0,0.3749,2.4032,0.341747,0.085406,0.358137,0.441165,0.309971,1.561161
2300,8851.5,0.4883,2.286,0.280981,0.171649,13.05825,12.522147,12.283858,9.849646
2935,9169.0,0.5025,2.3254,0.251444,0.149407,3.907871,3.937493,4.379332,3.263807
5678,10545.5,0.301,2.3643,0.323522,0.113748,1.180307,1.093683,0.871567,1.561161
2551,8977.0,0.4662,2.3248,0.199873,0.156717,3.980516,3.985074,4.379332,4.388842


In [None]:
# Predict y values for x_train
y_pred_rf_train = model_rf.predict(X_train)
y_pred_dt_train = model_dt.predict(X_train)
y_pred_gb_train = model_gb.predict(X_train)

X_train_df = pd.DataFrame(X_train, columns=['Depth', 'Log_ILD', 'NPHI','RHOB', 'vshale', 'PHIeff', 'swirr', 'Facies_code'])

# Create a dataframe with x_train, y_train, and y_predict
train_df = pd.DataFrame({'Depth': X_train_df['Depth'],
                           'Log_ILD': X_train_df['Log_ILD'],
                           'NPHI': X_train_df['NPHI'],
                           'RHOB': X_train_df['RHOB'],
                           'vshale': X_train_df['vshale'],
                           'swirr': X_train_df['swirr'],
                           'Facies_code': X_train_df['Facies_code'],
                           'effective porosity': X_train_df['PHIeff'],
                           'Actual Permeability': y_train,
                           'rf_Permeability': y_pred_rf_train,
                           'dt_Permeability': y_pred_dt_train,
                           'gb_Permeability': y_pred_gb_train})

# Print the dataframe
train_df.to_csv('results.csv', index=False)
train_df.head()

In [None]:
print(train_df.columns)