In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from class2 import facies_classification, Petrophysics, rename_columns, rearrange_columns

In [2]:
well = pd.read_csv('data/freeman_well_4_eng.csv')
well.head()

Unnamed: 0,Depth,GR,Log_ILD,DT,RHOB,NPHI,PHI,PERM,Facies,velocity,Facies_code
0,7682.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0
1,7683.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0
2,7683.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0
3,7684.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0
4,7684.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0


In [3]:
test = Petrophysics(well)

In [4]:
y = test.shale_volume(1)

In [5]:
z = test.porosity_effective(125.5)

In [6]:
well.head()

Unnamed: 0,Depth,GR,Log_ILD,DT,RHOB,NPHI,PHI,PERM,Facies,velocity,Facies_code,vshale,effective porosity
0,7682.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0,0.0,0.2657
1,7683.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0,0.0,0.2657
2,7683.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0,0.0,0.2657
3,7684.0,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0,0.0,0.2657
4,7684.5,39.0321,0.9332,137.507,2.2382,0.5983,0.2657,175.29364,1.0,7272.357044,0,0.0,0.2657


In [7]:
df = well.dropna()
df.isnull().sum()

Depth                 0
GR                    0
Log_ILD               0
DT                    0
RHOB                  0
NPHI                  0
PHI                   0
PERM                  0
Facies                0
velocity              0
Facies_code           0
vshale                0
effective porosity    0
dtype: int64

In [8]:
X = df[['Log_ILD', 'NPHI','RHOB', 'vshale', 'effective porosity']]
y = df['PERM']

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [30]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models to evaluate
models = {
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7]
        }
    }
}

In [32]:
import joblib


# Perform grid search on each model using cross-validation
results = []
for name, model in models.items():
    print(f"Evaluating {name} model...")
    clf = GridSearchCV(model['model'], model['params'], cv=5, n_jobs=-1)
    clf.fit(X_train, y_train)
    print(f"Best {name} Model Parameters: {clf.best_params_}")
    print(f"Best {name} Model Score: {clf.best_score_}")
    results.append({
        'model': model_name,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'RMSE': np.sqrt(-grid_search.cv_results_['mean_test_neg_mean_squared_error'][grid_search.best_index_]),
        'R-squared': grid_search.cv_results_['mean_test_r2'][grid_search.best_index_]
    })
    
    # Save the best model for each method
    joblib.dump(clf.best_estimator_, f"{name}_best_model.pkl")

result_df = pd.DataFrame(results, columns=['model', 'best_params', 'best_score', 'RMSE', 'R-squared'])
result_df

Evaluating random_forest model...
Best random_forest Model Parameters: {'max_depth': 20, 'n_estimators': 50}
Best random_forest Model Score: 0.8813438136284022
Evaluating decision_tree model...
Best decision_tree Model Parameters: {'max_depth': 10, 'min_samples_split': 5}
Best decision_tree Model Score: 0.8152051501405552
Evaluating gradient_boosting model...
Best gradient_boosting Model Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Best gradient_boosting Model Score: 0.7846496407242352


Unnamed: 0,model,best_params,best_score,RMSE,R-squared
0,gradient_boosting,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.86637,170590.312821,0.86637
1,gradient_boosting,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.86637,170590.312821,0.86637
2,gradient_boosting,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.86637,170590.312821,0.86637


In [16]:
# Print the results
for result in results:
    print('Model:', result['model'])
    print('Best Parameters:', result['best_params'])
    print('Best Score (R2):', result['best_score'])
    print()

Model: svm
Best Parameters: {'svr__C': 10, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}
Best Score (R2): 0.20012708950223343

Model: random_forest
Best Parameters: {'max_depth': 30, 'n_estimators': 100}
Best Score (R2): 0.8749083487988756

Model: decision_tree
Best Parameters: {'max_depth': 15, 'min_samples_split': 5}
Best Score (R2): 0.8499895155570891

Model: neural_network
Best Parameters: {'activation': 'logistic', 'hidden_layer_sizes': (10,)}
Best Score (R2): -0.9083470886493459

Model: gradient_boosting
Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
Best Score (R2): 0.8666474504219849



In [25]:
from tensorflow import keras
from tensorflow.keras import layers

# assume your original model definition is as follows:
model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(8,)),
    layers.Dense(1)
])

# modify the input_shape parameter of the first layer to (5,)
model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(5,)),
    layers.Dense(1)
])


In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout


# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# build model
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1))

# compile model
model.compile(optimizer='adam', loss='mean_squared_error')

# train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# evaluate model on test set
loss = model.evaluate(X_test, y_test)
print('Test Loss:', loss)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 116393492480.0


In [None]:
# Evaluate the model on the test data
scores = model.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
model = RandomForestRegressor(max_depth = 30, n_estimators = 150)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
import pickle

In [None]:
with open('perm1', 'wb') as file:
    pickle.dump(model,file)