In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the P04 table from disk (author's note: WMSD after KDE).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
p04_pickle = 'P04.pkl'
df_og = pd.read_pickle(p04_pickle)
df_og

In [None]:
# Restrict the table to the four predictors plus the response, then summarise each column.
model_columns = ['Radius', 'ANT', 'GXYX', 'CURVE', 'WellMicroSeismicData']
df = df_og[model_columns]
df.describe()

In [None]:
# Column roles shared by the full table and by both splits.
feature_cols = ['Radius', 'ANT', 'GXYX', 'CURVE']
target_col = 'WellMicroSeismicData'

# Train / test splits are read from disk rather than re-drawn here.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
train_set = pd.read_pickle('train_set.pkl')
test_set = pd.read_pickle('test_set.pkl')

# Full table: feature variables and response variable.
X = df[feature_cols].copy()
y = df[target_col].copy()

# Training split: feature variables and response variable.
X_train = train_set[feature_cols].copy()
y_train = train_set[target_col].copy()

# Testing split: feature variables and response variable.
X_test = test_set[feature_cols].copy()
y_test = test_set[target_col].copy()

X_train

In [None]:
# Fit the scaler on the TRAINING features only and reuse it for every other set.
# Fitting a separate StandardScaler per set (as before) gives each set its own mean/std,
# so the same raw value maps to different scaled values in train and test, and the test
# set's statistics leak into its own preprocessing.
scaler = StandardScaler().fit(X_train)

X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
X_s = scaler.transform(X)    # full table, expressed in the training scale
X_s

### Regression model selection and parameter tuning

**Final Decision**: 
1. Random Forest, depth=15
2. Extra Trees with Bagging, depth=21

#### 1. Random Forest, depth=15

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters.
# Seeded (like the tuned forests later in the notebook) so a re-run reproduces the same numbers.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_s, y_train)

# In-sample fit quality.
rf_pred = rf_reg.predict(X_train_s)
rf_mse = mean_squared_error(y_train, rf_pred)
rf_r2 = r2_score(y_train, rf_pred)

# Held-out fit quality.
rf_pred_test = rf_reg.predict(X_test_s)
rf_mse_test = mean_squared_error(y_test, rf_pred_test)
rf_r2_test = r2_score(y_test, rf_pred_test)

print(rf_mse, rf_r2, rf_mse_test, rf_r2_test, rf_reg.feature_importances_)

# Observed vs. predicted response, training split on the left and test split on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].scatter(y_train, rf_pred, alpha=0.02)
axes[0].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[0].set_title('Random Forest (default): training set')
axes[0].set_xlabel('Observed')
axes[0].set_ylabel('Predicted')
axes[1].scatter(y_test, rf_pred_test, alpha=0.05)
axes[1].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[1].set_title('Random Forest (default): test set')
axes[1].set_xlabel('Observed')
axes[1].set_ylabel('Predicted')
fig.tight_layout()
plt.show()

In [None]:
def train_and_evaluate_rf(max_depth=None):
    """Fit a seeded random forest with the given depth limit and score it on both splits.

    Reads the notebook-level ``X_train_s``, ``y_train``, ``X_test_s`` and ``y_test``.

    Parameters
    ----------
    max_depth : int or None
        Forwarded to ``RandomForestRegressor(max_depth=...)``.

    Returns
    -------
    tuple
        ``(train MSE, test MSE, train R2, test R2)``.
    """
    forest = RandomForestRegressor(max_depth=max_depth, random_state=42)
    forest.fit(X_train_s, y_train)

    fitted_train = forest.predict(X_train_s)
    fitted_test = forest.predict(X_test_s)

    return (
        mean_squared_error(y_train, fitted_train),
        mean_squared_error(y_test, fitted_test),
        r2_score(y_train, fitted_train),
        r2_score(y_test, fitted_test),
    )

In [None]:
# Coarse sweep over the random-forest depth limit.
max_depths = [1, 10, 15, 20, 25, 30]

# One (train MSE, test MSE, train R2, test R2) tuple per depth ...
sweep = [train_and_evaluate_rf(max_depth=depth) for depth in max_depths]
# ... regrouped into one list per metric.
train_mse, test_mse, train_r2, test_r2 = (list(series) for series in zip(*sweep))

fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: error curves.
ax_mse.plot(max_depths, train_mse, label='Training MSE')
ax_mse.plot(max_depths, test_mse, label='Test MSE')
ax_mse.set_xlabel('Maximum Depth')
ax_mse.set_ylabel('MSE')
ax_mse.set_title('MSE vs Maximum Depth')
ax_mse.legend()

# Right panel: goodness-of-fit curves.
ax_r2.plot(max_depths, train_r2, label='Training R2')
ax_r2.plot(max_depths, test_r2, label='Test R2')
ax_r2.set_xlabel('Maximum Depth')
ax_r2.set_ylabel('R2')
ax_r2.set_title('R2 vs Maximum Depth')
ax_r2.legend()

fig.tight_layout()
plt.show()

train_mse, test_mse, train_r2, test_r2

In [None]:
# Fine sweep of the random-forest depth limit over 13..17.
max_depths = [13, 14, 15, 16, 17]
train_mse, test_mse, train_r2, test_r2 = [], [], [], []

for depth in max_depths:
    scores = train_and_evaluate_rf(max_depth=depth)
    # scores arrive as (train MSE, test MSE, train R2, test R2); file each under its metric.
    for bucket, score in zip((train_mse, test_mse, train_r2, test_r2), scores):
        bucket.append(score)

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (axes[0], train_mse, 'Training MSE', test_mse, 'Test MSE', 'MSE', 'MSE vs Maximum Depth'),
    (axes[1], train_r2, 'Training R2', test_r2, 'Test R2', 'R2', 'R2 vs Maximum Depth'),
)
for ax, train_scores, train_label, test_scores, test_label, y_label, title in panels:
    ax.plot(max_depths, train_scores, label=train_label)
    ax.plot(max_depths, test_scores, label=test_label)
    ax.set_xlabel('Maximum Depth')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()

train_mse, test_mse, train_r2, test_r2

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Selected random forest: depth limited to 15, fixed seed.
rf_reg = RandomForestRegressor(max_depth=15, random_state=42).fit(X_train_s, y_train)

# Predictions on both splits ...
rf_pred, rf_pred_test = rf_reg.predict(X_train_s), rf_reg.predict(X_test_s)

# ... and the matching error / goodness-of-fit scores.
rf_mse, rf_r2 = mean_squared_error(y_train, rf_pred), r2_score(y_train, rf_pred)
rf_mse_test, rf_r2_test = mean_squared_error(y_test, rf_pred_test), r2_score(y_test, rf_pred_test)

print(rf_mse, rf_r2, rf_mse_test, rf_r2_test, rf_reg.feature_importances_)

# Observed (x) vs. predicted (y): training split left, test split right,
# each with a y = x reference line over [0, 0.4].
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
ax_train, ax_test = axes
ax_train.scatter(y_train, rf_pred, alpha=0.02)
ax_train.plot((0, 0.4), (0, 0.4))
ax_test.scatter(y_test, rf_pred_test, alpha=0.05)
ax_test.plot((0, 0.4), (0, 0.4))

#### 2. Extra Trees with Bagging, depth=21

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

# Baseline: bagging over default Extra-Trees regressors.
# The seed is set on the OUTER BaggingRegressor: it controls the bootstrap resampling and is
# also what seeds every cloned base estimator, so without it each run gives different numbers.
etbag_reg = BaggingRegressor(ExtraTreesRegressor(), random_state=42)
etbag_reg.fit(X_train_s, y_train)

# In-sample fit quality.
etbag_pred = etbag_reg.predict(X_train_s)
etbag_mse = mean_squared_error(y_train, etbag_pred)
etbag_r2 = r2_score(y_train, etbag_pred)

# Held-out fit quality.
etbag_pred_test = etbag_reg.predict(X_test_s)
etbag_mse_test = mean_squared_error(y_test, etbag_pred_test)
etbag_r2_test = r2_score(y_test, etbag_pred_test)

print(etbag_mse, etbag_r2, etbag_mse_test, etbag_r2_test)

# Observed vs. predicted response, training split on the left and test split on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].scatter(y_train, etbag_pred, alpha=0.02)
axes[0].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[0].set_title('Bagged Extra Trees (default): training set')
axes[0].set_xlabel('Observed')
axes[0].set_ylabel('Predicted')
axes[1].scatter(y_test, etbag_pred_test, alpha=0.05)
axes[1].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[1].set_title('Bagged Extra Trees (default): test set')
axes[1].set_xlabel('Observed')
axes[1].set_ylabel('Predicted')
fig.tight_layout()
plt.show()

In [None]:
def train_and_evaluate_etbag(max_depth=None):
    """Fit bagged Extra-Trees with the given depth limit and score it on both splits.

    Reads the notebook-level ``X_train_s``, ``y_train``, ``X_test_s`` and ``y_test``.

    Parameters
    ----------
    max_depth : int or None
        Forwarded to ``ExtraTreesRegressor(max_depth=...)``.

    Returns
    -------
    tuple
        ``(train MSE, test MSE, train R2, test R2)``.
    """
    # The outer random_state is what makes the sweep reproducible: BaggingRegressor re-seeds
    # every clone of the base estimator from its own random state, so the inner
    # random_state=42 alone does not pin the result (nor the bootstrap samples).
    etbag_reg = BaggingRegressor(
        ExtraTreesRegressor(max_depth=max_depth, random_state=42),
        random_state=42,
    )
    etbag_reg.fit(X_train_s, y_train)

    etbag_pred = etbag_reg.predict(X_train_s)
    etbag_mse = mean_squared_error(y_train, etbag_pred)
    etbag_r2 = r2_score(y_train, etbag_pred)

    etbag_pred_test = etbag_reg.predict(X_test_s)
    etbag_mse_test = mean_squared_error(y_test, etbag_pred_test)
    etbag_r2_test = r2_score(y_test, etbag_pred_test)

    return etbag_mse, etbag_mse_test, etbag_r2, etbag_r2_test

In [None]:
# Coarse sweep over the Extra-Trees depth limit inside the bagging ensemble.
max_depths = [1, 10, 20, 30, 40]

# One (train MSE, test MSE, train R2, test R2) tuple per depth ...
sweep = [train_and_evaluate_etbag(max_depth=depth) for depth in max_depths]
# ... regrouped into one list per metric.
train_mse, test_mse, train_r2, test_r2 = (list(series) for series in zip(*sweep))

fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: error curves.
ax_mse.plot(max_depths, train_mse, label='Training MSE')
ax_mse.plot(max_depths, test_mse, label='Test MSE')
ax_mse.set_xlabel('Maximum Depth')
ax_mse.set_ylabel('MSE')
ax_mse.set_title('MSE vs Maximum Depth')
ax_mse.legend()

# Right panel: goodness-of-fit curves.
ax_r2.plot(max_depths, train_r2, label='Training R2')
ax_r2.plot(max_depths, test_r2, label='Test R2')
ax_r2.set_xlabel('Maximum Depth')
ax_r2.set_ylabel('R2')
ax_r2.set_title('R2 vs Maximum Depth')
ax_r2.legend()

fig.tight_layout()
plt.show()

train_mse, test_mse, train_r2, test_r2

In [None]:
# Fine sweep of the Extra-Trees depth limit over 18..25.
max_depths = [18, 19, 20, 21, 22, 23, 24, 25]
train_mse, test_mse, train_r2, test_r2 = [], [], [], []

for depth in max_depths:
    scores = train_and_evaluate_etbag(max_depth=depth)
    # scores arrive as (train MSE, test MSE, train R2, test R2); file each under its metric.
    for bucket, score in zip((train_mse, test_mse, train_r2, test_r2), scores):
        bucket.append(score)

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (axes[0], train_mse, 'Training MSE', test_mse, 'Test MSE', 'MSE', 'MSE vs Maximum Depth'),
    (axes[1], train_r2, 'Training R2', test_r2, 'Test R2', 'R2', 'R2 vs Maximum Depth'),
)
for ax, train_scores, train_label, test_scores, test_label, y_label, title in panels:
    ax.plot(max_depths, train_scores, label=train_label)
    ax.plot(max_depths, test_scores, label=test_label)
    ax.set_xlabel('Maximum Depth')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()

train_mse, test_mse, train_r2, test_r2

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

# Selected bagged Extra-Trees model.
# NOTE(review): the section heading says depth=21 but this cell fits max_depth=22 --
# confirm which value is intended and align the two.
# The seed is set on the OUTER BaggingRegressor: it re-seeds every clone of the base estimator
# from its own random state, so the inner random_state=42 alone does not make the fit
# (or the predictions exported from it) reproducible.
etbag_reg = BaggingRegressor(
    ExtraTreesRegressor(max_depth=22, random_state=42),
    random_state=42,
)
etbag_reg.fit(X_train_s, y_train)

# In-sample fit quality.
etbag_pred = etbag_reg.predict(X_train_s)
etbag_mse = mean_squared_error(y_train, etbag_pred)
etbag_r2 = r2_score(y_train, etbag_pred)

# Held-out fit quality.
etbag_pred_test = etbag_reg.predict(X_test_s)
etbag_mse_test = mean_squared_error(y_test, etbag_pred_test)
etbag_r2_test = r2_score(y_test, etbag_pred_test)

print(etbag_mse, etbag_r2, etbag_mse_test, etbag_r2_test)

# Observed vs. predicted response, training split on the left and test split on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].scatter(y_train, etbag_pred, alpha=0.02)
axes[0].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[0].set_title('Bagged Extra Trees (tuned): training set')
axes[0].set_xlabel('Observed')
axes[0].set_ylabel('Predicted')
axes[1].scatter(y_test, etbag_pred_test, alpha=0.05)
axes[1].plot((0, 0.4), (0, 0.4))    # y = x reference line over [0, 0.4]
axes[1].set_title('Bagged Extra Trees (tuned): test set')
axes[1].set_xlabel('Observed')
axes[1].set_ylabel('Predicted')
fig.tight_layout()
plt.show()

### Predicting P01 using the adjusted model

In [None]:
# Load the P01 table from disk (author's note: WMSD after KDE).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
p01_pickle = 'P01.pkl'
df01_og = pd.read_pickle(p01_pickle)
df01_og

In [None]:
X01 = df01_og[['Radius', 'ANT', 'GXYX', 'CURVE']]
# Scale P01 with the TRAINING statistics, not with P01's own mean/std: the fitted models
# only understand feature values expressed in the scale of X_train. Re-fitting the scaler
# on X01 would shift/stretch every feature relative to what the trees were trained on.
X01_s = StandardScaler().fit(X_train).transform(X01)
X01_s

In [None]:
# Random-forest predictions for P01, written into a copy of the P01 table
# under the 'WellMicroSeismicData' column.
df01_rf_pred = df01_og.copy()
rf_pred_01 = rf_reg.predict(X01_s)
df01_rf_pred['WellMicroSeismicData'] = rf_pred_01

# NOTE(review): np.savetxt uses its default delimiter (a single space) and writes no header,
# so the '.csv' file is space-separated without column names -- confirm this is intended.
np.savetxt(fname='P01_rf_pred.csv', X=df01_rf_pred, fmt='%s')
df01_rf_pred

In [None]:
# Bagged Extra-Trees predictions for P01, written into a copy of the P01 table
# under the 'WellMicroSeismicData' column.
df01_etbag_pred = df01_og.copy()
etbag_pred_01 = etbag_reg.predict(X01_s)
df01_etbag_pred['WellMicroSeismicData'] = etbag_pred_01

# NOTE(review): np.savetxt uses its default delimiter (a single space) and writes no header,
# so the '.csv' file is space-separated without column names -- confirm this is intended.
np.savetxt(fname='P01_etbag_pred.csv', X=df01_etbag_pred, fmt='%s')
df01_etbag_pred

In [None]:
# Agreement between the two models on P01: random-forest prediction (x) against
# bagged Extra-Trees prediction (y), with a y = x reference line over [0, 0.4].
identity_span = (0, 0.4)
plt.scatter(x=rf_pred_01, y=etbag_pred_01, alpha=0.05)
plt.plot(identity_span, identity_span)