In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# One-off conversion: parse the whitespace-delimited 'a.awm' file and cache it as a
# pickle so later cells can load it without re-parsing. DO NOT run this every time!
# The separator is a raw string so that `\s` reaches the regex engine unchanged
# instead of being treated as an (invalid) Python string escape sequence.
df_orig = pd.read_table('a.awm', sep=r'\s+', engine='python')
df_orig.to_pickle('a.pkl')

In [None]:
# Load the cached frame from 'a.pkl' (the file written by the conversion cell).
# NOTE(review): unpickling can execute arbitrary code — only load a pickle produced locally.
df_og = pd.read_pickle('a.pkl')
df_og    # Wrangler: head + info + null + hist

In [None]:
# The density CSV is read without a header row; the column name is supplied explicitly.
kde_w = pd.read_csv(
    'density_0.9-3.csv',
    header=None,
    names=['WellMicroSeismicData'],
)
kde_w    # Wrangler: head + info + null + hist

In [None]:
# Working frame: four attribute columns plus the KDE WellMicroSeismicData column.
# join() aligns on the row index and returns a new frame, so df_og is left untouched.
# NOTE(review): assumes kde_w rows line up with df_og rows by index — confirm.
attribute_columns = ['Z', 'ANT', 'GXYX', 'CURVE']
df0 = df_og[attribute_columns].join(kde_w)
df0

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of df0.
df0.describe()

In [None]:
# Scatter each attribute against the KDE response, one panel per attribute.
# Axis labels and titles make the four panels distinguishable when the notebook is skimmed.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, column in zip(axes, ['Z', 'ANT', 'GXYX', 'CURVE']):
    ax.scatter(df0[column], df0['WellMicroSeismicData'], alpha=0.1)
    ax.set_xlabel(column)
    ax.set_ylabel('WellMicroSeismicData')
    ax.set_title(f'{column} vs WellMicroSeismicData')
fig.tight_layout();

In [None]:
# Binarise the response on a copy of df0: 0 stays 0, every other value becomes 1.
# A vectorised comparison replaces the row-wise lambda and yields integer class labels.
# NOTE(review): NaN compares unequal to 0, so missing values are labelled 1
# (same outcome as the previous lambda) — confirm this is intended.
df = df0.copy()
df['WellMicroSeismicData'] = df['WellMicroSeismicData'].ne(0).astype(int)

In [None]:
# Pairwise correlation matrix of the columns of df (features and the binarised response).
df.corr()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

feature_columns = ['ANT', 'GXYX', 'CURVE']
target_column = 'WellMicroSeismicData'

# Feature variables (X_*) and response variable (y_*) for the training and testing datasets.
X_train, y_train = train_set[feature_columns].copy(), train_set[target_column].copy()
X_test, y_test = test_set[feature_columns].copy(), test_set[target_column].copy()

X_train

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Fit the scaler on the training features only and reuse it for the test features,
# so both sets share one mapping and no test-set statistics leak into preprocessing.
# (Fitting a second scaler on X_test would map the two sets onto different ranges.)
minmax_scaler = MinMaxScaler().fit(X_train)
X_train_s = minmax_scaler.transform(X_train)
X_test_s = minmax_scaler.transform(X_test)
X_train_s, X_test_s

In [None]:
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD and an L2 penalty; the seed is fixed.
sgd_clf = SGDClassifier(penalty='l2', random_state=42)
sgd_clf.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
sgd_predictions = sgd_clf.predict(X_train_s)
sgd_conf = confusion_matrix(y_true=y_train, y_pred=sgd_predictions)
sgd_mse = mean_squared_error(y_true=y_train, y_pred=sgd_predictions)

# Cross-validated confusion matrix (currently disabled):
#sgd_cv_predictions = cross_val_predict(sgd_clf, X_train_s, y_train, cv=5)
#sgd_cv_conf = confusion_matrix(y_train, sgd_cv_predictions)

sgd_mse, sgd_conf

In [None]:
# Confusion matrix of the SGD classifier on the held-out test features.
sgd_predictions_test = sgd_clf.predict(X_test_s)
sgd_conf_test = confusion_matrix(y_true=y_test, y_pred=sgd_predictions_test)
sgd_conf_test

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L2 penalty, fitted on the scaled training features.
log_reg = LogisticRegression(penalty='l2')
log_reg.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
log_predictions = log_reg.predict(X_train_s)
log_conf = confusion_matrix(y_true=y_train, y_pred=log_predictions)
log_mse = mean_squared_error(y_true=y_train, y_pred=log_predictions)

# Cross-validated confusion matrix (currently disabled):
#log_cv_predictions = cross_val_predict(log_reg, X_train_s, y_train, cv=5)
#log_cv_conf = confusion_matrix(y_train, log_cv_predictions)

log_mse, log_conf

In [None]:
# Confusion matrix of the logistic-regression model on the held-out test features.
log_predictions_test = log_reg.predict(X_test_s)
log_conf_test = confusion_matrix(y_true=y_test, y_pred=log_predictions_test)
log_conf_test

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM (hinge loss, C=1) behind a StandardScaler step.
# random_state is fixed because the hinge-loss solver shuffles the data while fitting;
# without a seed the fitted model can differ between runs. 42 matches the SGD cell.
# NOTE(review): the pipeline standardises X_train_s, which is already min-max scaled —
# confirm the double scaling is intended.
svm_clf = Pipeline([
    ('scalar', StandardScaler()),
    ('linear_svm', LinearSVC(C=1, loss='hinge', random_state=42))
    ])
svm_clf.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
svm_predictions = svm_clf.predict(X_train_s)
svm_mse = mean_squared_error(y_train, svm_predictions)
svm_conf = confusion_matrix(y_train, svm_predictions)

# Cross-validated confusion matrix (currently disabled):
#svm_cv_predictions = cross_val_predict(svm_clf, X_train_s, y_train, cv=5)
#svm_cv_conf = confusion_matrix(y_train, svm_cv_predictions)

svm_mse, svm_conf#, svm_cv_conf

In [None]:
# Confusion matrix of the SVM pipeline on the held-out test features.
svm_predictions_test = svm_clf.predict(X_test_s)
svm_conf_test = confusion_matrix(y_true=y_test, y_pred=svm_predictions_test)
svm_conf_test

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree fitted on the 0/1 response; seeded so tie-breaking between equally
# good splits is repeatable.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_s, y_train)

# The regressor predicts leaf means, which are not guaranteed to be exactly 0 or 1
# (e.g. identical feature rows with different labels). The MSE uses the raw predictions;
# the confusion matrix needs class labels, so predictions are thresholded at 0.5 first —
# otherwise confusion_matrix raises on continuous input.
tree_predictions = tree_reg.predict(X_train_s)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_conf = confusion_matrix(y_train, (tree_predictions >= 0.5).astype(int))

# Cross-validated confusion matrix (currently disabled):
#tree_cv_predictions = cross_val_predict(tree_reg, X_train_s, y_train, cv=5)
#tree_cv_conf = confusion_matrix(y_train, tree_cv_predictions)

tree_mse, tree_conf#, tree_cv_conf

In [None]:
# Test-set confusion matrix for the regression tree. Its predictions are leaf means and
# may be fractional, so they are thresholded at 0.5 into class labels before being
# passed to confusion_matrix (which rejects continuous predictions).
tree_predictions_test = tree_reg.predict(X_test_s)
tree_conf_test = confusion_matrix(y_test, (tree_predictions_test >= 0.5).astype(int))
tree_conf_test

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unconstrained classification tree. Seeded with the same random_state that
# train_and_evaluate uses, so the fitted tree (and its depth) is repeatable across runs.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
tree_predictions = tree_clf.predict(X_train_s)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_conf = confusion_matrix(y_train, tree_predictions)

# Cross-validated confusion matrix (currently disabled):
#tree_cv_predictions = cross_val_predict(tree_clf, X_train_s, y_train, cv=5)
#tree_cv_conf = confusion_matrix(y_train, tree_cv_predictions)

tree_mse, tree_conf#, tree_cv_conf

In [None]:
# Confusion matrix of the current tree classifier on the held-out test features.
tree_predictions_test = tree_clf.predict(X_test_s)
tree_conf_test = confusion_matrix(y_true=y_test, y_pred=tree_predictions_test)
tree_conf_test

In [None]:
# Depth reached by the fitted tree (read through the estimator's public accessor).
tree_clf.get_depth()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc

def train_and_evaluate(max_depth=None):
    """Fit a depth-limited decision tree and score it on both data partitions.

    Reads the notebook-level X_train_s / y_train / X_test_s / y_test variables.

    Parameters
    ----------
    max_depth : int or None
        Passed straight to DecisionTreeClassifier; None leaves the depth unconstrained.

    Returns
    -------
    tuple
        (train accuracy, test accuracy, train recall, test recall), in that order.
    """
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42).fit(X_train_s, y_train)
    train_pred = model.predict(X_train_s)
    test_pred = model.predict(X_test_s)

    return (
        accuracy_score(y_train, train_pred),
        accuracy_score(y_test, test_pred),
        recall_score(y_train, train_pred),
        recall_score(y_test, test_pred),
    )

In [None]:
# Coarse sweep of the tree depth limit.
max_depths = [1, 50, 100, 120, 131]

# train_and_evaluate returns (train acc, test acc, train recall, test recall) per depth;
# transpose the per-depth tuples into one list per metric.
sweep_results = [train_and_evaluate(max_depth=depth) for depth in max_depths]
train_accuracies, test_accuracies, train_recalls, test_recalls = map(list, zip(*sweep_results))

fig, (ax_accuracy, ax_recall) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: accuracy on both partitions.
ax_accuracy.plot(max_depths, train_accuracies, label='Training Accuracy')
ax_accuracy.plot(max_depths, test_accuracies, label='Test Accuracy')
ax_accuracy.set_xlabel('Maximum Depth')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.set_title('Accuracy vs Maximum Depth')
ax_accuracy.legend()

# Right panel: recall on both partitions.
ax_recall.plot(max_depths, train_recalls, label='Training Recall')
ax_recall.plot(max_depths, test_recalls, label='Test Recall')
ax_recall.set_xlabel('Maximum Depth')
ax_recall.set_ylabel('Recall')
ax_recall.set_title('Recall vs Maximum Depth')
ax_recall.legend()

fig.tight_layout()
# NOTE(review): the other depth-sweep cells save to this same file name, so each run
# overwrites the previous figure — confirm that only the last one is needed.
fig.savefig("accuracy_recall_plot.webp")
plt.show()

train_accuracies, test_accuracies, train_recalls, test_recalls

In [None]:
# Sweep of the tree depth limit over 40–60 in steps of 5.
max_depths = [40, 45, 50, 55, 60]

# train_and_evaluate returns (train acc, test acc, train recall, test recall) per depth;
# transpose the per-depth tuples into one list per metric.
sweep_results = [train_and_evaluate(max_depth=depth) for depth in max_depths]
train_accuracies, test_accuracies, train_recalls, test_recalls = map(list, zip(*sweep_results))

fig, (ax_accuracy, ax_recall) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: accuracy on both partitions.
ax_accuracy.plot(max_depths, train_accuracies, label='Training Accuracy')
ax_accuracy.plot(max_depths, test_accuracies, label='Test Accuracy')
ax_accuracy.set_xlabel('Maximum Depth')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.set_title('Accuracy vs Maximum Depth')
ax_accuracy.legend()

# Right panel: recall on both partitions.
ax_recall.plot(max_depths, train_recalls, label='Training Recall')
ax_recall.plot(max_depths, test_recalls, label='Test Recall')
ax_recall.set_xlabel('Maximum Depth')
ax_recall.set_ylabel('Recall')
ax_recall.set_title('Recall vs Maximum Depth')
ax_recall.legend()

fig.tight_layout()
# NOTE(review): the other depth-sweep cells save to this same file name, so each run
# overwrites the previous figure — confirm that only the last one is needed.
fig.savefig("accuracy_recall_plot.webp")
plt.show()

train_accuracies, test_accuracies, train_recalls, test_recalls

In [None]:
# Sweep of the tree depth limit over 20–40 in steps of 5.
max_depths = [20, 25, 30, 35, 40]

# train_and_evaluate returns (train acc, test acc, train recall, test recall) per depth;
# transpose the per-depth tuples into one list per metric.
sweep_results = [train_and_evaluate(max_depth=depth) for depth in max_depths]
train_accuracies, test_accuracies, train_recalls, test_recalls = map(list, zip(*sweep_results))

fig, (ax_accuracy, ax_recall) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: accuracy on both partitions.
ax_accuracy.plot(max_depths, train_accuracies, label='Training Accuracy')
ax_accuracy.plot(max_depths, test_accuracies, label='Test Accuracy')
ax_accuracy.set_xlabel('Maximum Depth')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.set_title('Accuracy vs Maximum Depth')
ax_accuracy.legend()

# Right panel: recall on both partitions.
ax_recall.plot(max_depths, train_recalls, label='Training Recall')
ax_recall.plot(max_depths, test_recalls, label='Test Recall')
ax_recall.set_xlabel('Maximum Depth')
ax_recall.set_ylabel('Recall')
ax_recall.set_title('Recall vs Maximum Depth')
ax_recall.legend()

fig.tight_layout()
# NOTE(review): the other depth-sweep cells save to this same file name, so each run
# overwrites the previous figure — confirm that only the last one is needed.
fig.savefig("accuracy_recall_plot.webp")
plt.show()

train_accuracies, test_accuracies, train_recalls, test_recalls

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tree limited to depth 25. Seeded with the same random_state that train_and_evaluate
# uses, so this refit reproduces the depth-25 model scored in the sweep cells.
tree_clf = DecisionTreeClassifier(max_depth=25, random_state=42)
tree_clf.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
tree_predictions = tree_clf.predict(X_train_s)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_conf = confusion_matrix(y_train, tree_predictions)

# Cross-validated confusion matrix (currently disabled):
#tree_cv_predictions = cross_val_predict(tree_clf, X_train_s, y_train, cv=5)
#tree_cv_conf = confusion_matrix(y_train, tree_cv_predictions)

tree_mse, tree_conf#, tree_cv_conf

In [None]:
# Confusion matrix of the current tree classifier on the held-out test features.
tree_predictions_test = tree_clf.predict(X_test_s)
tree_conf_test = confusion_matrix(y_true=y_test, y_pred=tree_predictions_test)
tree_conf_test

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid of tree hyper-parameters searched with 5-fold cross-validation.
# NOTE(review): the trailing comments look like the values picked by an earlier run — confirm.
parameter = {
    'criterion': ['entropy', 'gini', 'log_loss'],    # log_loss
    'splitter': ['best', 'random'],    # random
    'max_features': ['sqrt', 'log2']    # log2
}

# The grid includes splitter='random' and feature sub-sampling, both of which draw random
# numbers; seeding the base estimator makes best_params_ repeatable between runs.
model = DecisionTreeClassifier(max_depth=25, random_state=42)
cv = GridSearchCV(model, param_grid=parameter, cv=5)
cv.fit(X_train_s, y_train)

# Only the last expression of a cell is displayed, so the test score is kept in a
# variable and shown together with the selected parameters instead of being discarded.
cv_test_score = cv.score(X_test_s, y_test)
cv_test_score, cv.best_params_

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-25 tree with explicitly chosen criterion / splitter / max_features.
# max_features='log2' samples a random subset of features at every split, so the model
# is seeded (same random_state as train_and_evaluate) to keep the results repeatable.
tree_clf = DecisionTreeClassifier(
    criterion='log_loss',
    splitter='best',
    max_features='log2',
    max_depth=25,
    random_state=42,
)
tree_clf.fit(X_train_s, y_train)

# Training-set predictions and their scores. With 0/1 labels the MSE of the predicted
# labels is the fraction of misclassified rows.
tree_predictions = tree_clf.predict(X_train_s)
tree_mse = mean_squared_error(y_train, tree_predictions)
tree_conf = confusion_matrix(y_train, tree_predictions)

# Cross-validated confusion matrix (currently disabled):
#tree_cv_predictions = cross_val_predict(tree_clf, X_train_s, y_train, cv=5)
#tree_cv_conf = confusion_matrix(y_train, tree_cv_predictions)

# Test-set confusion matrix for the same model.
tree_predictions_test = tree_clf.predict(X_test_s)
tree_conf_test = confusion_matrix(y_test, tree_predictions_test)

tree_mse, tree_conf, tree_conf_test

In [None]:
from sklearn.tree import export_graphviz

# Write the current tree_clf to 'tree_clf.dot' in Graphviz format,
# with rounded, colour-filled node boxes.
export_graphviz(
    decision_tree=tree_clf,
    out_file="tree_clf.dot",
    filled=True,
    rounded=True,
)

In [None]:
# Normalised feature importances of the fitted tree, read through the estimator's
# public attribute rather than the low-level tree_ object. The order follows the
# feature columns used for training.
tree_clf.feature_importances_

In [None]:
from sklearn.decomposition import PCA

# PCA keeping all components, fitted on the scaled training features.
pca = PCA().fit(X_train_s)

# Plot, then display, the share of variance explained by each component.
variance_ratio = pca.explained_variance_ratio_
plt.plot(variance_ratio)
variance_ratio

In [None]:
# Project the scaled training features onto the first two principal components.
pca_2d = PCA(n_components=2)
X_train_2d = pca_2d.fit_transform(X_train_s)

# y_train is a pandas Series carrying the row index from the split; compare its plain
# values so the boolean mask is purely positional against the NumPy array.
class_values = y_train.to_numpy()
colour = ['r', 'b']
for class_label, c in zip(np.unique(class_values), colour):
    mask = class_values == class_label
    plt.scatter(X_train_2d[mask, 0], X_train_2d[mask, 1],
                c=c, label=class_label, alpha=0.1)

# The scatter labels only appear once a legend is drawn; axis labels name the projection.
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Training data in the first two principal components')
plt.legend(title='WellMicroSeismicData');


In [None]:
# Principal axes of the full PCA fit: one row per component, one column per input feature.
pca.components_