<a href="https://colab.research.google.com/github/EmperoR1127/CSI5155_project/blob/master/seismic_bumps_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset and the figure output folder used by this notebook are read
# from / written to paths under that mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from scipy.io import arff
import pandas as pd

# to make this notebook's output stable across runs
# (this seeds NumPy's global random generator only; objects that are given an
# explicit random_state argument are controlled by that argument instead)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied to every figure
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Root folder (on the mounted Drive) under which figures are saved
PROJECT_ROOT_DIR = "/content/drive/My Drive/Images/"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<PROJECT_ROOT_DIR>/images/<fig_id>.png``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without the ``.png`` extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before the figure is saved.

    The target folder is created if it does not exist yet, so the first call
    on a fresh Drive does not fail because of a missing directory.
    """
    images_dir = os.path.join(PROJECT_ROOT_DIR, "images")
    # isdir + makedirs (rather than exist_ok=True) keeps this usable on
    # Python 2 as well, which the notebook's __future__ import targets.
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)
    path = os.path.join(images_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Load the ARFF file. loadarff returns a (records, metadata) pair; only the
# records are used to build the DataFrame.
data = arff.loadarff('/content/drive/My Drive/Data/seismic-bumps.arff')
seismic_bumps_data = pd.DataFrame(data[0])

#pre-process the train_set
# Nominal attributes of the dataset; every other non-target column is
# treated as numeric.
categorical_columns = ["seismic", "seismoacoustic", "shift", "ghazard"]

train_labels = seismic_bumps_data[["class"]].copy()
train_set = seismic_bumps_data.drop(["class"], axis=1)

# The class values arrive as bytes objects (e.g. b'1'). Decode them properly
# instead of indexing into the repr string, which only works for
# single-character labels and only on Python 3.
train_labels["class"] = train_labels["class"].map(
    lambda raw: raw.decode("utf-8") if isinstance(raw, bytes) else str(raw)
)

# Column subsets; they are used to tell the preprocessing pipeline which
# columns are numeric and which are categorical.
train_set_num = train_set.drop(categorical_columns, axis=1)
train_set_cat = train_set[categorical_columns]
#build the pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
# Route numeric columns through num_pipeline and one-hot encode the
# categorical columns; the column names come from the two subsets built above.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, list(train_set_num)),
    ("cat", OneHotEncoder(), list(train_set_cat)),
])
#prepare the data
train_set_prepared = full_pipeline.fit_transform(train_set)
#prepare the target
encoder = LabelEncoder()
# LabelEncoder expects a 1-D target. Passing the "class" Series instead of the
# single-column DataFrame yields the same encoded labels without the
# column-vector conversion warning.
train_labels_prepared = encoder.fit_transform(train_labels["class"])

  y = column_or_1d(y, warn=True)


In [18]:
#rebalance the dataset using 3 different approaches
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN
def _sorted_class_counts(labels):
    """Return the ``(label, count)`` pairs of ``labels`` ordered by label."""
    return sorted(Counter(labels).items())


# Oversampling: random oversampling
ros = RandomOverSampler(random_state=42)
ros_resampled = ros.fit_resample(train_set_prepared, train_labels_prepared)
ros_train_set_prepared, ros_train_labels_prepared = ros_resampled
print("Class distribution of oversampling" + str(_sorted_class_counts(ros_train_labels_prepared)))

# Undersampling: repeated edited nearest neighbours
renn = RepeatedEditedNearestNeighbours()
renn_resampled = renn.fit_resample(train_set_prepared, train_labels_prepared)
renn_train_set_prepared, renn_train_labels_prepared = renn_resampled
print("Class distribution of undersampling" + str(_sorted_class_counts(renn_train_labels_prepared)))

# Balanced sampling: SMOTE combined with edited nearest neighbours
smote_enn = SMOTEENN(random_state=0)
smote_enn_resampled = smote_enn.fit_resample(train_set_prepared, train_labels_prepared)
smote_enn_train_set_prepared, smote_enn_train_labels_prepared = smote_enn_resampled
print("Class distribution of balanced sampling" + str(_sorted_class_counts(smote_enn_train_labels_prepared)))

Class distribution of oversampling[(0, 2414), (1, 2414)]
Class distribution of undersampling[(0, 1920), (1, 170)]
Class distribution of balanced sampling[(0, 1818), (1, 2194)]


In [22]:
#train the model using DECISION TREE algorithm with 10 fold cross validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
# random_state is pinned so the scores below do not depend on the state of
# NumPy's global random generator (i.e. on which cells ran before this one).
dt_clf = DecisionTreeClassifier(random_state=42)

# cross_val_predict fits its own clone of the estimator on every fold, so the
# classifier is passed in as-is; fitting it on the full dataset beforehand has
# no effect on the out-of-fold predictions.
# NOTE(review): the three datasets were resampled before being split into
# folds, so resampled rows may end up in both the training and the validation
# part of a fold and make these scores optimistic -- confirm, e.g. by
# resampling inside each fold.
#10 fold cross validation on the oversampling dataset
dt_ros_train_prediction = cross_val_predict(dt_clf, ros_train_set_prepared, ros_train_labels_prepared, cv=10)
#10 fold cross validation on the undersampling dataset
dt_renn_train_prediction = cross_val_predict(dt_clf, renn_train_set_prepared, renn_train_labels_prepared, cv=10)
#10 fold cross validation on the balanced sampling dataset
dt_smote_enn_train_prediction = cross_val_predict(dt_clf, smote_enn_train_set_prepared, smote_enn_train_labels_prepared, cv=10)

# Leave dt_clf fitted on the balanced sampling dataset so it can be used
# directly for predictions after this cell.
dt_clf.fit(smote_enn_train_set_prepared, smote_enn_train_labels_prepared)

#calculate the precision and recall on each dataset
dt_ros_precision_score = precision_score(ros_train_labels_prepared, dt_ros_train_prediction)
dt_ros_recall_score = recall_score(ros_train_labels_prepared, dt_ros_train_prediction)
dt_renn_precision_score = precision_score(renn_train_labels_prepared, dt_renn_train_prediction)
dt_renn_recall_score = recall_score(renn_train_labels_prepared, dt_renn_train_prediction)
dt_smote_enn_precision_score = precision_score(smote_enn_train_labels_prepared, dt_smote_enn_train_prediction)
dt_smote_enn_recall_score = recall_score(smote_enn_train_labels_prepared, dt_smote_enn_train_prediction)

#report the scores, one group per dataset, separated by a dashed line
dt_report = [
    ("oversampling", dt_ros_precision_score, dt_ros_recall_score),
    ("undersampling", dt_renn_precision_score, dt_renn_recall_score),
    ("balanced sampling", dt_smote_enn_precision_score, dt_smote_enn_recall_score),
]
for position, (dataset_name, precision, recall) in enumerate(dt_report):
    if position:
        print("-----------------------------------------")
    print("precision of decision tree model on %s dataset is + %f" % (dataset_name, precision))
    print("recall of decision tree model on %s dataset is + %f" % (dataset_name, recall))

precision of decision tree model on oversampling dataset is + 0.903781
recall of decision tree model on oversampling dataset is + 1.000000
-----------------------------------------
precision of decision tree model on undersampling dataset is + 0.267490
recall of decision tree model on undersampling dataset is + 0.382353
-----------------------------------------
precision of decision tree model on balanced sampling dataset is + 0.893926
recall of decision tree model on balanced sampling dataset is + 0.952598


In [24]:
#train the model using K NEAREST NEIGHBOURS algorithm with 10 fold cross validation
from sklearn import neighbors
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
n_neighbors = 10
# Neighbours are weighted by distance (weights='distance').
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')

# cross_val_predict fits its own clone of the estimator on every fold, so the
# classifier is passed in as-is; fitting it on the full dataset beforehand has
# no effect on the out-of-fold predictions.
# NOTE(review): the three datasets were resampled before being split into
# folds, so resampled rows may end up in both the training and the validation
# part of a fold and make these scores optimistic -- confirm, e.g. by
# resampling inside each fold.
#10 fold cross validation on the oversampling dataset
knn_ros_train_prediction = cross_val_predict(knn_clf, ros_train_set_prepared, ros_train_labels_prepared, cv=10)
#10 fold cross validation on the undersampling dataset
knn_renn_train_prediction = cross_val_predict(knn_clf, renn_train_set_prepared, renn_train_labels_prepared, cv=10)
#10 fold cross validation on the balanced sampling dataset
knn_smote_enn_train_prediction = cross_val_predict(knn_clf, smote_enn_train_set_prepared, smote_enn_train_labels_prepared, cv=10)

# Leave knn_clf fitted on the balanced sampling dataset so it can be used
# directly for predictions after this cell.
knn_clf.fit(smote_enn_train_set_prepared, smote_enn_train_labels_prepared)

#calculate the precision and recall on each dataset
knn_ros_precision_score = precision_score(ros_train_labels_prepared, knn_ros_train_prediction)
knn_ros_recall_score = recall_score(ros_train_labels_prepared, knn_ros_train_prediction)
knn_renn_precision_score = precision_score(renn_train_labels_prepared, knn_renn_train_prediction)
knn_renn_recall_score = recall_score(renn_train_labels_prepared, knn_renn_train_prediction)
knn_smote_enn_precision_score = precision_score(smote_enn_train_labels_prepared, knn_smote_enn_train_prediction)
knn_smote_enn_recall_score = recall_score(smote_enn_train_labels_prepared, knn_smote_enn_train_prediction)

#report the scores, one group per dataset, separated by a dashed line
knn_report = [
    ("oversampling", knn_ros_precision_score, knn_ros_recall_score),
    ("undersampling", knn_renn_precision_score, knn_renn_recall_score),
    ("balanced sampling", knn_smote_enn_precision_score, knn_smote_enn_recall_score),
]
for position, (dataset_name, precision, recall) in enumerate(knn_report):
    if position:
        print("-----------------------------------------")
    print("precision of k-nearest neighbours model on %s dataset is + %f" % (dataset_name, precision))
    print("recall of k-nearest neighbours model on %s dataset is + %f" % (dataset_name, recall))

precision of k-nearest neighbours model on oversampling dataset is + 0.776955
recall of k-nearest neighbours model on oversampling dataset is + 1.000000
-----------------------------------------
precision of k-nearest neighbours model on undersampling dataset is + 0.534091
recall of k-nearest neighbours model on undersampling dataset is + 0.276471
-----------------------------------------
precision of k-nearest neighbours model on balanced sampling dataset is + 0.893399
recall of k-nearest neighbours model on balanced sampling dataset is + 0.993163


In [25]:
#train the model using NAIVE BAYES algorithm with 10 fold cross validation
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
gnb_clf = GaussianNB()

# cross_val_predict fits its own clone of the estimator on every fold, so the
# classifier is passed in as-is; fitting it on the full dataset beforehand has
# no effect on the out-of-fold predictions.
# NOTE(review): the three datasets were resampled before being split into
# folds, so resampled rows may end up in both the training and the validation
# part of a fold and make these scores optimistic -- confirm, e.g. by
# resampling inside each fold.
#10 fold cross validation on the oversampling dataset
gnb_ros_train_prediction = cross_val_predict(gnb_clf, ros_train_set_prepared, ros_train_labels_prepared, cv=10)
#10 fold cross validation on the undersampling dataset
gnb_renn_train_prediction = cross_val_predict(gnb_clf, renn_train_set_prepared, renn_train_labels_prepared, cv=10)
#10 fold cross validation on the balanced sampling dataset
gnb_smote_enn_train_prediction = cross_val_predict(gnb_clf, smote_enn_train_set_prepared, smote_enn_train_labels_prepared, cv=10)

# Leave gnb_clf fitted on the balanced sampling dataset so it can be used
# directly for predictions after this cell.
gnb_clf.fit(smote_enn_train_set_prepared, smote_enn_train_labels_prepared)

#calculate the precision and recall on each dataset
gnb_ros_precision_score = precision_score(ros_train_labels_prepared, gnb_ros_train_prediction)
gnb_ros_recall_score = recall_score(ros_train_labels_prepared, gnb_ros_train_prediction)
gnb_renn_precision_score = precision_score(renn_train_labels_prepared, gnb_renn_train_prediction)
gnb_renn_recall_score = recall_score(renn_train_labels_prepared, gnb_renn_train_prediction)
gnb_smote_enn_precision_score = precision_score(smote_enn_train_labels_prepared, gnb_smote_enn_train_prediction)
gnb_smote_enn_recall_score = recall_score(smote_enn_train_labels_prepared, gnb_smote_enn_train_prediction)

#report the scores, one group per dataset, separated by a dashed line
gnb_report = [
    ("oversampling", gnb_ros_precision_score, gnb_ros_recall_score),
    ("undersampling", gnb_renn_precision_score, gnb_renn_recall_score),
    ("balanced sampling", gnb_smote_enn_precision_score, gnb_smote_enn_recall_score),
]
for position, (dataset_name, precision, recall) in enumerate(gnb_report):
    if position:
        print("-----------------------------------------")
    print("precision of naive bayes model on %s dataset is + %f" % (dataset_name, precision))
    print("recall of naive bayes model on %s dataset is + %f" % (dataset_name, recall))

precision of naive bayes model on oversampling dataset is + 0.593566
recall of naive bayes model on oversampling dataset is + 0.917150
-----------------------------------------
precision of naive bayes model on undersampling dataset is + 0.118022
recall of naive bayes model on undersampling dataset is + 0.870588
-----------------------------------------
precision of naive bayes model on balanced sampling dataset is + 0.698139
recall of naive bayes model on balanced sampling dataset is + 0.923428


In [26]:
#train the model using RULE BASED algorithm (implemented here with sklearn's DummyClassifier, strategy='stratified') with 10 fold cross validation
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
# The "rule based" model is a DummyClassifier with the 'stratified' strategy.
# That strategy draws its predictions at random, so random_state is pinned to
# keep the scores below the same from one run to the next.
dc_clf = DummyClassifier(strategy='stratified', random_state=42)

# cross_val_predict fits its own clone of the estimator on every fold, so the
# classifier is passed in as-is; fitting it on the full dataset beforehand has
# no effect on the out-of-fold predictions.
# NOTE(review): the three datasets were resampled before being split into
# folds, so resampled rows may end up in both the training and the validation
# part of a fold and make these scores optimistic -- confirm, e.g. by
# resampling inside each fold.
#10 fold cross validation on the oversampling dataset
dc_ros_train_prediction = cross_val_predict(dc_clf, ros_train_set_prepared, ros_train_labels_prepared, cv=10)
#10 fold cross validation on the undersampling dataset
dc_renn_train_prediction = cross_val_predict(dc_clf, renn_train_set_prepared, renn_train_labels_prepared, cv=10)
#10 fold cross validation on the balanced sampling dataset
dc_smote_enn_train_prediction = cross_val_predict(dc_clf, smote_enn_train_set_prepared, smote_enn_train_labels_prepared, cv=10)

# Leave dc_clf fitted on the balanced sampling dataset so it can be used
# directly for predictions after this cell.
dc_clf.fit(smote_enn_train_set_prepared, smote_enn_train_labels_prepared)

#calculate the precision and recall on each dataset
dc_ros_precision_score = precision_score(ros_train_labels_prepared, dc_ros_train_prediction)
dc_ros_recall_score = recall_score(ros_train_labels_prepared, dc_ros_train_prediction)
dc_renn_precision_score = precision_score(renn_train_labels_prepared, dc_renn_train_prediction)
dc_renn_recall_score = recall_score(renn_train_labels_prepared, dc_renn_train_prediction)
dc_smote_enn_precision_score = precision_score(smote_enn_train_labels_prepared, dc_smote_enn_train_prediction)
dc_smote_enn_recall_score = recall_score(smote_enn_train_labels_prepared, dc_smote_enn_train_prediction)

#report the scores, one group per dataset, separated by a dashed line
dc_report = [
    ("oversampling", dc_ros_precision_score, dc_ros_recall_score),
    ("undersampling", dc_renn_precision_score, dc_renn_recall_score),
    ("balanced sampling", dc_smote_enn_precision_score, dc_smote_enn_recall_score),
]
for position, (dataset_name, precision, recall) in enumerate(dc_report):
    if position:
        print("-----------------------------------------")
    print("precision of rule based model on %s dataset is + %f" % (dataset_name, precision))
    print("recall of rule based model on %s dataset is + %f" % (dataset_name, recall))

precision of rule based model on oversampling dataset is + 0.500207
recall of rule based model on oversampling dataset is + 0.500000
-----------------------------------------
precision of rule based model on undersampling dataset is + 0.071429
recall of rule based model on undersampling dataset is + 0.076471
-----------------------------------------
precision of rule based model on balanced sampling dataset is + 0.542313
recall of rule based model on balanced sampling dataset is + 0.557885
