In [1]:
#constants
x_train_data_path = "../data/processed/3_x_train.pkl"
y_train_data_path = "../data/processed/3_y_train_label.pkl"
x_test_data_path = "../data/processed/3_x_test.pkl"
y_test_data_path = "../data/processed/3_y_test_label.pkl"

MODEL_PATH = "../models/"

In [2]:
# Load packages
import pandas as pd 
import numpy as np

from pathlib import Path
import logging
import pickle
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
 
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score,confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

from imblearn.over_sampling import RandomOverSampler,SMOTE

In [3]:
# Load train and test data
x_train = pd.read_pickle(x_train_data_path)
y_train = pd.read_pickle(y_train_data_path)
x_test = pd.read_pickle(x_test_data_path)
y_test = pd.read_pickle(y_test_data_path)

In [4]:
# create a list of numerical and categorical columns
x_cat_cols = list(x_train.select_dtypes(include=['object']).columns)
x_num_cols = list(x_train.select_dtypes(exclude=['object']).columns)

## Transformation pipeline

In [5]:
# create pipeline for numerical attributes, added simpleimputer to impute missing data in case of new data that has missing values.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'),
                               StandardScaler())
 
# create pipeline for categorical attributes                            
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                               OneHotEncoder(handle_unknown = "ignore"))

# Combine both num and cate pipeline using column transformer
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, x_num_cols),
    ("cat", cat_pipeline, x_cat_cols),
    ])


In [6]:
# apply all transformations
x_train_pipe = full_pipeline.fit_transform(x_train)
x_train_pipe

<80000x73 sparse matrix of type '<class 'numpy.float64'>'
	with 1440000 stored elements in Compressed Sparse Row format>

In [7]:
x_train_pipe.shape

(80000, 73)

## Select and Train a Model

- **Models to try:**
    - **Logistic Regression**
    - **Random Forest**
    - **K-Nearest Neigbors**

In [9]:
# Logisitic Regression classifier
log_clf = LogisticRegression(max_iter = 1000)
log_clf.fit(x_train_pipe, y_train)

In [11]:
# K-Nearest Neigbor classifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train_pipe,y_train)

In [12]:
# Random Forest Classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(x_train_pipe, y_train)

In [15]:
log_clf_accuracy = cross_val_score(log_clf,x_train_pipe,y_train, cv=3, scoring ='accuracy')
log_clf_f1 = cross_val_score(log_clf,x_train_pipe,y_train, cv=3, scoring ='f1')

print('log_clf_accuracy: ' + str(log_clf_accuracy)+ ', mean: ' + str(log_clf_accuracy.mean()))
print('log_clf_f1: '+ str(log_clf_f1)+', mean: ' +str(log_clf_f1.mean()))

log_clf_accuracy: [0.85997675 0.85851427 0.85955899], mean: 0.8593500026123295
log_clf_f1: [0.25290116 0.260631   0.26323038], mean: 0.25892084586609093


In [16]:
knn_clf_accuracy = cross_val_score(knn_clf,x_train_pipe,y_train, cv=3, scoring ='accuracy')
knn_clf_f1 = cross_val_score(knn_clf,x_train_pipe,y_train, cv=3, scoring ='f1')

print('knn_clf_accuracy: ' + str(knn_clf_accuracy)+ ', mean: ' + str(knn_clf_accuracy.mean()))
print('knn_clf_f1: '+ str(knn_clf_f1)+', mean: ' +str(knn_clf_f1.mean()))

knn_clf_accuracy: [0.83050212 0.82701466 0.83143329], mean: 0.8296500222907944
knn_clf_f1: [0.27726255 0.27775168 0.2767498 ], mean: 0.27725467798224784


In [24]:
rf_clf_accuracy = cross_val_score(rf_clf,x_train_pipe,y_train, cv=3, scoring ='accuracy')
rf_clf_f1 = cross_val_score(rf_clf,x_train_pipe,y_train, cv=3, scoring ='f1')

print('rf_clf_accuracy: ' + str(rf_clf_accuracy)+ ', mean: ' + str(rf_clf_accuracy.mean()))
print('rf_clf_f1: '+ str(rf_clf_f1)+', mean: ' +str(rf_clf_f1.mean()))

rf_clf_accuracy: [0.85022687 0.84853939 0.85115878], mean: 0.8499750147970522
rf_clf_f1: [0.22668748 0.21920991 0.21879327], mean: 0.22156355479846646


In [25]:
# save models to easy load again
model = {'log_clf': log_clf}
with open(os.path.join(MODEL_PATH, 'log_clf_0.1.pkl'), "wb") as output_file:
    pickle.dump(model, output_file)

In [26]:
# save models to easy load again
model = {'knn_clf': knn_clf}
with open(os.path.join(MODEL_PATH, 'knn_clf_0.1.pkl'), "wb") as output_file:
    pickle.dump(model, output_file)

In [27]:
# save models to easy load again
model = {'rf_clf': rf_clf}
with open(os.path.join(MODEL_PATH, 'rf_clf_0.1.pkl'), "wb") as output_file:
    pickle.dump(model, output_file)

- **Since we are dealing with imbalanced data, It would be a good idea to try:**
    - **Smote over sampling with the best model**
    - **Random over sampling with the best model**

In [8]:
# smote over sampling
over = SMOTE()
x_over, y_over = over.fit_resample(x_train_pipe, y_train)

In [9]:
# let's use it on logisitic Regression, since it take no time to train
log_clf_over = LogisticRegression(max_iter=1000)
log_clf_over.fit(x_over,y_over)

In [10]:
# K-Nearest Neigbor classifier
knn_clf_over = KNeighborsClassifier(n_neighbors=3)
knn_clf_over.fit(x_over,y_over)

In [11]:
# Random Forest Classifier
rf_clf_over = RandomForestClassifier()
rf_clf_over.fit(x_over, y_over)

In [40]:
log_clf_over_accuracy = cross_val_score(log_clf_over,x_over,y_over, cv=3, scoring ='accuracy')
log_clf_over_f1 = cross_val_score(log_clf_over,x_over,y_over, cv=3, scoring ='f1')

print('log_clf_over_accuracy: ' + str(log_clf_over_accuracy)+ ', mean: ' + str(log_clf_over_accuracy.mean()))
print('log_clf_over_f1: '+ str(log_clf_over_f1)+', mean: ' +str(log_clf_over_f1.mean()))

log_clf_over_accuracy: [0.79468657 0.80256315 0.80701945], mean: 0.801423058374149
log_clf_over_f1: [0.80536304 0.81537397 0.81907016], mean: 0.8132690550882739


In [None]:
knn_clf_over_accuracy = cross_val_score(knn_clf_over,x_over,y_over, cv=3, scoring ='accuracy')
knn_clf_over_f1 = cross_val_score(knn_clf_over,x_over,y_over, cv=3, scoring ='f1')

print('knn_clf_over_accuracy: ' + str(knn_clf_over_accuracy)+ ', mean: ' + str(knn_clf_over_accuracy.mean()))
print('knn_clf_over_f1: '+ str(knn_clf_over_f1)+', mean: ' +str(knn_clf_over_f1.mean()))

In [None]:
rf_clf_over_accuracy = cross_val_score(rf_clf_over,x_over,y_over, cv=3, scoring ='accuracy')
rf_clf_over_f1 = cross_val_score(rf_clf_over,x_over,y_over, cv=3, scoring ='f1')

print('rf_clf_over_accuracy: ' + str(rf_clf_over_accuracy)+ ', mean: ' + str(rf_clf_over_accuracy.mean()))
print('rf_clf_over_f1: '+ str(rf_clf_over_f1)+', mean: ' +str(rf_clf_over_f1.mean()))

## Fine Tuning the best model

In [None]:
# RandomForest finetuning

# create the param grid of features you want to search
rf_param_grid = [
    {'n_estimators':[10,20,30,40,50], 'max_features': [4,6,8,10]},
    {'bootstrap': [False], 'n_estimators': [10,20,30,40,50], 'max_features': [4,6,8,10]},
]

rf_clf_over_grid_search = GridSearchCV(rf_clf, forest_param_grid, cv= 3, scoring='f1', return_train_score=True,verbose=2)

rf_cld_over_grid_search.fit(x_train_pipe, y_train)

In [None]:
rf_clf_grid_search.best_estimator_

## Evaluate Your System on the Test set

In [12]:
# apply transformation on the test set
x_test_pipe = full_pipeline.transform(x_test)

In [13]:
pred_log_clf_over = log_clf_over.predict(x_test_pipe)
pred_knn_clf_over = knn_clf_over.predict(x_test_pipe)
pred_rf_clf_over = rf_clf_over.predict(x_test_pipe)

In [18]:
print('The accuracy of the Logistic Regression is',accuracy_score(pred_log_clf_over,y_test))
print(classification_report(pred_log_clf_over, y_test))

The accuracy of the Logistic Regression is 0.75925
              precision    recall  f1-score   support

           0       0.74      0.97      0.84     13199
           1       0.85      0.36      0.50      6801

    accuracy                           0.76     20000
   macro avg       0.80      0.66      0.67     20000
weighted avg       0.78      0.76      0.73     20000



In [19]:
print('The accuracy of the k-Nearest neigbors is',accuracy_score(pred_knn_clf_over,y_test))
print(classification_report(pred_knn_clf_over, y_test))

The accuracy of the k-Nearest neigbors is 0.76635
              precision    recall  f1-score   support

           0       0.80      0.92      0.85     14859
           1       0.58      0.32      0.42      5141

    accuracy                           0.77     20000
   macro avg       0.69      0.62      0.64     20000
weighted avg       0.74      0.77      0.74     20000



In [21]:
print('The accuracy of the Random forest is',accuracy_score(pred_rf_clf_over,y_test))
print(classification_report(pred_rf_clf_over, y_test))

The accuracy of the Random forest is 0.803
              precision    recall  f1-score   support

           0       0.86      0.91      0.88     16232
           1       0.47      0.36      0.41      3768

    accuracy                           0.80     20000
   macro avg       0.66      0.63      0.64     20000
weighted avg       0.79      0.80      0.79     20000



In [24]:
conf_mx = confusion_matrix(y_test , pred_log_clf_over)
conf_mx

array([[12759,  4375],
       [  440,  2426]], dtype=int64)

 - **`Logistic Regression` is the best model, as:**
     - **It has the highest F1 score and accuracy compared to other models.**

## Import the Best Model

In [23]:
import pickle
pickl = {'log_clf_over': log_clf_over}
with open(os.path.join(MODEL_PATH, 'log_clf_over_0.1.0.pkl'), "wb") as output_file:
    pickle.dump(pickl, output_file)

## Things to try next

- **Try `Random over sampling` and compare it with `Smote over sampling`**
- **Try different ensemble methods**
- **Try PCA with the best model**
- **Analyze the best models and their errors, feature importances and remove useless features**
- **Refactor the code into proper scripts, instead of notebooks**
- **Build an api to recieve data, do the whole process of preprocessing,transformation and return prediction**