In [1]:
import pickle
import sys
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# The project package lives one directory above the notebook, so extend the
# import path before importing from it.
sys.path.append('..')
from utils.func.functions import build_x

In [2]:
import os

# The storage root is the `storage` directory next to the notebook's parent folder.
path_to_storage = os.path.abspath(os.path.join(os.getcwd(), '../storage'))

### For Google Colab (change dir from local to GDrive)
### Mount gdrive and set path to folder
# from google.colab import drive
# drive.mount('/content/gdrive')
# path_to_storage = '/content/gdrive/My Drive/UCU-2019-final-project-storage'

# Both folders keep their trailing slash: later cells build file paths by
# appending a bare file name to these strings.
data_folder, serialization_objects_folder = (
    path_to_storage + subfolder
    for subfolder in ('/data/', '/serialization_objects/')
)

In [3]:
def _load_pickle(file_name):
    """Read one pickled object from `serialization_objects_folder`.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside `serialization_objects_folder`.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # written by this project's own preprocessing step.
    # The `with` block closes the file handle as soon as the object is read.
    with open(serialization_objects_folder + file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)


X_train = _load_pickle('X_train.p')
y_train = _load_pickle('y_train.p')

X_test = _load_pickle('X_test.p')
y_test = _load_pickle('y_test.p')

In [4]:
# Display names handed to classification_report in the evaluation cells.
target_names = ['not duplicate', 'duplicate']

# Build the model-ready feature frame for each split, then pick the labels by
# the resulting frame's index so features and labels stay row-aligned.
X_train_final = build_x(
    X_train,
    data_type='train',
    data_folder=serialization_objects_folder,
)
y_train_final = y_train.loc[X_train_final.index]

X_test_final = build_x(
    X_test,
    data_type='test',
    data_folder=serialization_objects_folder,
)
y_test_final = y_test.loc[X_test_final.index]

# The raw splits are not referenced again below; drop them from the kernel namespace.
del X_train, y_train, X_test, y_test

In [5]:
# Preview the first rows of the training feature frame returned by build_x.
X_train_final.head()

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,manhattan,minkowski,sqeuclidean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
243973,1.951959,0.846303,0.786173,0.785448,208.590857,14.06659,156.863274,11.348117,156.863274,11.348117,156.863274,11.348117,161.186004
82523,2.728833,0.824428,0.805639,0.805558,192.825788,12.275682,220.302082,15.97149,220.302082,15.97149,220.302082,15.970894,292.239386
373083,2.212937,0.829389,0.80507,0.804911,197.535168,8.169445,174.928238,12.749504,174.928238,12.749504,174.928238,12.749175,191.428591
145241,2.957288,0.760158,0.690485,0.688195,192.530215,14.008953,225.627409,16.318958,225.627409,16.318958,225.627409,16.31879,295.532763
227393,3.150512,0.753183,0.749205,0.748741,174.930979,17.595565,240.385828,17.358189,240.385828,17.358189,240.385828,17.357826,359.16503


In [6]:
# Shapes of the training features and labels; the row counts should match.
X_train_final.shape, y_train_final.shape

((270823, 13), (270823,))

In [8]:
# Shapes of the test features and labels; the row counts should match.
X_test_final.shape, y_test_final.shape

((133397, 13), (133397,))

### Logistic Regression

In [None]:
# Randomized hyperparameter search for logistic regression (5-fold CV).
logr_model = LogisticRegression(random_state=42)
param_grid = {
    'C': np.logspace(-2, 7, 10),    # 1e-2 ... 1e7
    'tol': np.logspace(-5, -1, 5),  # 1e-5 ... 1e-1
    # Not searched in this run:
    # 'penalty': ['l1', 'l2'],
    # 'solver': ['lbfgs'],
    # 'max_iter': np.linspace(10, 1000, 10),
}
# random_state fixes which candidates are sampled from the grid, so a re-run
# reproduces the same search (the XGBoost search below is seeded the same way).
logr_cv = RandomizedSearchCV(
    logr_model,
    param_distributions=param_grid,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
logr_cv.fit(X_train_final, y_train_final)

# Persist the fitted search so the next cell can reload it without re-running the fit.
dump(logr_cv, serialization_objects_folder+'logr_ramdomcv.joblib')

In [None]:
# Reload the fitted search from the folder the search cell dumped it to.
# (`input_folder` was never defined in this notebook and raised a NameError.)
logr_cv = load(serialization_objects_folder+'logr_ramdomcv.joblib')

In [None]:
# Hyperparameter combination selected by the randomized search.
logr_cv.best_params_

#### Evaluate

In [None]:
# Refit logistic regression on the whole training split with the hyperparameters
# selected by the search. LogisticRegression is already imported in the first cell.
# Unpacking best_params_ keeps this cell in step with whatever keys the search
# grid contains (currently 'C' and 'tol'), instead of copying each one by hand.
logr_model = LogisticRegression(random_state=42,
                                n_jobs=-1,
                                **logr_cv.best_params_)
logr_model.fit(X_train_final, y_train_final)

In [None]:
# Predicted class labels for the test split, used by the metric cells below.
logr_pred = logr_model.predict(X_test_final)

In [None]:
# Headline test-set metrics for the logistic regression model.
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)

# One print call with a newline separator emits the same lines as separate prints.
print(
    'Logistic Regression',
    'accuracy score : {}'.format(logr_acc_score),
    'precision score : {}'.format(logr_prec_score),
    'recall score : {}'.format(logr_rec_score),
    classification_report(y_test_final, logr_pred, target_names=target_names),
    sep='\n',
)

In [None]:
# A ROC curve is traced by sweeping a threshold over a continuous score.
# Feeding it the already-thresholded labels from predict() yields a single
# operating point and a misleading AUC, so use the positive-class probability.
# Assumes the positive class is the second entry of logr_model.classes_
# (labels 0/1) — TODO confirm.
logr_scores = logr_model.predict_proba(X_test_final)[:, 1]
logr_fpr, logr_tpr, _ = roc_curve(y_test_final, logr_scores)
logr_roc_auc = auc(logr_fpr, logr_tpr)

# NOTE(review): plot_roc is not defined or imported in the cells visible here —
# confirm where it comes from before running on a fresh kernel.
plot_roc(logr_fpr, logr_tpr, logr_roc_auc)

### Gradient Boosted Machine (XGBoost)

In [None]:
# Candidate values sampled by the randomized XGBoost search.
# np.linspace includes the endpoint by default.
params_xgb = {
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'gamma': np.linspace(.01, 1, 10),
    'learning_rate': np.linspace(.01, 1, 10),
    'reg_lambda': np.linspace(0.01, 10, 20),
    'max_depth': np.linspace(1, 32, 32, dtype=int),
}

# 5-fold randomized search; both the estimator and the sampler are seeded.
cv_xgb = RandomizedSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_distributions=params_xgb,
    cv=5,
    n_jobs=3,
    random_state=42,
)
cv_xgb.fit(X_train_final, y_train_final)

# Persist the fitted search so the next cell can reload it without re-running the fit.
dump(cv_xgb, serialization_objects_folder+'xgb_ramdomcv.joblib')



In [None]:
# Reload the fitted XGBoost search from the file the search cell dumped.
cv_xgb = load(serialization_objects_folder+'xgb_ramdomcv.joblib')

In [None]:
# Hyperparameter combination selected by the randomized search.
cv_xgb.best_params_

#### Evaluate

In [None]:
# Refit XGBoost on the whole training split with the hyperparameters selected
# by the search. Unpacking best_params_ passes exactly the keys that were
# searched (n_estimators, gamma, learning_rate, reg_lambda, max_depth).
clf_xgb_model = xgb.XGBClassifier(random_state=42,
                                  objective='binary:logistic',
                                  n_jobs=-1,
                                  **cv_xgb.best_params_)

# Time the fit. `time` is imported in the first cell; perf_counter is the
# monotonic clock intended for measuring elapsed intervals.
t0 = time.perf_counter()
clf_xgb_model.fit(X_train_final, y_train_final)
t1 = time.perf_counter() - t0  # elapsed seconds
print(t1)

In [None]:
# Predict class labels on the test split. This assignment was missing, so
# every use of y_pred_xgb below raised a NameError on a fresh kernel.
y_pred_xgb = clf_xgb_model.predict(X_test_final)

# Headline test-set metrics for the XGBoost model.
score_xgb = accuracy_score(y_test_final, y_pred_xgb)
rscore_xgb = recall_score(y_test_final, y_pred_xgb)
pscore_xgb = precision_score(y_test_final, y_pred_xgb)
print('Accuracy score for XGBoost ', score_xgb)
print('Recall score for XGBoost ', rscore_xgb)
print('Precision score for XGBoost ', pscore_xgb)
print(classification_report(y_test_final, y_pred_xgb, target_names=target_names))

In [None]:
# A ROC curve is traced by sweeping a threshold over a continuous score.
# Hard class labels yield a single operating point and a misleading AUC, so
# score the test split with the positive-class probability instead.
# Assumes the positive class is the second entry of clf_xgb_model.classes_
# (labels 0/1) — TODO confirm.
xgb_scores = clf_xgb_model.predict_proba(X_test_final)[:, 1]
xgb_fpr, xgb_tpr, _ = roc_curve(y_test_final, xgb_scores)
xgb_roc_auc = auc(xgb_fpr, xgb_tpr)

# NOTE(review): plot_roc is not defined or imported in the cells visible here —
# confirm where it comes from before running on a fresh kernel.
plot_roc(xgb_fpr, xgb_tpr, xgb_roc_auc)