In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time

In [3]:
import scipy.sparse  # import the submodule explicitly; a bare `import scipy` does not guarantee it is loaded

# Feature matrix, stored on disk in SciPy's sparse .npz format.
X = scipy.sparse.load_npz('../new_data/X_sparse.npz')
# Labels. The first parsed row is dropped ([1:]) — presumably the CSV header; verify against the file.
Y = np.genfromtxt('../new_data/Y.csv', delimiter=',')[1:]

# Hold out 20% of the data as the final test set ...
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# ... then split the remainder into train/validation. Seeded like the split above so
# that the training set (and therefore every metric below) is the same on each run.
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2, random_state=0)

In [4]:
# Hyperparameter values taken from the grid search results.
# Every estimator is seeded so that repeated runs train identical models.

# An L1 penalty requires a solver that supports it. scikit-learn's default solver
# is 'lbfgs' in current releases (no L1 support), so 'liblinear' is named explicitly.
lr = LogisticRegression(C=0.876839, penalty='l1', solver='liblinear', random_state=0)

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(200, 50, 50), activation='relu',
                    solver='lbfgs', alpha=0.0001, learning_rate='constant',
                    random_state=0)

# `loss` is left at its default: the original passed "deviance", a deprecated alias
# for that default which newer scikit-learn releases reject.
gb = GradientBoostingClassifier(learning_rate=0.2,
                                n_estimators=1500, max_depth=3,
                                min_samples_split=6, min_samples_leaf=1,
                                max_features='sqrt', subsample=0.95,
                                random_state=0)

In [5]:
# Fit the three tuned classifiers on the training split, one after another,
# announcing the start and end of each fit.
for estimator, started_msg, finished_msg in (
    (lr, "lr training started", "lr training finished"),
    (mlp, "mlp training started", "mlp training finished"),
    (gb, "gb training started", "gb training finished"),
):
    print(started_msg)
    estimator.fit(Xtr, Ytr)
    print(finished_msg)

lr training started
lr training finished
mlp training started
mlp training finished
gb training started
gb training finished


## Classification Report of Different Classifiers

In [6]:
# Display names handed to classification_report via its target_names argument.
# NOTE(review): assumes label 0 means "negative" and label 1 "positive" — confirm against Y.csv.
target_names = ['negative', 'positive']

In [7]:
# Predict the held-out test set with the tuned logistic regression model.
lr_pred = lr.predict(X_test)

# Per-class precision / recall / F1 against the true test labels.
print("Logistic Regression Classifier Report")
lr_report = classification_report(Y_test, lr_pred, target_names=target_names)
print(lr_report)

Logistic Regression Classifier Report
              precision    recall  f1-score   support

    negative       0.78      0.73      0.75     10031
    positive       0.74      0.79      0.77      9969

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



In [8]:
# Predict the held-out test set with the tuned neural network.
mlp_pred = mlp.predict(X_test)

# Per-class precision / recall / F1 against the true test labels.
print("Neural Network Classifier Report")
mlp_report = classification_report(Y_test, mlp_pred, target_names=target_names)
print(mlp_report)

Neural Network Classifier Report
              precision    recall  f1-score   support

    negative       0.72      0.72      0.72     10031
    positive       0.72      0.72      0.72      9969

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



In [9]:
# Predict the held-out test set with the tuned gradient boosting model.
gb_pred = gb.predict(X_test)

# Per-class precision / recall / F1 against the true test labels.
print("Gradient Boosting Classifier Report")
gb_report = classification_report(Y_test, gb_pred, target_names=target_names)
print(gb_report)

Gradient Boosting Classifier Report
              precision    recall  f1-score   support

    negative       0.79      0.72      0.75     10031
    positive       0.74      0.80      0.77      9969

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



## Check the similarity of the results predicted by different classifiers

In [10]:
# For each model, separate its test-set predictions into those equal to 1 and
# those equal to 0 (unpacked in that order).
lr_pred_1, lr_pred_0 = (lr_pred[lr_pred == label] for label in (1, 0))
mlp_pred_1, mlp_pred_0 = (mlp_pred[mlp_pred == label] for label in (1, 0))
gb_pred_1, gb_pred_0 = (gb_pred[gb_pred == label] for label in (1, 0))

In [11]:
# Pairwise agreement between the three tuned models: the fraction of test
# samples on which both models of a pair predict the same label.
print("Similarity: ")

for line_format, first_pred, second_pred in (
    ("\tlr  vs. mlp:\t%.4f", lr_pred, mlp_pred),
    ("\tlr  vs. gb: \t%.4f", lr_pred, gb_pred),
    ("\tmlp vs. gb: \t%.4f", mlp_pred, gb_pred),
):
    # Mean of the element-wise equality mask == matching count / total count.
    print(line_format % ((first_pred == second_pred).mean()))

Similarity: 
	lr  vs. mlp:	0.8075
	lr  vs. gb: 	0.9345
	mlp vs. gb: 	0.7932


In [12]:
# Shape of the logistic regression predictions that equal 1, i.e. how many there are.
lr_pred_1.shape

(10557,)

In [13]:
# Shape of the neural network predictions that equal 1, i.e. how many there are.
mlp_pred_1.shape

(9976,)

In [14]:
# Shape of the gradient boosting predictions that equal 1, i.e. how many there are.
gb_pred_1.shape

(10832,)

## Classifiers with default hyperparameters

In [15]:
# Baseline: gradient boosting with scikit-learn's default hyperparameters, fitted
# on the same training split and scored on the same test set as the tuned model.
gb_default = GradientBoostingClassifier().fit(Xtr, Ytr)
gb_default_pred = gb_default.predict(X_test)

gb_default_report = classification_report(Y_test, gb_default_pred, target_names=target_names)
print(gb_default_report)

              precision    recall  f1-score   support

    negative       0.78      0.54      0.64     10031
    positive       0.65      0.85      0.73      9969

    accuracy                           0.69     20000
   macro avg       0.72      0.69      0.69     20000
weighted avg       0.72      0.69      0.69     20000



In [16]:
# Baseline: logistic regression with scikit-learn's default hyperparameters, fitted
# on the same training split and scored on the same test set as the tuned model.
lr_default = LogisticRegression().fit(Xtr, Ytr)
lr_default_pred = lr_default.predict(X_test)

lr_default_report = classification_report(Y_test, lr_default_pred, target_names=target_names)
print(lr_default_report)

              precision    recall  f1-score   support

    negative       0.77      0.74      0.75     10031
    positive       0.75      0.78      0.76      9969

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



In [17]:
# Fraction of test samples on which the tuned and the default logistic regression agree.
lr_default_agreement = (lr_pred == lr_default_pred).mean()
print("lr  vs. lr_default: \t%.4f" % lr_default_agreement)

lr  vs. lr_default: 	0.9673


In [18]:
# Fraction of test samples on which the tuned and the default gradient boosting agree.
gb_default_agreement = (gb_pred == gb_default_pred).mean()
print("gb  vs. gb_default: \t%.4f" % gb_default_agreement)

gb  vs. gb_default: 	0.8228


In [None]:
# Persist the `combined` object to disk with joblib.
# NOTE(review): neither `joblib` nor `combined` is imported or defined in any cell
# visible in this notebook, so unless they come from elsewhere this cell raises
# NameError on a fresh kernel — TODO: import joblib and define `combined` first.
joblib.dump(combined, '../new_data/combinedModel.sav')