In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time

In [5]:
X = np.genfromtxt('../data/X.csv', delimiter=',')
Y = np.genfromtxt('../data/Y.csv', delimiter=',')

X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [6]:
# grid search results (hyper-values): 

lr = LogisticRegression(C=2, penalty='l1')

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100,), activation='identity', 
                            solver='sgd', alpha=0.01, learning_rate='adaptive')

gb = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.160222,
                                 n_estimators=1000,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt", subsample=0.637095)

In [7]:
print("lr training started")
lr.fit(Xtr,Ytr)
print("lr training finished")

print("mlp training started")
mlp.fit(Xtr,Ytr)
print("mlp training finished")

print("gb training started")
gb.fit(Xtr,Ytr)
print("gb training finished")

lr training started
lr training finished
mlp training started
mlp training finished
gb training started
gb training finished


## Classification Report of Different Classifiers

In [8]:
target_names = ['negative', 'positive']

In [12]:
lr_pred = lr.predict(X_test)

print("Logistic Regression Classifier Report")
print(classification_report(Y_test, lr_pred, target_names=target_names))

Logistic Regression Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [13]:
mlp_pred = mlp.predict(X_test)

print("Neural Network Classifier Report")
print(classification_report(Y_test, mlp_pred, target_names=target_names))

Neural Network Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [14]:
gb_pred = gb.predict(X_test)

print("Gradient Boosting Classifier Report")
print(classification_report(Y_test, gb_pred, target_names=target_names))

Gradient Boosting Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.52      0.59      8636
    positive       0.69      0.82      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



## Check the similarity of the results predicted by diffierent classifiers

In [15]:
lr_pred_1 = lr_pred[lr_pred==1]
lr_pred_0 = lr_pred[lr_pred==0]

mlp_pred_1 = mlp_pred[mlp_pred==1]
mlp_pred_0 = mlp_pred[mlp_pred==0]

gb_pred_1 = gb_pred[gb_pred==1]
gb_pred_0 = gb_pred[gb_pred==0]

In [20]:
# lr vs. mlp
print("Similarity: ")

print("\tlr  vs. mlp:\t%.4f" % ((lr_pred[lr_pred==mlp_pred].shape[0])/(lr_pred.shape[0])))
print("\tlr  vs. gb: \t%.4f" % ((lr_pred[lr_pred==gb_pred].shape[0])/(lr_pred.shape[0])))
print("\tmlp vs. gb: \t%.4f" % ((mlp_pred[mlp_pred==gb_pred].shape[0])/(mlp_pred.shape[0])))

Similarity: 
	lr  vs. mlp:	0.9887
	lr  vs. gb: 	0.9653
	mlp vs. gb: 	0.9617


In [24]:
lr_pred_1.shape

(13603,)

In [25]:
mlp_pred_1.shape

(13582,)

In [26]:
gb_pred_1.shape

(13459,)

## classifiers with default hyper parameters

In [27]:
gb_default = GradientBoostingClassifier()

gb_default.fit(Xtr,Ytr)

gb_default_pred = gb_default.predict(X_test)

print(classification_report(Y_test, gb_default_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.76      0.36      0.49      8636
    positive       0.65      0.92      0.76     11362

    accuracy                           0.67     19998
   macro avg       0.71      0.64      0.62     19998
weighted avg       0.70      0.67      0.64     19998



In [28]:
lr_default = LogisticRegression()

lr_default.fit(Xtr,Ytr)

lr_default_pred = lr_default.predict(X_test)

print(classification_report(Y_test, lr_default_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [32]:
print("lr  vs. lr_default: \t%.4f" % ((lr_pred[lr_pred==lr_default_pred].shape[0])/(lr_pred.shape[0])))

lr  vs. lr_default: 	0.9993


In [33]:
print("gb  vs. gb_default: \t%.4f" % ((gb_pred[gb_pred==gb_default_pred].shape[0])/(gb_pred.shape[0])))

gb  vs. gb_default: 	0.8570
