In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time

In [3]:
X = np.genfromtxt('data/X.csv', delimiter=',')
Y = np.genfromtxt('data/Y.csv', delimiter=',')

X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [43]:
# grid search results (hyper-values): 

lr = LogisticRegression(C=2, penalty='l1')

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100,), activation='identity', 
                            solver='sgd', alpha=0.01, learning_rate='adaptive')

gb = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.160222,
                                 n_estimators=1000,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt", subsample=0.637095)

In [5]:
print("lr training started")
lr.fit(Xtr,Ytr)
print("lr training finished")

print("mlp training started")
mlp.fit(Xtr,Ytr)
print("mlp training finished")

print("gb training started")
gb.fit(Xtr,Ytr)
print("gb training finished")

lr training started
lr training finished
mlp training started
mlp training finished
gb training started
gb training finished


In [16]:
y_true = Yva
lr_y_pred = lr.predict(Xva)

In [17]:
target_names = ['negative', 'positive']

In [23]:
print("Logistic Regression Classifier Report")
print(classification_report(Y_test, lr.predict(X_test), target_names=target_names))

Logistic Regression Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [25]:
#mlp_y_pred = mlp.predict(Xva)
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100,), activation='identity', 
                            solver='sgd', alpha=0.01, learning_rate='adaptive')

print("mlp training started")
mlp.fit(Xtr,Ytr)
print("mlp training finished")

print("Neural Network Classifier Report")
print(classification_report(Y_test, mlp.predict(X_test), target_names=target_names))

mlp training started
mlp training finished
Neural Network Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [44]:
#gb_y_true = gb.predict(Xva)
print("gb training started")
gb.fit(Xtr,Ytr)
print("gb training finished")

print("Gradient Boosting Classifier Report")
print(classification_report(Y_test, gb.predict(X_test), target_names=target_names))

gb training started
gb training finished
Gradient Boosting Classifier Report
              precision    recall  f1-score   support

    negative       0.69      0.52      0.59      8636
    positive       0.69      0.82      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [27]:
gb_test = GradientBoostingClassifier()

gb_test.fit(Xtr,Ytr)

print(classification_report(Y_test, gb_test.predict(X_test), target_names=target_names))

              precision    recall  f1-score   support

    negative       0.75      0.36      0.49      8636
    positive       0.65      0.91      0.76     11362

    accuracy                           0.67     19998
   macro avg       0.70      0.64      0.63     19998
weighted avg       0.70      0.67      0.64     19998



In [29]:
lr_test = LogisticRegression()

lr_test.fit(Xtr,Ytr)

print(classification_report(Y_test, lr_test.predict(X_test), target_names=target_names))

              precision    recall  f1-score   support

    negative       0.69      0.51      0.59      8636
    positive       0.69      0.83      0.75     11362

    accuracy                           0.69     19998
   macro avg       0.69      0.67      0.67     19998
weighted avg       0.69      0.69      0.68     19998



In [32]:
b = lr_test.predict(X_test)
a = Y_test[b == Y_test]

In [33]:
a.shape

(13824,)

In [34]:
Y_test.shape

(19998,)

In [35]:
a.shape[0]/19998

0.6912691269126913

In [36]:
c = b[b == 1.0]
c.shape

(13578,)

In [38]:
a[a==1.0].shape

(9383,)

In [39]:
a[a==0.0].shape

(4441,)

In [41]:
9383/13578

0.6910443364265724

In [42]:
4441/(13578)

0.32707320665782885

In [46]:
gb_pred = gb.predict(X_test)

lr.fit(Xtr,Ytr)

lr_pred = lr.predict(X_test)

gb_pred[gb_pred==lr_pred].shape

(19320,)

In [47]:
gb_pred.shape

(19998,)

In [48]:
gb_pred[gb_pred==Y_test].shape

(13832,)

In [49]:
lr_pred[lr_pred==Y_test].shape

(13828,)

In [51]:
(19998-19320)/19998

0.033903390339033904