In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
import time
import scipy

In [2]:
X = load = scipy.sparse.load_npz('../new_data/X_sparse.npz')
Y = np.genfromtxt('../new_data/Y.csv', delimiter=',')[1:]

In [3]:
X.shape

(100000, 5000)

In [None]:
# X = X[1:]
# Y = Y[1:]

# X[:,:-1]

In [6]:
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [None]:
# Xtr, Xva = Xtr[:,:-1], Xva[:,:-1]

In [7]:
lr = LogisticRegression(C=0.1, penalty='l2')

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(200, 50, 50), activation='relu', 
                            solver='lbfgs', alpha=0.0001, learning_rate='constant')

gb = GradientBoostingClassifier(loss="deviance", learning_rate=0.2, 
                                n_estimators=1500, max_depth=3,
                                min_samples_split=6,min_samples_leaf=1,
                                max_features='sqrt', subsample=0.95)


### Combined model with same weights

In [8]:
combined = VotingClassifier(estimators=[('lr', lr), ('nn', mlp), ('gb', gb)],
                         voting='soft', flatten_transform=True) # weights=?

print("training started")
combined.fit(Xtr,Ytr)
print("training finished")

combined_roc = roc_auc_score(Yva, combined.predict_proba(Xva)[:,1])
print("validation roc:", combined_roc)

print("training error:", 1 - combined.score(Xtr, Ytr))
print("validation error:", 1 - combined.score(Xva, Yva))

training started




training finished
validation roc: 0.825329841930367
training error: 0.043187500000000045
validation error: 0.2630625


### Combined weights with different weights

In [13]:
# weights <= auc

combined = VotingClassifier(estimators=[('lr', lr), ('nn', mlp), ('gb', gb)],
                         voting='soft', flatten_transform=True, weights=[0.828,0.769,0.834]) # weights=?

print("training started")
combined.fit(Xtr,Ytr)
print("training finished")

combined_roc = roc_auc_score(Yva, combined.predict_proba(Xva)[:,1])
print("validation roc:", combined_roc)

print("training error:", 1 - combined.score(Xtr, Ytr))
print("validation error:", 1 - combined.score(Xva, Yva))

training started




training finished
validation roc: 0.828398406788387
training error: 0.22815624999999995
validation error: 0.2508125


In [11]:
w = np.array([0.828,0.769,0.834])
w = w/sum((0.828,0.769,0.834))

In [12]:
w

array([0.34060058, 0.31633073, 0.3430687 ])

In [15]:
# weights <= normalization form of auc

combined = VotingClassifier(estimators=[('lr', lr), ('nn', mlp), ('gb', gb)],
                         voting='soft', flatten_transform=True, weights=w)

print("training started")
combined.fit(Xtr,Ytr)
print("training finished")

combined_roc = roc_auc_score(Yva, combined.predict_proba(Xva)[:,1])
print("validation roc:", combined_roc)

print("training error:", 1 - combined.score(Xtr, Ytr))
print("validation error:", 1 - combined.score(Xva, Yva))

training started




training finished
validation roc: 0.8262917169782135
training error: 0.049046875000000045
validation error: 0.26068749999999996


In [16]:
combined_roc = roc_auc_score(Y_test, combined.predict_proba(X_test)[:,1])
print("validation roc:", combined_roc)

validation roc: 0.8303018142004345


In [17]:
# weights <= auc ranking

combined = VotingClassifier(estimators=[('lr', lr), ('nn', mlp), ('gb', gb)],
                         voting='soft', flatten_transform=True, weights=[2,1,3])

print("training started")
combined.fit(Xtr,Ytr)
print("training finished")

combined_roc = roc_auc_score(Yva, combined.predict_proba(Xva)[:,1])
print("validation roc:", combined_roc)

print("training error:", 1 - combined.score(Xtr, Ytr))
print("validation error:", 1 - combined.score(Xva, Yva))

combined_roc = roc_auc_score(Y_test, combined.predict_proba(X_test)[:,1])
print("validation roc of Test data:", combined_roc)

training started




training finished
validation roc: 0.8340148294773557
training error: 0.11556250000000001
validation error: 0.24175000000000002
validation roc of Test data: 0.8377279955660373


### Model with weights = [2,1,3]  performs the best

# Save predicted test data and the model

In [18]:
# original_indexes = []
# for l in X_test:
#     original_indexes.append(l[-1])

In [19]:
# Y_test = np.vstack((np.arange(X_test.shape[0]), original_indexes, combined.predict_proba(X_test[:,:-1])[:,1])).T
# np.savetxt('Y_test.txt',Y_test,'%d,%d,%.2f',header='Id,Original_Id,Predicted',comments='',delimiter=',')

In [20]:
# df_X_test = pd.DataFrame(X_test)
# df_X_test.to_csv('X_test.csv',index=False)

In [21]:
# df_X_test = pd.DataFrame(Y_test)
# df_X_test.to_csv('X_test.csv',index=False)

In [22]:
import joblib

In [23]:
joblib.dump(combined, '../new_data/combinedModel.sav')

['../new_data/combinedModel.sav']

In [27]:
# loaded_model = joblib.load('../new_data/combinedModel.sav')