In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
## models tried
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.linear_model import Perceptron

#### to be done

# known
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.bagging import BaggingClassifier

# unknown
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.neighbors.classification import RadiusNeighborsClassifier
#from sklearn.svm.classes import OneClassSVM
from sklearn.ensemble.forest import ExtraTreesClassifier
#from sklearn.neighbors import NearestCentroid
#from sklearn.svm import NuSVC

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
## mixture model ??
from sklearn.mixture import GaussianMixture



In [2]:
## Helper functions ##

def getData(needScaled):
    scaler = StandardScaler()
    
    
    
    if (needScaled):
        X_train = np.load('mmscaled_db_train.npy')
        X_test = np.load('mmscaled_db_test.npy')
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        X_train = X_train[:,:-1]
        X_test = X_test[:,:-1]
        return X_train, X_test
    else:
        X_train = np.load('mmscaled_db_train.npy')
        X_test = np.load('mmscaled_db_test.npy')
        X_train = X_train[:,:-1]
        X_test = X_test[:,:-1]

        return X_train, X_test

def printResults(clf_final, X_test, y_test):
    y_pred = clf_final.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [3]:
np_loaded = np.load('db.npy')
y = np_loaded[:, -1]
X = np_loaded[:, :-1]

In [4]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
X_train_unscaled,X_test_unscaled,y_train,y_test = [],[],[],[]
for train_index, test_index in sss.split(X, y):
    X_train_unscaled, X_test_unscaled = X[train_index], X[test_index]
    y_train, y_test= y[train_index], y[test_index]

In [2]:
# np_scaled_loaded = np.load('scaled_db.npy')
# y = np_scaled_loaded[:, -1]
# X = np_scaled_loaded[:, :-1]

In [5]:
## Scaling ##
scaler = MinMaxScaler()
scaler.fit(X_train_unscaled)
X_train_scaled  = scaler.transform(X_train_unscaled)
X_test_scaled = scaler.transform(X_test_unscaled)

scaled_data_train = np.c_[X_train_scaled, y_train]
np.save('mmscaled_db_train.npy', scaled_data_train)

scaled_data_test = np.c_[X_test_scaled, y_test]
np.save('mmscaled_db_test.npy', scaled_data_test)


unscaled_data_train = np.c_[X_train_unscaled, y_train]
np.save('unscaled_db_train.npy', unscaled_data_train)

unscaled_data_test = np.c_[X_test_unscaled, y_test]
np.save('unscaled_db_test.npy', unscaled_data_test)


In [11]:
# data = np.load('unscaled_db_test.npy')
# dic = {}
# for i in data:
#     yr = i[2]
#     if yr in dic:
#         dic[yr].append(i)
#     else:
#         dic[yr] = [i]
# for yr in dic:
#     s = str(int(yr))+"testData"
#     np.save(s, np.array(dic[yr]))


In [32]:
# Gradient boosting classifier
clf_final = GradientBoostingClassifier(n_estimators=100, learning_rate=0.9,random_state=0)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
'''
0.7753222836095764
[[2964  541]
 [ 801 1667]]
 
'''

1.0
[[3505    0]
 [   0 2468]]


In [38]:
## Decision Tree ##
print("")
print("Decision Tree")
print("")

needScaled = False
X_train, X_test = getData(needScaled)

clf = DecisionTreeClassifier(random_state=0)
depth = [i for i in range(5, 16)]
depth.append(None)
min_samples_split = [i for i in range(2, 40, 3)]
min_samples_leaf = [i for i in range(1,10)]
param_dist = {
    "criterion" : ["gini", "entropy"],
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "min_impurity_decrease" : [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1],
    "max_depth" : depth
}
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('DecisionTreePickle', 'wb') 
pickle.dump(clf_final, f)                      
dbfile.close()

print("")
printResults(clf_final, X_test, y_test)



Decision Tree

0.6360287962497907
DecisionTreeClassifier(random_state=0)

0.6360287962497907
[[2440 1065]
 [1109 1359]]


In [None]:
## Random Forest ##
print("")
print("Random Forest")
print("")
needScaled = False
X_train, X_test = getData(needScaled)

clf = RandomForestClassifier(random_state=0)
depth = [i for i in range(5, 16)]
depth.append(None)
n_estimators = [i for i in range(50, 200, 5)]
param_dist = {
    "criterion" : ["gini", "entropy"],
    "n_estimators": n_estimators,
    "bootstrap": [True, False],
    "oob_score": [True, False],
    "max_depth" : depth
}
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('RandomForestPickle', 'wb') 
pickle.dump(clf_final, f)                      
dbfile.close()

print("")
printResults(clf_final, X_test, y_test)

In [None]:
## Logistic Regression ##
print("")
print("Logistic Regression")
print("")
needScaled = True
X_train, X_test = getData(needScaled)

clf = LogisticRegression(random_state=0)
penalty = ['l1', 'l2', 'elasticnet']
max_iter = [100, 500, 1000, 5000, 10000]
C = [i/10 for i in range(1,20)]
param_dist = {
    "penalty" : penalty,
    "max_iter" : max_iter,
    "C":C,
    "fit_intercept":[True, False],
    
    
}
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('LogisticRegressionPickle', 'wb') 
pickle.dump(clf_final, f)                      
dbfile.close()

print("")
printResults(clf_final, X_test, y_test)

In [None]:
## SVM ##
print("")
print("SVM")
print("")
needScaled = True
X_train, X_test = getData(needScaled)

clf = SVC(random_state=0)
max_iter = [100, 500, 1000, 5000, 10000]

C = [i/10 for i in range(1,20)]
kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
gammas = ['scale', 'auto', 0.05, 0.1, 0.15, 0.2]
param_dist = {
    "kernel" : kernels,
    "gamma" : gammas,
    "C" : C,
    "shrinking" : [True, False],
    "max_iter" : max_iter
}
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('SVMPickle', 'wb') 
pickle.dump(clf_final, f)                      
dbfile.close()

print("")
printResults(clf_final, X_test, y_test)

In [39]:

## GNB ##
print("")
print("Gaussian Naive Bayes")
print("")
needScaled = False

X_train, X_test = getData(needScaled)

clf = GaussianNB()
clf.fit(X_train, y_train)
# X_test = scaler.transform(X_test)
print(clf.score(X_test, y_test))

f = open('GNBPickleScaled', 'wb') 
pickle.dump(clf, f)                      
f.close()




Gaussian Naive Bayes

0.6497572409174619


In [40]:
## GNB ##
print("")
print("Gaussian Naive Bayes")
print("")
needScaled = True

X_train, X_test = getData(needScaled)

clf = GaussianNB()
clf.fit(X_train, y_train)
# X_test = scaler.transform(X_test)
print(clf.score(X_test, y_test))


f = open('GNBPickleUnscaled', 'wb') 
pickle.dump(clf, f)                      
f.close()



Gaussian Naive Bayes

0.6497572409174619


In [44]:
## KNN ##
print("")
print("K Nearest Neighbors")
print("")
needScaled = True

X_train, X_test = getData(needScaled)
clf = KNeighborsClassifier()
n_neighbors = [2,4,6,8,10]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = [i for i in range(10, 51, 10)]

param_dist = {
    "n_neighbors" : n_neighbors,
    "algorithm" : algorithm,
    "weights" : weights,
    "leaf_size" : leaf_size
}
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('KNNPickle', 'wb') 
pickle.dump(clf_final, f)                      
f.close()

print("")
printResults(clf_final, X_test, y_test)


K Nearest Neighbors



KeyboardInterrupt: 

In [42]:
## Perceptron ##
print("")
print("Perceptron")
print("")
needScaled = False
X_train, X_test = getData(needScaled)
penalty = ['l2','l1','elasticnet']
alpha = [0.0001*(10**i) for i in range(0,5)]
max_iter = [10000]
param_dist = {
    "penalty" : penalty,
    "alpha" : alpha,
    "max_iter" : max_iter
}

clf = Perceptron()
grid = GridSearchCV(clf, param_grid=param_dist, n_jobs=-1, cv = 10)
grid.fit(X_train, y_train)
clf_final = grid.best_estimator_
print(clf_final)
f = open('PerceptronPickle', 'wb') 
pickle.dump(clf_final, f)                      
f.close()

print("")
printResults(clf_final, X_test, y_test)


Perceptron

Perceptron(max_iter=10000, penalty='l1')

0.6224677716390423
[[3485   20]
 [2235  233]]


In [14]:
l = {"KNNPickle":True, "PerceptronPickle":False, "GNBPickleScaled":True, "LogisticRegressionPickle":True, "RandomForestPickle":False, "DecisionTreePickle":False,"GNBPickleUnscaled":False}

for i in l:
    X_train, X_test = getData(l[i])
    f = open(i, 'rb')
    print(i)
    model = pickle.load(f)
    print(model)
    printResults(model, X_test, y_test)


KNNPickle
KNeighborsClassifier(leaf_size=10, n_neighbors=10)
0.6653273062112841
[[2800  705]
 [1294 1174]]
PerceptronPickle
Perceptron(max_iter=10000, penalty='l1')
0.6224677716390423
[[3485   20]
 [2235  233]]
GNBPickleScaled
GaussianNB()
0.6279926335174953
[[2591  914]
 [1308 1160]]
LogisticRegressionPickle
LogisticRegression(C=0.2, fit_intercept=False, random_state=0)
0.6854177130420225
[[2896  609]
 [1270 1198]]
RandomForestPickle
RandomForestClassifier(bootstrap=False, max_depth=13, n_estimators=150,
                       random_state=0)
0.7264356269881131
[[2996  509]
 [1125 1343]]
DecisionTreePickle
DecisionTreeClassifier(criterion='entropy', max_depth=10,
                       min_impurity_decrease=0, min_samples_leaf=6,
                       min_samples_split=38, random_state=0)
0.7001506780512305
[[2797  708]
 [1083 1385]]
GNBPickleUnscaled
GaussianNB()
0.6243093922651933
[[3125  380]
 [1864  604]]
