In [1]:
import pandas as pd
import numpy as np
# import pickle
import joblib
from pickle import dump

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

In [2]:
df = pd.read_csv('./Data/SumData.csv')

In [3]:
df

Unnamed: 0,gravity1,gravity2,gravity3,gravity4,gravity5,gravity6,gravity7,gravity8,gravity9,gravity10,...,comFY22,comFY23,comFY24,comFY25,comFY26,comFY27,comFY28,comFY29,comFY30,y_value
0,85.855,82.771,81.049,79.837,79.404,79.284,80.436,80.833,81.713,80.567,...,-5.509,-5.443,-5.348,-5.340,-5.422,-5.504,-5.462,-5.421,-5.489,0
1,82.771,81.049,79.837,79.404,79.284,80.436,80.833,81.713,80.567,81.566,...,-5.443,-5.348,-5.340,-5.422,-5.504,-5.462,-5.421,-5.489,-5.436,0
2,81.049,79.837,79.404,79.284,80.436,80.833,81.713,80.567,81.566,81.695,...,-5.348,-5.340,-5.422,-5.504,-5.462,-5.421,-5.489,-5.436,-5.377,0
3,79.837,79.404,79.284,80.436,80.833,81.713,80.567,81.566,81.695,80.505,...,-5.340,-5.422,-5.504,-5.462,-5.421,-5.489,-5.436,-5.377,-5.385,0
4,79.404,79.284,80.436,80.833,81.713,80.567,81.566,81.695,80.505,80.660,...,-5.422,-5.504,-5.462,-5.421,-5.489,-5.436,-5.377,-5.385,-5.469,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109296,94.000,89.000,86.000,87.000,87.000,85.000,81.000,79.000,77.000,76.000,...,-9.295,-9.201,-9.096,-8.981,-8.856,-8.759,-8.689,-8.616,-8.547,0
109297,89.000,86.000,87.000,87.000,85.000,81.000,79.000,77.000,76.000,76.000,...,-9.201,-9.096,-8.981,-8.856,-8.759,-8.689,-8.616,-8.547,-8.499,0
109298,86.000,87.000,87.000,85.000,81.000,79.000,77.000,76.000,76.000,75.000,...,-9.096,-8.981,-8.856,-8.759,-8.689,-8.616,-8.547,-8.499,-8.467,0
109299,87.000,87.000,85.000,81.000,79.000,77.000,76.000,76.000,75.000,75.000,...,-8.981,-8.856,-8.759,-8.689,-8.616,-8.547,-8.499,-8.467,-8.433,0


In [4]:
y = df['y_value']
X = df.loc[:, df.columns != 'y_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
cols = X_train.columns
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train.values)
# new data frame with the new scaled data. 
X_train = pd.DataFrame(X_train, columns = cols)
X_test = min_max_scaler.transform(X_test.values)
X_test = pd.DataFrame(X_test, columns = cols)


In [6]:
X_train

Unnamed: 0,gravity1,gravity2,gravity3,gravity4,gravity5,gravity6,gravity7,gravity8,gravity9,gravity10,...,comFY21,comFY22,comFY23,comFY24,comFY25,comFY26,comFY27,comFY28,comFY29,comFY30
0,0.152364,0.155272,0.156301,0.156590,0.161556,0.157996,0.162733,0.161520,0.164409,0.166092,...,0.517832,0.518461,0.518658,0.518596,0.518790,0.518990,0.519651,0.520653,0.521775,0.522990
1,0.145347,0.146976,0.145735,0.148956,0.152314,0.145349,0.139380,0.125697,0.109885,0.095562,...,0.461850,0.461039,0.460612,0.460422,0.459870,0.459212,0.458590,0.457813,0.456949,0.456051
2,0.117342,0.089853,0.081883,0.073200,0.071725,0.063337,0.060100,0.062265,0.071948,0.081893,...,0.457536,0.456802,0.456310,0.455321,0.454169,0.453176,0.452401,0.452952,0.453963,0.455540
3,0.399726,0.193975,0.285645,0.108206,0.021565,0.053313,0.121578,0.190932,0.182512,0.151320,...,0.211695,0.207757,0.204554,0.201680,0.197995,0.194151,0.190330,0.186375,0.183002,0.179839
4,0.129360,0.122829,0.114486,0.112969,0.118934,0.122397,0.134811,0.138042,0.139458,0.134003,...,0.501740,0.500323,0.497975,0.493988,0.489133,0.484049,0.478985,0.473828,0.469179,0.465008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81970,0.138993,0.139656,0.140856,0.137794,0.142393,0.139656,0.138980,0.136381,0.137566,0.135722,...,0.487810,0.487722,0.487737,0.487728,0.487728,0.487734,0.487837,0.487822,0.487822,0.487837
81971,0.137129,0.135932,0.137129,0.137794,0.142393,0.137794,0.138980,0.136381,0.135722,0.137566,...,0.514388,0.514394,0.514320,0.514305,0.514305,0.514305,0.514315,0.514305,0.514299,0.514320
81972,0.146447,0.147105,0.142720,0.156415,0.152014,0.143381,0.150249,0.151125,0.154166,0.156010,...,0.350434,0.353126,0.356134,0.358992,0.361731,0.364340,0.366984,0.369835,0.372645,0.375489
81973,0.162505,0.166273,0.168364,0.169712,0.175449,0.171332,0.176621,0.180054,0.187321,0.195233,...,0.493952,0.491172,0.488654,0.487340,0.486482,0.484819,0.483005,0.481927,0.481957,0.483646


In [7]:
def model_assess(model,title = "Default"):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    #print(confusion_matrix(y_test, preds))
    print('Accuracy', title, ':', round(accuracy_score(y_test, preds), 5), '\n')

In [8]:
nb = GaussianNB()
model_assess(nb, "Naive Bayes")

# Stochastic Gradient Descent
sgd = SGDClassifier(max_iter=5000, random_state=0)
model_assess(sgd, "Stochastic Gradient Descent")

# KNN
# knn = KNeighborsClassifier(n_neighbors=19)
# model_assess(knn, "KNN")

# Decission trees
tree = DecisionTreeClassifier()
model_assess(tree, "Decission trees")

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
model_assess(rforest, "Random Forest")

# Support Vector Machine
svm = SVC(decision_function_shape="ovo")
model_assess(svm, "Support Vector Machine")

# Logistic Regression
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model_assess(lg, "Logistic Regression")

# Neural Nets
# nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5000, 10), random_state=1)
# model_assess(nn, "Neural Nets")

# Cross Gradient Booster
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
model_assess(xgb, "Cross Gradient Booster")

# Cross Gradient Booster (Random Forest)
# xgbrf = XGBRFClassifier(objective= 'multi:softmax')
# model_assess(xgbrf, "Cross Gradient Booster (Random Forest)")

Accuracy Naive Bayes : 0.72458 

Accuracy Stochastic Gradient Descent : 0.96333 

Accuracy Decission trees : 0.98112 

Accuracy Random Forest : 0.98218 

Accuracy Support Vector Machine : 0.96992 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Logistic Regression : 0.96553 

Accuracy Cross Gradient Booster : 0.99436 



In [5]:
dump(min_max_scaler, open('scaler_ver2.pkl', 'wb'))
joblib.dump(xgb, './xgb_model_ver2.pkl')