In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from helpers.modeling_helpers import save_model,model_training,save_training_time

# TRAINING MODELS

## LOADING DATA

In [None]:
def _read_split(csv_path):
    """Read one split CSV and return ``(frame, features, target)``.

    The file is parsed as semicolon-separated with its first column used as
    the index. ``features`` is the frame without the ``y`` column and
    ``target`` is the ``y`` column on its own.
    """
    frame = pd.read_csv(csv_path, sep=";", index_col=0)
    return frame, frame.drop(columns=["y"]), frame["y"]


# The three splits share one layout, so they all go through the same helper.
df_train, X_train, y_train = _read_split("data/split/train_data.csv")
df_test, X_test, y_test = _read_split("data/split/test_data.csv")
df_valid, X_valid, y_valid = _read_split("data/split/valid_data.csv")

## LOGISTIC REGRESSION

In [3]:
# Candidate values for LogisticRegression's ``C`` hyper-parameter.
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10]
param_grid = {"C": Cs}

# Search the grid on the training split, ranking candidates by accuracy
# (GridSearchCV's default cross-validation settings are used).
grid_lr = GridSearchCV(LogisticRegression(), param_grid, scoring="accuracy")
grid_lr.fit(X_train, y_train)

# Fresh, unfitted estimator configured with the winning value; the grid only
# contains ``C``, so best_params_ carries exactly that one key.
logistic_regression_model = LogisticRegression(**grid_lr.best_params_)

In [None]:
# Name under which both the model and its training time are stored.
lr_name = "logistic_regression_model"

# model_training returns a pair; presumably the fitted estimator and the
# elapsed training time -- confirm in helpers.modeling_helpers.
logistic_regression_model, time = model_training(logistic_regression_model, X_train, y_train)
save_model(logistic_regression_model, lr_name)
save_training_time(lr_name, str(time))


## KNN

In [None]:

# Choose n_neighbors by accuracy on the validation split.
ns = list(range(1, 50))
acc = []
for n in ns:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    # score() yields the accuracy on the given data directly, so there is no
    # need to build a full per-class classification report just to read its
    # "accuracy" entry on every iteration.
    acc.append(knn.score(X_valid, y_valid))

# list.index returns the first position of the maximum, so ties between
# several n values resolve to the smallest n.
best_n = ns[acc.index(max(acc))]
knn_model = KNeighborsClassifier(n_neighbors=best_n)

In [6]:

# Name under which both the model and its training time are stored.
knn_name = "knn_model"

# model_training returns a pair; presumably the fitted estimator and the
# elapsed training time -- confirm in helpers.modeling_helpers.
knn_model, time = model_training(knn_model, X_train, y_train)
save_model(knn_model, knn_name)
save_training_time(knn_name, str(time))

## NAIVE BAYES

In [None]:
# Name under which both the model and its training time are stored.
nb_name = "naive_bayes_model"

# GaussianNB is used with its default settings; no hyper-parameter search is
# run for this model. model_training returns a pair; presumably the fitted
# estimator and the elapsed training time -- confirm in helpers.modeling_helpers.
naive_bayes_model, time = model_training(GaussianNB(), X_train, y_train)
save_model(naive_bayes_model, nb_name)
save_training_time(nb_name, str(time))

## DECISION TREE

In [8]:
# Hyper-parameter grid for the decision tree: split criterion x maximum depth.
params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [10, 20, 30, 40, 50, 60],
}
decisionTree = DecisionTreeClassifier()

# Rank every combination by accuracy on the training split
# (GridSearchCV's default cross-validation settings are used).
grid_dt = GridSearchCV(decisionTree, params, scoring="accuracy")
grid_dt.fit(X_train, y_train)
print(grid_dt.best_params_)

# Fresh, unfitted tree configured with the winning combination; best_params_
# holds exactly the two keys searched above.
decision_tree_model = DecisionTreeClassifier(**grid_dt.best_params_)

{'criterion': 'log_loss', 'max_depth': 60}


In [None]:
# Name under which both the model and its training time are stored.
dt_name = "decision_tree_model"

# model_training returns a pair; presumably the fitted estimator and the
# elapsed training time -- confirm in helpers.modeling_helpers.
decision_tree_model, time = model_training(decision_tree_model, X_train, y_train)
save_model(decision_tree_model, dt_name)
save_training_time(dt_name, str(time))

## RANDOM FOREST

In [10]:
# Hyper-parameter grid for the random forest:
# number of trees x split criterion x maximum depth.
params = {
    "n_estimators": [1, 10, 50, 100],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [10, 20, 30, 40, 50, 60],
}
randomForest = RandomForestClassifier()

# Rank every combination by accuracy on the training split
# (GridSearchCV's default cross-validation settings are used).
grid_random_forest = GridSearchCV(randomForest, params, scoring="accuracy")
grid_random_forest.fit(X_train, y_train)
print(grid_random_forest.best_params_)

# Fresh, unfitted forest configured with the winning combination; best_params_
# holds exactly the three keys searched above.
random_forest_model = RandomForestClassifier(**grid_random_forest.best_params_)

{'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 100}


In [None]:
# Name under which both the model and its training time are stored.
rf_name = "random_forest_model"

# model_training returns a pair; presumably the fitted estimator and the
# elapsed training time -- confirm in helpers.modeling_helpers.
random_forest_model, time = model_training(random_forest_model, X_train, y_train)
save_model(random_forest_model, rf_name)
save_training_time(rf_name, str(time))

## SVM

In [12]:
# Hyper-parameter grid for the support vector classifier: C x kernel.
params = {
    "C": [0.01, 0.1, 1, 10],
    "kernel": ["poly", "rbf", "sigmoid", "linear"],
}
svc = SVC()

# Rank every combination by accuracy on the training split
# (GridSearchCV's default cross-validation settings are used).
grid_svc = GridSearchCV(svc, params, scoring="accuracy")
grid_svc.fit(X_train, y_train)
print(grid_svc.best_params_)

# Fresh, unfitted classifier configured with the winning combination;
# best_params_ holds exactly the two keys searched above.
svc_model = SVC(**grid_svc.best_params_)

{'C': 10, 'kernel': 'rbf'}


In [None]:
# Name under which both the model and its training time are stored.
svc_name = "svc_model"

# model_training returns a pair; presumably the fitted estimator and the
# elapsed training time -- confirm in helpers.modeling_helpers.
svc_model, time = model_training(svc_model, X_train, y_train)
save_model(svc_model, svc_name)
save_training_time(svc_name, str(time))