In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import pickle
import yaml
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

# SUPERVISED CLASSIFICATION MODELS

In [3]:
df_train = pd.read_csv("..\\data\\datasets\\train.csv", sep=";")
df_test = pd.read_csv("..\\data\\datasets\\test.csv", sep=";")

In [4]:
df_train.drop(columns="Unnamed: 0", inplace=True)
df_test.drop(columns="Unnamed: 0", inplace=True)

In [5]:
X_train = df_train.drop(columns=["name_img","name_img_encode"])
y_train = df_train["name_img_encode"]

In [6]:
# LogisticRegression

reg_log = Pipeline(steps = [
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression())
])
reg_log_param = {
    "reglog__penalty": ['l1', 'l2'],
    "reglog__C": np.logspace(0, 4, 10)
}

# Random forest 
rand_forest =  RandomForestClassifier()
rand_forest_param = {
    "n_estimators": [10, 100, 1000],
    "max_features": [3,4,5]
}

# SVC
svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svm", SVC())
])
svm_param = {
    'selectkbest__k': [4, 8],
    'svm__kernel': ['linear', 'rbf'],
    'svm__C': [0.01, 0.1, 0.5],
    'svm__degree': [1,2,3],
    'svm__gamma': ['scale', 'auto']
}

# KNM
knc =  KNeighborsClassifier()
knc_param = {
    'n_neighbors' : [5,6,7,8]
}

# Decision Tree
dtree_clf = Pipeline(steps=( 
    ("scaler", StandardScaler()),
    ("dtree", DecisionTreeClassifier())
))

dtree_param = {
    "dtree__max_depth": [50,100],
    "dtree__max_leaf_nodes": [20,30],
}

gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv = 3,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                         rand_forest_param,
                         cv = 3,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs=-1)

gs_svm = GridSearchCV(svm,
                         svm_param,
                         cv = 3,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs=-1)

gs_knc= GridSearchCV(knc,
                         knc_param,
                         cv = 3,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs=-1)

gs_dtree= GridSearchCV(dtree_clf,
                         dtree_param,
                         cv = 3,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs=-1)

grids = {"gs_reg_log": gs_reg_log,
        "gs_rand_forest": gs_rand_forest,
        "gs_svm": gs_svm,
        "gs_knc": gs_knc,
        "gs_dtree": gs_dtree
        }

In [9]:

for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [10]:
X_test = df_test.drop(columns=["name_img","name_img_encode"])
y_test = df_test["name_img_encode"]

In [11]:
l_err = {"name": [],"accuracy": []}
for nombre, grid_search in grids.items():
    y_pred = grid_search.best_estimator_.predict(X_test)
    l_err["name"].append(nombre)
    l_err["accuracy"].append(accuracy_score(y_test, y_pred))


In [12]:
model_data = pd.DataFrame(l_err, columns=l_err.keys())
model_data.to_csv("..\\data\\datasets\\sm_acc.csv")

In [13]:
model_data

Unnamed: 0,name,accuracy
0,gs_reg_log,0.511811
1,gs_rand_forest,0.639227
2,gs_svm,0.310666
3,gs_knc,0.364352
4,gs_dtree,0.360057


In [14]:
for nombre, grid_search in grids.items():
    best_estimator = grid_search.best_estimator_
    best_params = grid_search.best_params_

    with open(f'..\\models\\supervissed\\{nombre}_model.pkl', 'wb') as archivo_salida:
       pickle.dump(best_estimator, archivo_salida)

    with open(f'..\\models\\supervissed\\{nombre}_model.yml', 'w') as archivo_yaml:
        yaml.dump(best_params, archivo_yaml, default_flow_style=False)