# Training Pipeline

In [1]:
import os

import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score,accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# Project root against which the relative paths used by this notebook
# (e.g. "./data/df_trans.csv") are resolved.
# Set the TANGELO_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's local path is used.
PROJECT_DIR = os.environ.get("TANGELO_DIR", "C:/Users/diego/Desktop/tangelo/")
os.chdir(PROJECT_DIR)

In [2]:
# Load the transformed dataset, give the target column a short name and
# discard the index column that was written out with the CSV.
df = (
    pd.read_csv("./data/df_trans.csv")
    .rename(columns={'remainder__Target': 'target'})
    .drop(columns="Unnamed: 0")
)
df.head()

Unnamed: 0,cat__Gender_M,cat__Car_Y,cat__Realty_Y,cat__Work_phone_0,cat__Phone_0,cat__Email_1,cat__income_type_Commercial associate,cat__income_type_Pensioner,cat__income_type_State servant,cat__income_type_Student,...,cat__Occupation_type_Secretaries,cat__Occupation_type_Security staff,cat__Occupation_type_Waiters/barmen staff,num__Count_family_members,num__Years_Employed,num__Age,num__children_count,num__income_amount,education__education_type,target
0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.21768,-0.437004,-0.935614,-0.579661,2.365845,3.0,0
1,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.21768,-0.437004,-0.935614,-0.579661,2.365845,3.0,0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.21768,-0.461209,1.321517,-0.579661,-0.728827,2.0,0
3,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.314564,-0.447762,0.713828,-0.579661,0.818509,2.0,0
4,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.314564,-0.447762,0.713828,-0.579661,0.818509,2.0,0


# Benchmark

### We are going to test 5 different models in order to identify the best approach for our problem.
- Applying oversampling (SMOTE) to the training data only
- The objective metric is the macro-averaged F1-score, chosen because our data is imbalanced.
- Using grid search with 5-fold cross-validation
- After that, we are going to try to optimize the best model's hyperparameters

In [5]:
# Features X and target y
X, y = df.drop("target",axis=1),df["target"]
y = y.astype('int')

# One seed shared by the split, SMOTE and the stochastic models so that a
# "Restart & Run All" reproduces the same benchmark numbers.
SEED = 10086

# Stratified train/test split. The un-resampled training part keeps its own
# name because the cross-validation below must resample inside each fold.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y,
                                                            stratify=y, test_size=0.3,
                                                            random_state=SEED)

# SMOTE to address the imbalance problem, but ONLY in the training data.
# The test data remains with its original distribution.
# NOTE(review): this resampled X_train / y_train is no longer fed to the grid
# search; it is still produced in case later cells rely on it - confirm.
oversample = SMOTE(random_state=SEED)
X_train, y_train = oversample.fit_resample(X_train_raw, y_train_raw)
X_train = pd.DataFrame(X_train, columns = X.columns)

# Define the models to be evaluated (seeded where the estimator is stochastic)
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=SEED)),
    ('Random Forest', RandomForestClassifier(random_state=SEED)),
    ('SVM', SVC()),
    ('LightGBM', LGBMClassifier(random_state=SEED))
]

# Parameter grid for each model; keys are prefixed with the pipeline step
# name ("model") so GridSearchCV can route them to the estimator.
param_grids = {
    'Logistic Regression': {'model__C': [0.1, 1, 10]},
    'Decision Tree': {'model__max_depth': [None, 10, 20]},
    'Random Forest': {'model__n_estimators': [50, 100, 200]},
    'SVM': {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']},
    'LightGBM': {'model__n_estimators': [50, 100, 200]}
}

# Define macro-averaged F1-Score as the scoring metric
scorer = make_scorer(f1_score, average='macro')

# Best fitted pipeline and its cross-validated score, keyed by model name
best_models = {}

# Iterate over the models and perform GridSearchCV.
# SMOTE is a step of an imbalanced-learn pipeline, so it is fitted on the
# training part of every fold only and the held-out part is scored on real,
# un-resampled rows. Oversampling before the CV split would leak synthetic
# neighbours of validation rows into training and inflate the F1 score.
for model_name, model in tqdm(models,desc ="Progress:"):
    param_grid = param_grids[model_name]
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=SEED)),
        ('model', model)
    ])
    grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5)
    grid_search.fit(X_train_raw, y_train_raw)
    best_models[model_name] = {
        'best_estimator': grid_search.best_estimator_,
        'best_score': grid_search.best_score_
    }

# Report, for every benchmarked model, its best cross-validated score and
# the fitted estimator that achieved it.
for model_name, info in best_models.items():
    cv_score, fitted_pipeline = info["best_score"], info["best_estimator"]
    print(f'{model_name}:')
    print(f'Best Score (F1-Score): {cv_score:.4f}')
    print(f'Best Estimator: {fitted_pipeline}')

# Training and Optimizing Hyperparameters

The model with the best performance was **[TODO: fill in the model name]**. We now proceed to train it and tune its hyperparameters in more detail on our data.