### PROYECTO DE CURSO – FASE 2
#### Desarrollo de modelo de clasificación
##### Monica Velasquez y Enrique Rodriguez

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from datetime import datetime
import time
import warnings

In [11]:
warnings.filterwarnings('ignore')

# Cargar los datos
data = pd.read_csv('bank-additional-full-transformed.csv')

In [12]:
# Separar características y variable objetivo
X = data.drop(columns=['y_no', 'y_yes'])
y = data['y_yes']  # 1 para 'yes', 0 para 'no'

In [13]:
# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Lista de modelos y sus hiperparámetros para GridSearchCV
models = [
    (GaussianNB(), {}),
    (LinearDiscriminantAnalysis(), {}),
    (LogisticRegression(), {'C': [0.01, 0.1, 1, 10, 100]}),
    (SVC(), {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}),
    (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30]}),
    (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
    (QuadraticDiscriminantAnalysis(), {}),
    (AdaBoostClassifier(), {'n_estimators': [50, 100, 200]}),
    (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200]}),
    (xgb.XGBClassifier(), {'n_estimators': [50, 100, 200]}),
]

results = []

In [15]:
# Entrenar y evaluar cada modelo con GridSearchCV
for index, (model, params) in enumerate(models):
    start_time = time.time()
    grid = GridSearchCV(model, param_grid=params, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)
    end_time = time.time()
    
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    end_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    results.append({
        'Index': index,
        'Model': type(best_model).__name__,
        'Best Hyperparameters': grid.best_params_,
        'Accuracy': accuracy,
        'Training Time (seconds)': end_time - start_time,
        'End Datetime': end_datetime
    })

In [16]:

# Crear un DataFrame con los resultados
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)

In [17]:
# Mostrar los resultados
print(results_df)

   Index                          Model       Best Hyperparameters  Accuracy  \
8      8     GradientBoostingClassifier      {'n_estimators': 100}  0.919762   
9      9                  XGBClassifier       {'n_estimators': 50}  0.915635   
5      5         RandomForestClassifier      {'n_estimators': 100}  0.911750   
2      2             LogisticRegression                  {'C': 10}  0.909808   
3      3                            SVC  {'C': 1, 'kernel': 'rbf'}  0.909444   
7      7             AdaBoostClassifier      {'n_estimators': 200}  0.909080   
1      1     LinearDiscriminantAnalysis                         {}  0.908594   
4      4         DecisionTreeClassifier          {'max_depth': 10}  0.905924   
0      0                     GaussianNB                         {}  0.717043   
6      6  QuadraticDiscriminantAnalysis                         {}  0.149308   

   Training Time (seconds)         End Datetime  
8                61.845779  2024-06-30 23:10:31  
9                 2