In [15]:
# Importando bibliotecas necesarias
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
import time

In [17]:
# Cargando el dataset procesado
df = pd.read_csv('processed_dataset.csv')

In [18]:
# Separar características y objetivo
X = df.drop('Survived', axis=1)
y = df['Survived']

In [19]:
# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Definir modelos y sus hiperparámetros
models = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LGBM': lgb.LGBMClassifier()
}

param_grids = {
    'Naive Bayes': {},
    'LDA': {},
    'QDA': {},
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Decision Tree': {'max_depth': [None, 10, 20, 30]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'AdaBoost': {'n_estimators': [50, 100, 200]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'LGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}

results = []

In [21]:
# Entrenando y evaluando cada modelo con GridSearchCV
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    end_time = time.time()
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    training_time = end_time - start_time
    
    results.append({
        'Model': model_name,
        'Best Params': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Training Time': training_time
    })




[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 381
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376098 -> initscore=-0.506142
[LightGBM] [Info] Start training from score -0.506142
[LightGBM] [Info] Number of positive: 214, number of negative: 355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 381
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 13
[LightGBM] [Info] [binary:BoostF

In [22]:
# Convertiendo los resultados a DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
print(results_df)

                  Model                                   Best Params  \
6         Random Forest        {'max_depth': 20, 'n_estimators': 200}   
8     Gradient Boosting    {'learning_rate': 0.1, 'n_estimators': 50}   
9               XGBoost    {'learning_rate': 0.1, 'n_estimators': 50}   
4                   SVM                     {'C': 1, 'kernel': 'rbf'}   
3   Logistic Regression                                      {'C': 1}   
10                 LGBM  {'learning_rate': 0.01, 'n_estimators': 200}   
1                   LDA                                            {}   
7              AdaBoost                          {'n_estimators': 50}   
0           Naive Bayes                                            {}   
5         Decision Tree                             {'max_depth': 10}   
2                   QDA                                            {}   

    Accuracy  Precision    Recall  F1 Score  Training Time  
6   0.832402   0.832468  0.832402  0.830689      36.840348  
8

In [24]:
# Guardando resultados en Excel
results_df.to_excel('model_comparison.xlsx', index=False)