In [20]:
import pandas as pd
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

Cargamos el archivo y comprobamos que los datos efectivamente están limpios

In [21]:
df = pd.read_csv('Train_recoded.csv')
df.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
VIP             0
Name            0
Transported     0
Algun_Gasto     0
CabinDeck       0
CabinSide       0
Group           0
GroupNumber     0
Last Name       0
Solo            0
Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Gastos          0
CabinNumber     0
GroupSize       0
AgeGroup        0
dtype: int64

Recoficamos las variables categóricas en numéricas asignandoles valores numéricos en orden (0, 1, 2...)

In [22]:
df.loc[:, 'HomePlanet'] = df['HomePlanet'].map({'Europa': 0, 'Earth': 1, 'Mars': 2}).astype(int)
df.loc[:, 'CabinDeck'] = df['CabinDeck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}).astype(int)
df.loc[:, 'CabinSide'] = df['CabinSide'].map({'P': 0, 'S': 1})
df.loc[:, 'CryoSleep'] = df['CryoSleep'].astype(int)
df.loc[:, 'Destination'] = df['Destination'].map({'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}).astype(int)

  df.loc[:, 'CryoSleep'] = df['CryoSleep'].astype(int)


Separamos los conjuntos de entrenamiento y test, se eliminan además aquellas columnas que no vayan a usarse para el entrenamiento.

In [23]:
columnas_no_usadas = ["PassengerId", "Name", "Last Name", "Cabin"]

df = df.drop(columns = columnas_no_usadas)

features_eliminadas = ['Solo']

y = df['Transported']
X = df.drop(['Transported'] + features_eliminadas, axis = 1).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42)

eval_set = [(X_train, y_train), (X_test, y_test)]

Hiperparametrización de los modelos de ensemble

In [25]:
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 300, 1),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'reg_alpha': hp.quniform('reg_alpha', 0, 10, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),  # Nuevo parámetro
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),  # Nuevo parámetro
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10)  # Nuevo parámetro
}

def objective(space):
    clf = XGBClassifier(
                n_estimators=int(space['n_estimators']),
                max_depth=int(space['max_depth']),
                gamma=space['gamma'],
                reg_alpha=int(space['reg_alpha']),
                reg_lambda=space['reg_lambda'],
                min_child_weight=int(space['min_child_weight']),
                colsample_bytree=space['colsample_bytree'],
                subsample=space['subsample'],
                learning_rate=space['learning_rate'],
                scale_pos_weight=space['scale_pos_weight'],
                eval_metric='logloss'
            )
    
    evaluation = [(X_train, y_train), (X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation,
            verbose=False)
    
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred > 0.5)
    
    return {'loss': -accuracy, 'status': STATUS_OK}

trials = Trials()

best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,  # Número de evaluaciones que deseas hacer
                        trials=trials)

best_params = {
    'n_estimators': int(best_hyperparams['n_estimators']),
    'max_depth': int(best_hyperparams['max_depth']),
    'gamma': best_hyperparams['gamma'],
    'reg_alpha': int(best_hyperparams['reg_alpha']),
    'reg_lambda': best_hyperparams['reg_lambda'],
    'min_child_weight': int(best_hyperparams['min_child_weight']),
    'colsample_bytree': best_hyperparams['colsample_bytree'],
    'subsample': best_hyperparams['subsample'],
    'learning_rate': best_hyperparams['learning_rate'],
    'scale_pos_weight': best_hyperparams['scale_pos_weight'],
    'eval_metric': 'logloss'
}

clf = XGBClassifier(**best_params)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100, log=True),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'random_seed': 42,
        'verbose': 0
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=0)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

# Ejecutar la optimización
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Obtener los mejores hiperparámetros
best_params = study.best_params
print('Mejores hiperparámetros:', best_params)

cat_best = CatBoostClassifier(**best_params)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [300, 500,700,900,1100],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'num_leaves': [20, 30, 40],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Create the LGBMClassifier
lgbm = LGBMClassifier()

# Perform random search with cross-validation
random_search = RandomizedSearchCV(estimator=lgbm, param_distributions=param_grid, n_iter=10, cv=5)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# Get the best hyperparameters and the best score
best_params = random_search.best_params_
best_score = random_search.best_score_

# Create a new LGBMClassifier with the best hyperparameters
best_lgbm = LGBMClassifier(**best_params)

# Train the model using the best hyperparameters
best_lgbm.fit(X_train, y_train)

100%|██████████| 100/100 [00:37<00:00,  2.66trial/s, best loss: -0.8013327780091628]

[I 2024-09-25 04:58:59,913] A new study created in memory with name: no-name-04223b6b-d140-4f2f-9e5e-eef25b121cc5





[I 2024-09-25 04:59:01,013] Trial 0 finished with value: 0.7488546438983756 and parameters: {'iterations': 390, 'learning_rate': 0.048883447727314226, 'depth': 4, 'l2_leaf_reg': 2.9376695686326375e-05, 'border_count': 1}. Best is trial 0 with value: 0.7488546438983756.
[I 2024-09-25 04:59:03,192] Trial 1 finished with value: 0.8092461474385673 and parameters: {'iterations': 520, 'learning_rate': 0.048710327688348466, 'depth': 6, 'l2_leaf_reg': 23.267462282614357, 'border_count': 194}. Best is trial 1 with value: 0.8092461474385673.
[I 2024-09-25 04:59:11,876] Trial 2 finished with value: 0.7975843398583924 and parameters: {'iterations': 508, 'learning_rate': 0.1530211617265362, 'depth': 10, 'l2_leaf_reg': 0.09205723128350364, 'border_count': 189}. Best is trial 1 with value: 0.8092461474385673.
[I 2024-09-25 04:59:14,944] Trial 3 finished with value: 0.7996668054977093 and parameters: {'iterations': 774, 'learning_rate': 0.20965624894347554, 'depth': 7, 'l2_leaf_reg': 0.001853247704535

Mejores hiperparámetros: {'iterations': 262, 'learning_rate': 0.13791549294044655, 'depth': 10, 'l2_leaf_reg': 10.93192466697037, 'border_count': 241}
[LightGBM] [Info] Number of positive: 2235, number of negative: 2245
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 4480, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498884 -> initscore=-0.004464
[LightGBM] [Info] Start training from score -0.004464
[LightGBM] [Info] Number of positive: 2235, number of negative: 2245
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2158
[LightGBM] [Info] Number of d

Generamos el clasificador por votos

In [26]:
voting = VotingClassifier(estimators=[('lgbm',best_lgbm),('cat', cat_best), ('xg', clf)],voting='soft')
voting.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 2793, number of negative: 2807
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2160
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498750 -> initscore=-0.005000
[LightGBM] [Info] Start training from score -0.005000
0:	learn: 0.6228664	total: 25ms	remaining: 6.52s
1:	learn: 0.5700119	total: 51.7ms	remaining: 6.71s
2:	learn: 0.5301906	total: 77.5ms	remaining: 6.69s
3:	learn: 0.5021677	total: 99.1ms	remaining: 6.39s
4:	learn: 0.4767175	total: 121ms	remaining: 6.2s
5:	learn: 0.4632612	total: 142ms	remaining: 6.07s
6:	learn: 0.4512435	total: 161ms	remaining: 5.87s
7:	learn: 0.4413755	total: 181ms	remaining: 5.73s
8:	learn: 0.4307094	total: 200ms	remaining: 

In [27]:
scores = []
y_pred2 = voting.predict(X_test)
acc_final = accuracy_score(y_test,y_pred2)
scores.append(['Final soft voting with xg', acc_final])
print("Accuracy",acc_final)

Accuracy 0.8059142024156601


In [28]:
df_t = pd.read_csv('Test_recoded.csv')

df_test = df_t.drop(columns = ['Transported'] + columnas_no_usadas + features_eliminadas)

df_test.loc[:, 'HomePlanet'] = df_test['HomePlanet'].map({'Europa': 0, 'Earth': 1, 'Mars': 2}).astype(int)
df_test.loc[:, 'CabinDeck'] = df_test['CabinDeck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}).astype(int)
df_test.loc[:, 'CabinSide'] = df_test['CabinSide'].map({'P': 0, 'S': 1}).astype(int)
df_test.loc[:, 'CryoSleep'] = df_test['CryoSleep'].astype(int)
df_test.loc[:, 'Destination'] = df_test['Destination'].map({'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}).astype(int)

df_test = df_test.astype(int)

prediccion = voting.predict(df_test)

df_predict = pd.DataFrame({
    'PassengerId': df_t['PassengerId'],
    'Transported': prediccion.astype(bool)
})

df_predict.to_csv('prediccion_stack_nosolo.csv', index = False)

  df_test.loc[:, 'CryoSleep'] = df_test['CryoSleep'].astype(int)
