# 3ro Entregable del Reto
## Evaluación y Refinamiento de modelo

#### Carga de librerías

In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

#### Importación de base de datos limpia 
Separación de features de label, estandarización de datos y división de datos entre Train set y Test set para su entrenamiento.

In [64]:
df= pd.read_csv('Titanic_train_test.csv')

Separación de datos entre Train(Entrenamiento) y Test(Prueba)

In [65]:
df=df.drop(columns='Unnamed: 0')

In [66]:
dftrain=df.iloc[:891]
dftest=df.iloc[891:]

In [67]:
dftest=dftest.drop(columns=['Survived'])

Separación de features y target

In [68]:
y=dftrain['Survived']
X=dftrain.drop(columns='Survived')

Escalar los features

In [69]:
scaler=StandardScaler()
Xf=scaler.fit_transform(X)

Separación de los datos en train y test 

In [70]:
X_train, X_test, y_train, y_test = train_test_split(Xf, y, test_size=0.2, random_state=42)

### Grid Search
Creación de Grid search para encontrar los mejores parámetros para el algoritmo Random 

In [80]:
def gridSearch(clf_model, param_grid, name):

    # Define the feature selector
    rfecv_model = RFECV(estimator=clf_model, step=1, cv=StratifiedKFold(5), scoring='roc_auc')

    # Creación de pipeline con el modelo de Random Forest y el selector de features
    pipeline = Pipeline([
        ('feature_selection', rfecv_model),
        ('classification', clf_model)
        ])
    
    # Define the GridSearchCV
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=StratifiedKFold(10), scoring='roc_auc_ovr', n_jobs=-1)

    # Hacerle fit model
    grid_search.fit(X_train, y_train)

    # Get the best parameters and score
    print("Best parameters found for ",name,":", grid_search.best_params_)
    print("Best cross-validation score for ",name,":", grid_search.best_score_)
    

#### Random Forest

In [76]:
# Define the classifier
clf_rf = RandomForestClassifier(random_state=42, class_weight="balanced")


param_grid_rf = {
    'classification__n_estimators': [ 5, 10, 50, 200, 250],
    'classification__max_features': ['sqrt','log2'],
    'classification__max_depth': [4, 6, 8, 10, 12],
    'classification__criterion':['gini','log_loss','entropy'],
    'classification__max_leaf_nodes':[2, 5, 10]
}

gridSearch(clf_rf, param_grid_rf, name = "Random Forest" )

KeyboardInterrupt: 

##### Manual Tunning

In [None]:
rbd = RandomForestClassifier(
    criterion ='gini',
    max_depth = 6,
    max_features = 'sqrt',
    max_leaf_nodes = 20,
    class_weight='balanced',
    n_estimators = 200
)

rbd.fit(X_train, y_train)

y_predRF = rbd.predict(X_test)
accuracy = accuracy_score(y_test, y_predRF)
f1 = f1_score(y_test, y_predRF)
roc_auc = roc_auc_score(y_test, y_predRF)
conf_matrix = confusion_matrix(y_test, y_predRF)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)

#### XG Boost

In [79]:
# Define the classifier
clf_xgb = xgb.XGBClassifier(random_state=42, objective='binary:logistic')


param_grid_xgb = {
    'classification__n_estimators': [200,600],
    'classification__max_depth': [4,6,8,10,12],
    'classification__learning_rate': [0.001, 0.01, 0.1]}

gridSearch(clf_xgb, param_grid_xgb, name = "XGBoost" )

Best parameters found for {name}:  {'classification__learning_rate': 0.001, 'classification__max_depth': 8, 'classification__n_estimators': 200}
Best cross-validation score for {name}:  0.8379390773835217


##### Manual Tunning 

In [92]:
rbd = RandomForestClassifier(
    criterion ='gini',
    max_depth = 6,
    max_features = 'sqrt',
    max_leaf_nodes = 20,
    class_weight='balanced',
    n_estimators = 200
)

rbd.fit(X_train, y_train)

y_predXGB = rbd.predict(X_test)
accuracy = accuracy_score(y_test, y_predXGB)
f1 = f1_score(y_test, y_predXGB)
roc_auc = roc_auc_score(y_test, y_predXGB)
conf_matrix = confusion_matrix(y_test, y_predXGB)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)


Accuracy: 0.8100558659217877
F1 Score: 0.7671232876712328
ROC-AUC: 0.8021879021879021
Confusion Matrix:
[[89 16]
 [18 56]]


#### Logistic Regression

In [89]:
# Define the classifier
clf_rl = LogisticRegression(random_state=42, class_weight="balanced", solver="liblinear")


param_grid_rl = {
    'classification__max_iter': [200, 250, 300, 500],
    'classification__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'classification__solver': ['liblinear', 'saga'],  # Match solvers to penalties
    'classification__intercept_scaling': [1, 2, 5, 10],
    'classification__fit_intercept': [True, False]
    }

gridSearch(clf_rl, param_grid_rl, name = "Logistic Regression" )

1280 fits failed out of a total of 2560.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\A01280544\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\A01280544\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\A01280544\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fi

Best parameters found for  Logistic Regression : {'classification__fit_intercept': False, 'classification__intercept_scaling': 1, 'classification__max_iter': 200, 'classification__penalty': 'l2', 'classification__solver': 'liblinear'}
Best cross-validation score for  Logistic Regression : 0.8354256957034736


##### Manual Tunning

In [91]:
lgr = LogisticRegression(
    random_state=42, 
    class_weight='balanced', 
    solver='liblinear', 
    max_iter=500
)

lgr.fit(X_train, y_train)

y_predLR = lgr.predict(X_test)
accuracy = accuracy_score(y_test, y_predLR)
f1 = f1_score(y_test, y_predLR)
roc_auc = roc_auc_score(y_test, y_predLR)
conf_matrix = confusion_matrix(y_test, y_predLR)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.8156424581005587
F1 Score: 0.7898089171974523
ROC-AUC: 0.818918918918919
Confusion Matrix:
[[84 21]
 [12 62]]


### Randomized Search

Random Forest

In [97]:
clf_rf = RandomForestClassifier(random_state=42, class_weight="balanced")

rfc_search_space = {
    'n_estimators': range(200, 1600),
    'criterion':['gini','log_loss','entropy'],
    'max_depth': range(2, 51),
    'min_samples_split': range(2, 11),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=clf_rf, param_distributions=rfc_search_space, n_iter=100, cv=5)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
rfc = RandomForestClassifier(**best_params)
print(best_params)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


KeyboardInterrupt: 

XG Boost

In [110]:
clf_xgb = xgb.XGBClassifier(random_state=42, class_weight="balanced")

xgb_search_space = {
    'n_estimators': range(200, 1600),
    'max_depth': range(2, 51),
    'min_samples_split': range(2, 11),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=clf_xgb, param_distributions=xgb_search_space, n_iter=100, cv=5)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
xgbc = xgb.XGBClassifier(**best_params)
print(best_params)

xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight", "max_features", "min_samples_split" } are not used.

Parameters: { "bootstrap", "class_weight

KeyboardInterrupt: 

Logistic Regression

In [118]:
clf_lgr = LogisticRegression(random_state=42, class_weight="balanced", solver="liblinear")

lgr_search_space = {
    'max_iter': (50, 500)
}


random_search = RandomizedSearchCV(estimator=clf_lgr, param_distributions=lgr_search_space, n_iter=100, cv=5)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
lgrc = LogisticRegression(**best_params)
print(best_params)

lgrc.fit(X_train, y_train)
y_pred = lgrc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

{'max_iter': 50}
Accuracy: 0.8044692737430168




Algo que estaba comentado...

In [None]:
'''rf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=10, random_state=30)
rf.fit(X_train, y_train)'''

In [None]:
'''y_predRF = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_predRF)
f1 = f1_score(y_test, y_predRF)
roc_auc = roc_auc_score(y_test, y_predRF)
conf_matrix = confusion_matrix(y_test, y_predRF)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)'''