In [40]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
# train_test_split: Para dividir el dataset en conjuntos de entrenamiento y validación.
# Uso: Asegura que el modelo se evalúe en datos no vistos durante el entrenamiento.
# GridSearchCV: Para la optimización de hiperparámetros de nuestro modelo y/o pipeline.
# Uso: Busca sistemáticamente la mejor combinación de parámetros.

from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
# StandardScaler: Para estandarizar (escalar) las características numéricas.
# Uso: Transforma los datos para que tengan media 0 y desviación estándar 1, útil para muchos algoritmos.
# OneHotEncoder: Para convertir características categóricas (como 'Sex', 'Embarked') a formato numérico binario.
# Uso: Permite que los modelos de ML trabajen con datos categóricos.
# FunctionTransformer: Para integrar funciones personalizadas (como tus funciones de ingeniería de características)
# Uso: Te permite usar tus funciones Python dentro del pipeline de Scikit-learn.

from sklearn.impute import SimpleImputer
# Uso: Para manejar los valores faltantes. Por ejemplo, rellenar 'Age' o 'Fare' con la media.

from sklearn.compose import ColumnTransformer
# Uso: Permite aplicar diferentes transformaciones a diferentes columnas del DataFrame.
#      Es clave para nuestro pipeline, ya que tenemos columnas numéricas y categóricas.

from sklearn.pipeline import Pipeline
# Uso: La clase principal para construir tu pipeline. Encadena todos los pasos de preprocesamiento y el modelo.
#      Esto garantiza que todas las transformaciones se apliquen consistentemente.

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
# Uso: El algoritmo de Machine Learning que usaremos para nuestro modelo de clasificación binaria.
#      Es un clasificador de conjunto robusto y de alto rendimiento.

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Uso: Para evaluar el rendimiento de nuestro modelo.
# accuracy_score: La métrica principal para el concurso de Kaggle.
# confusion_matrix: Muestra la cantidad de verdaderos positivos, negativos, falsos positivos y negativos.
# classification_report: Proporciona precisión (precision), recall (sensibilidad) y F1-score por clase.

# Librerías para visualización de datos (aunque en este notebook será más para mostrar resultados)
import matplotlib.pyplot as plt
import seaborn as sns
# Uso: Para crear gráficos, como la matriz de confusión, para visualizar y entender los resultados del modelo.

# Configuración para gráficos
sns.set_style("whitegrid")
plt.style.use("seaborn-v0_8-darkgrid")
import numpy as np
# Uso: Ajustes estéticos para que los gráficos se vean bien.

In [41]:
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-darkgrid')

In [55]:
lista = np.array([1,2,3,4,5,6])
res_list = lista.reshape(-1, 4)
res_list

ValueError: cannot reshape array of size 6 into shape (4)

In [43]:
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df_join_raw = pd.concat([df_train_raw.drop('Survived', axis=1), df_test_raw], ignore_index=True)
temp_df = df_join_raw.copy()
print(f'Dimensiones iniciales: {df_join_raw.shape}')
print(f'Columnas iniciales: {df_join_raw.columns.to_list}')

Dimensiones iniciales: (1309, 11)
Columnas iniciales: <bound method IndexOpsMixin.tolist of Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')>


In [44]:
df_join_raw

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [45]:

def get_title_from_name(df):
    df = df.copy()
    def extract_title(name):
        title_extracted = re.search(' ([A-Za-z]+)\\.', name)
        if title_extracted:
            return title_extracted.group(1)
        return 'rare'
    df['Title'] = df['Name'].apply(extract_title)
    df = df.drop('Name', axis=1)
    return df

def classify_titles(df):
    df = df.copy()
    def classify(title):
        if title in ('Countess', 'Lady', 'Jonkheer'):#3
            return 3
        elif title in ('Miss', 'Mrs', 'Mme', 'Mlle', 'Ms', 'Master', 'Dona'):#7
            return 2
        elif title in ('Dr', 'Rev', 'Col', 'Capt', 'Sir', 'Major', 'Don'):#7
            return 1
        return 0 #('Mr', 'rare')
    df['Title'] = df['Title'].apply(classify)
    return df


def get_deck_from_cabin(df):
    df = df.copy()
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Deck'] = df['Cabin'].transform(lambda x: x[0])
    df = df.drop('Cabin', axis=1)
    return df

def get_agency_ticket_numbers_from_ticket(df):
    df = df.copy()
    def extract_prefix(ticket):
        prefix_extracted = re.match(r'([A-Za-z\./]+)', ticket)
        if prefix_extracted:
            return prefix_extracted.group(1).replace('.', '').replace('/', '').upper()
        return 'NO_AGENCY'
    df['Agency'] = df['Ticket'].apply(extract_prefix)
    ticket_counts = df['Ticket'].value_counts()
    df['TicketNumber'] = df['Ticket'].map(ticket_counts)
    df = df.drop('Ticket', axis=1)
    return df

def familysize_isAlone_from_sibsp_parch(df):
    df = df.copy()
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df = df.drop(['Parch', 'SibSp', 'PassengerId'], axis=1)
    return df

def reorgize_ticket_class(df):
    df = df.copy()
    def condition_class(ticket_class):
        class_mapping = { 1: 2,  2: 1, 3: 0}
        return class_mapping.get(ticket_class, 0)
    df['Pclass'] = df['Pclass'].apply(condition_class)
    return df

def age_imputed_from_title_pclass(df):
    df = df.copy()
    df['Age'] = df.groupby(['Pclass', 'Title'])['Age'].transform(lambda x: x.fillna(x.median()))
    df['Age'] = temp_df['Age'].fillna(temp_df['Age'].median())
    return df

feature_engineering_pipeline = Pipeline([
    ('get_title', FunctionTransformer(get_title_from_name, validate=False)),
    ('classify_title', FunctionTransformer(classify_titles, validate=False)),
    ('get_deck', FunctionTransformer(get_deck_from_cabin, validate=False)),
    ('get_agency', FunctionTransformer(get_agency_ticket_numbers_from_ticket, validate=False)),
    ('familySize', FunctionTransformer(familysize_isAlone_from_sibsp_parch, validate=False)),
    ('reorganize_pclass', FunctionTransformer(reorgize_ticket_class, validate=False)),
    ('imputer_custom_age', FunctionTransformer(age_imputed_from_title_pclass, validate=False))
])
feature_engineering_pipeline.set_output(transform='pandas')
print('Cargado transformadores de ingenieria de features')

Cargado transformadores de ingenieria de features


In [None]:
numeric_features = ['Age', 'Fare', 'TicketNumber', 'FamilySize', 'Pclass', 'Title']
numeric_transformer = Pipeline(steps=[
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Deck', 'Agency', 'Sex']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

preprocessor.set_output(transform='pandas')
print('preprocessor created')

preprocessor created


In [None]:
def create_pipeline():
    pipeline = Pipeline(steps=[
        ('feature_engineering', feature_engineering_pipeline),
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
    ])
    return pipeline


In [47]:
full_pipeline = Pipeline(steps=[
    ('features_engineering', feature_engineering_pipeline),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
print('defined full pipelines')

defined full pipelines


In [48]:
X = df_train_raw.drop(['Survived'], axis=1)
y = df_train_raw['Survived']
X_test_final = df_test_raw.copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [49]:
x_demos = full_pipeline.named_steps['features_engineering'].fit_transform(temp_df)
""" x_demos = full_pipeline.named_steps['preprocessor'].fit_transform(x_demos) """
x_demos

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Agency,TicketNumber,FamilySize,IsAlone
0,0,male,22.0,7.2500,S,0,U,A,1,2,0
1,2,female,38.0,71.2833,C,2,C,PC,2,2,0
2,0,female,26.0,7.9250,S,2,U,STONO,1,1,1
3,2,female,35.0,53.1000,S,2,C,NO_AGENCY,2,2,0
4,0,male,35.0,8.0500,S,0,U,NO_AGENCY,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
1304,0,male,28.0,8.0500,S,0,U,A,1,1,1
1305,2,female,39.0,108.9000,C,2,C,PC,3,1,1
1306,0,male,38.5,7.2500,S,0,U,SOTONOQ,1,1,1
1307,0,male,28.0,8.0500,S,0,U,NO_AGENCY,1,1,1


In [50]:
""" full_pipeline.fit(X_train, y_train)
y_pred_val_pipeline = full_pipeline.predict(X_val)

acurrancy_pipeline = accuracy_score(y_val, y_pred_val_pipeline)
print(f'Accuracy en el conjunto de validacion {acurrancy_pipeline}')
print(confusion_matrix(y_val, y_pred_val_pipeline))
print(classification_report(y_val, y_pred_val_pipeline, target_names=['No Sobrevive (0)', 'Sobrevive (1)'])) """

" full_pipeline.fit(X_train, y_train)\ny_pred_val_pipeline = full_pipeline.predict(X_val)\n\nacurrancy_pipeline = accuracy_score(y_val, y_pred_val_pipeline)\nprint(f'Accuracy en el conjunto de validacion {acurrancy_pipeline}')\nprint(confusion_matrix(y_val, y_pred_val_pipeline))\nprint(classification_report(y_val, y_pred_val_pipeline, target_names=['No Sobrevive (0)', 'Sobrevive (1)'])) "

In [51]:
""" final_test_predictions = full_pipeline.predict(X_test_final)
submission_df = pd.DataFrame({
    'PassengerId': df_test_raw['PassengerId'],
    'Survived': final_test_predictions
})
submission_df.to_csv('../data/result/submission.csv', index=False) """

" final_test_predictions = full_pipeline.predict(X_test_final)\nsubmission_df = pd.DataFrame({\n    'PassengerId': df_test_raw['PassengerId'],\n    'Survived': final_test_predictions\n})\nsubmission_df.to_csv('../data/result/submission.csv', index=False) "

In [52]:
""" param_grid_pipeline = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__max_features': ['sqrt', 'log2'],
} """
param_grid = [
    {
        'classifier': [RandomForestClassifier(random_state=42)], 
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
        'classifier__max_features': ['sqrt', 'log2']
    },
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.05, 0.1],
        'classifier__max_depth': [3, 5],
        'classifier__subsample': [0.8, 1.0]
    },
    {
        'classifier': [LogisticRegression(random_state=42, solver='liblinear')],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2']
    }
]

""" grid_search_pipeline = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy',
                                    n_jobs=-1, verbose=2)
grid_search_pipeline.fit(X_train, y_train)

print(grid_search_pipeline.best_params_)
print(f"Mejor Accuracy en CV (Pipeline): {grid_search_pipeline.best_score_:.4f}")
best_pipeline_model = grid_search_pipeline.best_estimator_
print(f"\nEl mejor clasificador encontrado es: {best_pipeline_model.named_steps['classifier']}")
y_pred_best_pipeline = best_pipeline_model.predict(X_val)
accuracy_best_pipeline = accuracy_score(y_val, y_pred_best_pipeline)
print(f"Accuracy del MEJOR Pipeline en X_val: {accuracy_best_pipeline:.4f}")
print(confusion_matrix(y_val, y_pred_best_pipeline))

final_test_predictions = best_pipeline_model.predict(X_test_final)
submission_df = pd.DataFrame({
    'PassengerId': df_test_raw['PassengerId'],
    'Survived': final_test_predictions
})
submission_df.to_csv('../data/result/submission.csv', index=False) """

' grid_search_pipeline = GridSearchCV(full_pipeline, param_grid, cv=5, scoring=\'accuracy\',\n                                    n_jobs=-1, verbose=2)\ngrid_search_pipeline.fit(X_train, y_train)\n\nprint(grid_search_pipeline.best_params_)\nprint(f"Mejor Accuracy en CV (Pipeline): {grid_search_pipeline.best_score_:.4f}")\nbest_pipeline_model = grid_search_pipeline.best_estimator_\nprint(f"\nEl mejor clasificador encontrado es: {best_pipeline_model.named_steps[\'classifier\']}")\ny_pred_best_pipeline = best_pipeline_model.predict(X_val)\naccuracy_best_pipeline = accuracy_score(y_val, y_pred_best_pipeline)\nprint(f"Accuracy del MEJOR Pipeline en X_val: {accuracy_best_pipeline:.4f}")\nprint(confusion_matrix(y_val, y_pred_best_pipeline))\n\nfinal_test_predictions = best_pipeline_model.predict(X_test_final)\nsubmission_df = pd.DataFrame({\n    \'PassengerId\': df_test_raw[\'PassengerId\'],\n    \'Survived\': final_test_predictions\n})\nsubmission_df.to_csv(\'../data/result/submission.csv\',