# Sesión 10: Introducción a Machine Learning - Supervivencia Titanic
## José David Mendoza Pérez
### Mayo, 2024
Tomado de Kaggle (https://www.kaggle.com/c/titanic)

### Módulos necesarios:
- Numpy
- Pandas
- Matplotlib
- Scikit-learn
- Seaborn

In [None]:
# Para instalar los módulos necesarios, por favor ejecute el siguiente código:
%pip install numpy pandas matplotlib scikit-learn seaborn

In [None]:
# Establecer nuestro entorno de trabajo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [141]:
# Cargar nuestros datos
titanic_data = pd.read_csv("train.csv")

### Análisis exploratorio de nuestros datos

In [None]:
# Primeras 5 filas
titanic_data.head()

In [None]:
# Últimas 5 filas
titanic_data.tail()

In [None]:
# Estadísticas de nuestros datos
titanic_data.describe()

In [None]:
# Mirar correlación entre los datos
# Seleccionar columnas que se puedan correlacionar
titanic_heatmap = titanic_data.select_dtypes(include=['int64', 'float64'])

In [None]:
import seaborn as sns
sns.heatmap(titanic_heatmap.corr(), cmap="YlGnBu")
plt.show()

In [142]:
# Otra forma
titanic_data.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit as sss
# Vamos a escoger nuestro conjunto de entrenamiento y de testeo de forma estratificada
split = sss(n_splits=1, test_size=0.2)
for train_indices, test_indices in split.split(titanic_data, titanic_data[["Survived", "Pclass", "Sex"]]):
    strat_train_set = titanic_data.loc[train_indices]
    strat_test_set = titanic_data.loc[test_indices]

In [None]:
plt.subplot(1, 2, 1)

strat_train_set['Survived'].hist()
strat_train_set['Pclass'].hist()

plt.subplot(1, 2, 2)
strat_test_set['Survived'].hist()
strat_test_set['Pclass'].hist()

plt.show()

In [None]:
strat_train_set.info()

Como pueden ver, hay muchos valores faltantes en las columnas Age y Cabin. Para resolver esto, hay dos vías:
- 1. Eliminar los datos vacíos
- 2. Imputar los datos vacíos
Dado que nos interesa mantener la mayor cantidad de información posible, haremos una imputación de los datos vacíos.

Para transformar nuestra base de datos para adaptarla a un modelo de Machine Learning, vamos a adaptarla paso a paso (pueden pensarlo como un proceso de tuberías, donde vamos transformando el dataset parte por parte).

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y= None):
        return self
    
    def transform(self, X):
        imputer = SimpleImputer(strategy="mean")
        X['Age'] = imputer.fit_transform(X[['Age']])
        return X

In [None]:
from sklearn.preprocessing import OneHotEncoder

class FeatureEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y= None):
        return self
    
    def transform(self, X):
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Embarked']]).toarray()

        column_names = ["C", "S", "Q", "N"]

        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]
        
        matrix = encoder.fit_transform(X[['Sex']]).toarray()
        
        column_names = ["Female", "Male"]

        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        return X

In [None]:
class FeatureDropper(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(["Embarked", "Name", "Ticket", "Cabin", "Sex", "N"], axis = 1, errors = "ignore")

In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([("ageimputer", AgeImputer()), 
                     ("featureencoder", FeatureEncoder()),
                     ("featuredropper", FeatureDropper())])

In [None]:
strat_train_set = pipeline.fit_transform(strat_train_set)

In [None]:
strat_train_set

In [None]:
strat_train_set.info()

In [None]:
from sklearn.preprocessing import StandardScaler

X = strat_train_set.drop(["Survived"],axis = 1)
y = strat_train_set["Survived"]

scaler = StandardScaler()
X_data = scaler.fit_transform(X)
y_data = y.to_numpy()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()

param_grid = [
    {"n_estimators": [10, 100, 200, 500], "max_depth": [None, 5, 10], "min_samples_split":[2, 3, 4]}
]

grid_search = GridSearchCV(clf, param_grid, cv = 3, scoring="accuracy", return_train_score=True)
grid_search.fit(X_data, y_data)

In [None]:
final_clf = grid_search.best_estimator_

In [None]:
final_clf

In [None]:
strat_test_set = pipeline.fit_transform(strat_test_set)

In [None]:
strat_test_set

In [None]:
X_test = strat_test_set.drop(["Survived"], axis = 1)
y_test = strat_test_set["Survived"]

scaler = StandardScaler()
X_data_test = scaler.fit_transform(X_test)
y_data_test = y_test.to_numpy()

In [None]:
final_clf.score(X_data_test, y_data_test)

In [None]:
final_titanic_data = pipeline.fit_transform(titanic_data)

In [None]:
final_titanic_data

In [None]:
X_final = final_titanic_data.drop(["Survived"], axis = 1)
y_final = final_titanic_data["Survived"]

scaler = StandardScaler()
X_data_final = scaler.fit_transform(X_final)
y_data_final = y_final

In [None]:
prod_clf = RandomForestClassifier()

param_grid = [
    {"n_estimators": [10, 100, 200, 500], "max_depth": [None, 5, 10], "min_samples_split":[2, 3, 4]}
]

grid_search = GridSearchCV(prod_clf, param_grid, cv = 3, scoring="accuracy", return_train_score=True)
grid_search.fit(X_data_final, y_data_final)

In [None]:
prod_final_clf = grid_search.best_estimator_

In [None]:
prod_final_clf

In [None]:
titanic_test_data = pd.read_csv("test.csv")
final_test_data = pipeline.fit_transform(titanic_test_data)

In [None]:
final_test_data.info()

En la columna "Fare" (tarifa del tiquete) hay un valor vacío, así que lo imputaremos.

In [None]:
X_final_test = final_test_data
X_final_test = X_final_test.ffill()

scaler = StandardScaler()
X_data_final_test = scaler.fit_transform(X_final_test)

In [None]:
predictions = prod_final_clf.predict(X_data_final_test)

In [None]:
predictions

In [135]:
final_df = pd.DataFrame(titanic_test_data["PassengerId"])
final_df["Survived"] = predictions
final_df.to_csv("predictions.csv", index=False)

In [136]:
final_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
