In [39]:
# Importación de librerías
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [50]:
# Importación de datos
url = "https://raw.githubusercontent.com/IvTole/MachineLearning_InferenciaBayesiana_CUGDL/refs/heads/main/data/titanic/titanic_clean.csv"
df_titanic = pd.read_csv(url)
df_titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked_Q,embarked_S,familysize,isalone,title_Miss,title_Mr,title_Mrs,title_Rare,fare_log,familysize_log
0,1,1,"Allen, Miss. Elisabeth Walton",0,29.0,0,0,24160,211.3375,0,1,1,1,1,0,0,0,5.358177,0.693147
1,1,1,"Allison, Master. Hudson Trevor",1,0.92,1,2,113781,151.55,0,1,4,0,0,0,0,0,5.027492,1.609438
2,1,0,"Allison, Miss. Helen Loraine",0,2.0,1,2,113781,151.55,0,1,4,0,1,0,0,0,5.027492,1.609438
3,1,0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1,2,113781,151.55,0,1,4,0,0,1,0,0,5.027492,1.609438
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1,2,113781,151.55,0,1,4,0,0,0,1,0,5.027492,1.609438


In [15]:
# Dividir matriz de características y variable objetivo
X = df_titanic.drop(columns=["survived","fare","familysize","name","ticket"])
y = df_titanic["survived"]

In [51]:
# Estandarizar datos
scaler = MinMaxScaler()

X_std = scaler.fit_transform(X=X)
X_std = pd.DataFrame(data=X_std, columns=X.columns)
X_std

Unnamed: 0,pclass,sex,age,sibsp,parch,embarked_Q,embarked_S,isalone,title_Miss,title_Mr,title_Mrs,title_Rare,fare_log,familysize_log
0,0.0,0.0,0.361142,0.000,0.000000,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.858556,0.000000
1,0.0,1.0,0.009395,0.125,0.222222,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.805569,0.511392
2,0.0,0.0,0.022924,0.125,0.222222,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.805569,0.511392
3,0.0,1.0,0.373669,0.125,0.222222,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.805569,0.511392
4,0.0,0.0,0.311036,0.125,0.222222,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.805569,0.511392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,1.0,0.0,0.179506,0.125,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.438698,0.226294
1304,1.0,0.0,0.348616,0.125,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.438698,0.226294
1305,1.0,1.0,0.329826,0.000,0.000000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.337639,0.000000
1306,1.0,1.0,0.336089,0.000,0.000000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.337639,0.000000


In [52]:
# Crear las componentes principales
print("Numero de variables: ", X_std.shape[1])

# Elegimos numero de componentes principales
n_components = 6

# Instanciamos método de sklearn
pca = PCA(n_components=n_components)

# Aplicamos método
X_pca_array = pca.fit_transform(X_std)

# Pasamos otra vez a dataframe
X_pca = pd.DataFrame()
for i in range(0,n_components):
    X_pca["Component " + str(i+1)] = X_pca_array[:,i]

# Imprimimos nuevo dataframe
display(X_pca)

# Varianza explicada
print('Porcentaje de varianza explicada: ', round(pca.explained_variance_ratio_.sum()*100,2))

Numero de variables:  14


Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6
0,-0.815581,0.513246,0.404329,-0.886250,-0.552863,0.059998
1,-0.384392,-0.832218,-0.106353,-0.099557,-0.555450,0.301218
2,-1.253756,-0.215088,0.255448,-0.218475,-0.897329,0.054336
3,0.212638,-0.951360,-0.257186,-0.121062,-0.607937,0.174953
4,-1.199796,-0.885119,-0.064375,-0.460298,0.300039,0.176823
...,...,...,...,...,...,...
1303,-1.157803,0.496915,-0.164493,0.632900,-0.359818,-0.487844
1304,-1.154301,0.490403,-0.179593,0.605141,-0.348467,-0.467731
1305,0.698161,0.445826,-0.464400,0.251829,0.216491,-0.415286
1306,0.698291,0.445585,-0.464959,0.250801,0.216912,-0.414541


Porcentaje de varianza explicada:  93.36


## Aplicar modelos ML

In [53]:
# Split de datos de entrenamiento y validación

X_train, X_test, X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_std,
                                                                             X_pca,
                                                                             y,
                                                                             test_size=0.2,
                                                                             random_state=42)

In [54]:
# Instanciamos modelo
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=6,
                            min_samples_split=30,
                            min_samples_leaf=8)
rf_pca = RandomForestClassifier(n_estimators=100,
                                max_depth=6,
                                min_samples_split=30,
                                min_samples_leaf=8)

# Entrenamos
rf.fit(X_train, y_train)
rf_pca.fit(X_train_pca, y_train)

# Predicciones
y_pred = rf.predict(X_test)
y_pred_pca = rf_pca.predict(X_test_pca)

# Métricas
acc = accuracy_score(y_pred, y_test)
acc_pca = accuracy_score(y_pred_pca, y_test)

print(f"Accuracy (original): {round(acc,3)} % ")
print(f"Accuracy (pca): {round(acc_pca,3)} %")

Accuracy (original): 0.782 % 
Accuracy (pca): 0.779 %
