**LIBERIAS**

In [131]:
#De cajon
import numpy as np
import pandas as pd

#Imputar y verificar
from sklearn.impute import SimpleImputer
from scipy.stats import ks_2samp
from sklearn.impute import KNNImputer

#Estandarizar y pca
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

#Entrenar y metricas (randimizecsearchcv, gridsearchcv, roc_auc_score)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

#Redes neuronales
from sklearn.neural_network import MLPClassifier

#Machines vector soporte
from sklearn.svm import SVC

#Arboles de decision
from sklearn.ensemble import RandomForestClassifier

#Discriminante
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

**CARGA Y LIMPIEZA**

In [132]:
# Cargar el dataset

df = pd.read_csv("titanic.csv")
print('Tamaño del dataset:', df.shape)
df.head(2)

Tamaño del dataset: (1310, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [133]:
# Dividir las variables en continuas y discretas

varc = list(df.describe().columns)
varc.remove('survived')

vard = list(set(df.columns) - set(varc))
vard.remove('survived')

print("Variables continuas: ", varc)
print("Variables discretas: ", vard)
print("Variable objetivo: survived")

Variables continuas:  ['pclass', 'age', 'sibsp', 'parch', 'fare', 'body']
Variables discretas:  ['ticket', 'boat', 'sex', 'cabin', 'embarked', 'name', 'home.dest']
Variable objetivo: survived


In [134]:
#EL genero podria ser una varibale interesante a tomar en cuenta en el modelo, por lo que se cambiara a numerica

df['sex'] = df['sex'].apply(lambda x:1 if x == 'male' else 0)
varc.append('sex') if 'sex' not in varc else None

In [135]:
# embarked podria ser una variable tambien interesante a tomar en cuenta en el modelo, por lo que se cambiara a numerica

df = df.dropna(subset=['survived'])
df['embarked'] = df['embarked'].apply(lambda x: 0 if x == 'C' else (1 if x == 'Q' else 2))
varc.append('embarked') if 'embarked' not in varc else None


In [136]:
# Verificar si hay valores nulos

df[varc].isnull().sum().sort_values(ascending=False) / df.shape[0]

body        0.907563
age         0.200917
fare        0.000764
pclass      0.000000
parch       0.000000
sibsp       0.000000
sex         0.000000
embarked    0.000000
dtype: float64

In [137]:
# Eliminar la columna ya que es asquerosa la cantidad de nulos que tiene

df.drop('body', axis=1, inplace=True) if 'body' in df.columns else None
df = df.dropna(subset=['survived'])
varc = list(set(varc) - set(['body']))

In [138]:
# Verificar si hay valores nulos

df[varc].isnull().sum().sort_values(ascending=False) / df.shape[0]

age         0.200917
fare        0.000764
parch       0.000000
pclass      0.000000
sibsp       0.000000
embarked    0.000000
sex         0.000000
dtype: float64

In [139]:
# Ver que opcion es mejor para imputar los valores nulos de la edad

x = df[['age']].copy()

im1 = SimpleImputer(strategy = 'mean')
im2 = SimpleImputer(strategy = 'median')
im3 = SimpleImputer(strategy = 'most_frequent')
im4 = KNNImputer(n_neighbors = 5)

x1 = pd.DataFrame(im1.fit_transform(x), columns = ['age'])
x2 = pd.DataFrame(im2.fit_transform(x), columns = ['age'])
x3 = pd.DataFrame(im3.fit_transform(x), columns = ['age'])
x4 = pd.DataFrame(im4.fit_transform(x), columns = ['age'])


ks = []

ks.append(ks_2samp(x.dropna(), x1).statistic)
ks.append(ks_2samp(x.dropna(), x2).statistic)
ks.append(ks_2samp(x.dropna(), x3).statistic)
ks.append(ks_2samp(x.dropna(), x4).statistic)

print(ks)

[array([0.10929409]), array([0.09796131]), array([0.12254768]), array([0.10929409])]


In [140]:
#La mejor opcion segun statistic seria con la moda pero tomar en cuneta que el pvalue en todos los casos marcaba que no se cumplia la Ho

df['age'] = x3['age']
df[varc].isnull().sum().sort_values(ascending=False) / df.shape[0]

fare        0.000764
sibsp       0.000000
parch       0.000000
pclass      0.000000
sex         0.000000
embarked    0.000000
age         0.000000
dtype: float64

In [141]:
# Se imputa con lo que sea, namas es un valor

print("Nulos de fare: ", df['fare'].isnull().sum())
x = df[['fare']].copy()
x1 = pd.DataFrame(im1.fit_transform(x), columns = ['fare'])

df['fare'] = x1['fare']

Nulos de fare:  1


In [142]:
# Verificar si hay valores nulos

df[varc].isnull().sum().sort_values(ascending=False) / df.shape[0]

parch       0.0
sibsp       0.0
pclass      0.0
sex         0.0
embarked    0.0
fare        0.0
age         0.0
dtype: float64

In [143]:
df[varc].dtypes


parch       float64
sibsp       float64
pclass      float64
sex           int64
embarked      int64
fare        float64
age         float64
dtype: object

In [144]:
#Pasar a enteros

df['sibsp'] = df['sibsp'].astype(int)
df['parch'] = df['parch'].astype(int)
df['pclass'] = df['pclass'].astype(int)

**MODELO**

In [145]:
X = df[varc].copy()
y = df['survived'].copy()

In [146]:
#Se escala

escalar = MinMaxScaler()
df_e = escalar.fit_transform(X)

In [147]:
# Las funciones para entrenar y testear los modelos obtenidos

def entrenar(param, modelo, X,y):
    grid = RandomizedSearchCV(param_distributions=param,
    n_jobs=-1,
    n_iter=100,
    cv=6,
    estimator=modelo,
    error_score='raise')
    
    grid.fit(X,y)
    
    return grid, grid.best_estimator_, grid.best_score_, grid.best_params_

def metricas(Xt, Xv, yt, yv, modelo):
    d = {'train':round(roc_auc_score(y_true=yt, y_score=modelo.predict_proba(Xt)[:,1]),3),
    'validate':round(roc_auc_score(y_true=yv, y_score=modelo.predict_proba(Xv)[:,1]),3)}

    print(d)
    
    return d

In [148]:
# Red neuronal, se aplica PCA y se ve que valores n mejor resultados de las componenetes.

param_mlpc = dict(hidden_layer_sizes = [(a,b,c,) for a in range(len(varc), len(varc)*2) for b in range(len(varc), len(varc)*2) for c in range(len(varc), len(varc)*2)],
             activation = ['identity', 'logistic', 'tanh', 'relu'],
             solver = ['lbfgs', 'sgd', 'adam'],
             alpha = np.arange(0.0001, 0.001, 0.0001),
             learning_rate = ['constant', 'invscaling', 'adaptive'])

for i in range(2,8):
    pca = PCA(n_components = i)
    df_pca = pd.DataFrame(pca.fit_transform(df_e))

    Xt, Xv, yt, yv = train_test_split(df_pca,y, train_size=0.7)

    modelo_mlpc = MLPClassifier()
    modelo_mlpc, best_estimator_mlpc, score_mlpc, params_mlpc = entrenar(param_mlpc, modelo_mlpc, Xt, yt)
    metricas(Xt,Xv,yt,yv,modelo_mlpc)
    print("Componentes: ", i)



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


{'train': np.float64(0.839), 'validate': np.float64(0.835)}
Componentes:  2


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


{'train': np.float64(0.866), 'validate': np.float64(0.85)}
Componentes:  3


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


{'train': np.float64(0.856), 'validate': np.float64(0.847)}
Componentes:  4
{'train': np.float64(0.852), 'validate': np.float64(0.847)}
Componentes:  5
{'train': np.float64(0.848), 'validate': np.float64(0.846)}
Componentes:  6
{'train': np.float64(0.846), 'validate': np.float64(0.897)}
Componentes:  7




In [149]:
# Maquina vector soporte, se aplica PCA y se ve que valores n mejor resultados de las componenetes.

param_svc = dict(
    C = np.arange(0.1, 2, 0.1),
    kernel = ['linear', 'poly', 'rbf', 'sigmoid'],
    degree = range(2, 6),
    gamma = ['scale', 'auto'],
    probability = [True]
)

for i in range(2,8):
    pca = PCA(n_components = i)
    df_pca = pd.DataFrame(pca.fit_transform(df_e))

    Xt, Xv, yt, yv = train_test_split(df_pca,y, train_size=0.7)

    modelo_svc = SVC()
    modelo_svc, best_estimator, score, params = entrenar(param_svc, modelo_svc, Xt, yt)
    metricas(Xt,Xv,yt,yv,modelo_svc)
    print("Componentes: ", i)



{'train': np.float64(0.817), 'validate': np.float64(0.815)}
Componentes:  2
{'train': np.float64(0.818), 'validate': np.float64(0.795)}
Componentes:  3
{'train': np.float64(0.801), 'validate': np.float64(0.766)}
Componentes:  4
{'train': np.float64(0.835), 'validate': np.float64(0.818)}
Componentes:  5
{'train': np.float64(0.847), 'validate': np.float64(0.825)}
Componentes:  6
{'train': np.float64(0.824), 'validate': np.float64(0.84)}
Componentes:  7


In [150]:
# Random forest, se aplica PCA y se ve que valores n mejor resultados de las componenetes.

param = dict(n_estimators=list(range(1, 100, 25)),
                                    criterion=['gini', 'entropy'],
                                    max_depth=[x for x in list(range(2, 5))] + [None],
                                    min_samples_split=[x for x in list(range(2, 4))],
                                    min_samples_leaf=[x for x in list(range(2, 4))],
                                    max_features=[None] + [i * .05 for i in list(range(2, 4))],
                                    max_leaf_nodes=list(range(2, 10)) + [None],
                                    min_impurity_decrease=[x * .10 for x in list(range(2, 4))],
                                    oob_score=[True,False],
                                    warm_start=[True, False],
                                    class_weight=[None, 'balanced'],
                                    max_samples=[None],)


for i in range(2,8):
    pca = PCA(n_components = i)
    df_pca = pd.DataFrame(pca.fit_transform(df_e))

    Xt, Xv, yt, yv = train_test_split(df_pca,y, train_size=0.7)

    modelo = RandomForestClassifier()
    modelo, best_estimator, score, params = entrenar(param, modelo, Xt, yt)
    metricas(Xt,Xv,yt,yv,modelo)
    print("Componentes: ", i)



{'train': np.float64(0.821), 'validate': np.float64(0.823)}
Componentes:  2


  warn(


{'train': np.float64(0.816), 'validate': np.float64(0.782)}
Componentes:  3


  warn(


{'train': np.float64(0.804), 'validate': np.float64(0.798)}
Componentes:  4
{'train': np.float64(0.788), 'validate': np.float64(0.756)}
Componentes:  5
{'train': np.float64(0.772), 'validate': np.float64(0.749)}
Componentes:  6


  warn(


{'train': np.float64(0.828), 'validate': np.float64(0.806)}
Componentes:  7


In [151]:
# Analisis del dicriminante, se aplica PCA y se ve que valores n mejor resultados de las componenetes.

param_lda = dict(solver = ['svd', 'lsqr', 'eigen'])

for i in range(2,8):
    pca = PCA(n_components = i)
    df_pca = pd.DataFrame(pca.fit_transform(df_e))

    Xt, Xv, yt, yv = train_test_split(df_pca,y, train_size=0.7)

    modelo = LinearDiscriminantAnalysis()
    modelo, best_estimator, score, params = entrenar(param_lda, modelo, Xt, yt)
    metricas(Xt,Xv,yt,yv,modelo)
    print("Componentes: ", i)

{'train': np.float64(0.822), 'validate': np.float64(0.822)}
Componentes:  2
{'train': np.float64(0.819), 'validate': np.float64(0.821)}
Componentes:  3
{'train': np.float64(0.838), 'validate': np.float64(0.809)}
Componentes:  4




{'train': np.float64(0.838), 'validate': np.float64(0.845)}
Componentes:  5
{'train': np.float64(0.854), 'validate': np.float64(0.804)}
Componentes:  6
{'train': np.float64(0.843), 'validate': np.float64(0.835)}
Componentes:  7




In [152]:
#AdaBoost, se aplica PCA y se ve que valores n mejor resultados de las componenetes.

param_adab = dict(n_estimators = range(2,10),
             learning_rate = np.arange(0.1,1,0.1),
             algorithm = ['SAMME'])

for i in range(2,8):
    pca = PCA(n_components = i)
    df_pca = pd.DataFrame(pca.fit_transform(df_e))

    Xt, Xv, yt, yv = train_test_split(df_pca,y, train_size=0.7)

    modelo_adab = AdaBoostClassifier()
    modelo_adab, best_estimator, score, params = entrenar(param_adab, modelo_adab, Xt, yt)
    metricas(Xt,Xv,yt,yv,modelo_adab)
    print("Componentes: ", i)



{'train': np.float64(0.761), 'validate': np.float64(0.761)}
Componentes:  2




{'train': np.float64(0.824), 'validate': np.float64(0.829)}
Componentes:  3




{'train': np.float64(0.766), 'validate': np.float64(0.749)}
Componentes:  4




{'train': np.float64(0.758), 'validate': np.float64(0.754)}
Componentes:  5




{'train': np.float64(0.763), 'validate': np.float64(0.753)}
Componentes:  6
{'train': np.float64(0.791), 'validate': np.float64(0.83)}
Componentes:  7




El mejor valor obtenido del modelo fue con la red neuronal con 7 componentes con la qu ese obtuvo:

train: 84.6 %
validate: 89.7 %