In [1]:
import pandas as pd 
pd.set_option("display.max_columns",30)
pd.set_option("display.max_rows",30)
import matplotlib.pyplot as plt 
import cufflinks as cf 
cf.go_offline()
import numpy as np

### imputacion variables continuas
from sklearn.impute import SimpleImputer
from scipy.stats import ks_2samp

In [2]:
df = pd.read_csv("datasets/titanic.csv")

In [3]:
df.shape

(1310, 14)

In [4]:
df.survived.value_counts(True)

0.0    0.618029
1.0    0.381971
Name: survived, dtype: float64

In [5]:
print(df.head(32))

    pclass  survived                                             name     sex  \
0      1.0       1.0                    Allen, Miss. Elisabeth Walton  female   
1      1.0       1.0                   Allison, Master. Hudson Trevor    male   
2      1.0       0.0                     Allison, Miss. Helen Loraine  female   
3      1.0       0.0             Allison, Mr. Hudson Joshua Creighton    male   
4      1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   
..     ...       ...                                              ...     ...   
27     1.0       1.0          Bishop, Mrs. Dickinson H (Helen Walton)  female   
28     1.0       1.0                           Bissette, Miss. Amelia  female   
29     1.0       1.0        Bjornstrom-Steffansson, Mr. Mauritz Hakan    male   
30     1.0       0.0                     Blackwell, Mr. Stephen Weart    male   
31     1.0       1.0                                 Blank, Mr. Henry    male   

        age  sibsp  parch  

In [6]:
df.survived.value_counts(True)

0.0    0.618029
1.0    0.381971
Name: survived, dtype: float64

In [7]:
def completitud_datos_nulos(df):
    return df.isnull().sum().sort_values(ascending=False) / df.shape[0]

In [8]:
completitud_datos_nulos(df)

body         0.907634
cabin        0.774809
boat         0.629008
home.dest    0.431298
age          0.201527
embarked     0.002290
fare         0.001527
pclass       0.000763
survived     0.000763
name         0.000763
sex          0.000763
sibsp        0.000763
parch        0.000763
ticket       0.000763
dtype: float64

In [9]:
df = df[(~df["survived"].isnull()) & ~df["pclass"].isnull() & ~df["fare"].isnull() ]

In [10]:
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [11]:
def complete_continuous_variables(df,col,strategy='median'):
    X = df[col].copy()
    im = SimpleImputer(strategy=strategy)
    Xi = pd.DataFrame(im.fit_transform(X),columns=col)
    l_ks = []
    for v in col:
        l_ks.append([v,ks_2samp(X[v].dropna(),Xi[v]).statistic])
    ks = pd.DataFrame(l_ks,columns=['feat','ks'])
    #print(ks)
    print((ks.ks>=0.1).sum())
    df[col] = im.transform(df[col].copy())
    return df

In [12]:
complete_continuous_variables(df,["body"])

1


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,154.0,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,154.0,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,154.0,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,154.0,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,154.0,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,154.0,


In [13]:
completitud_datos_nulos(df)

cabin        0.774465
boat         0.628440
home.dest    0.430428
age          0.201070
embarked     0.001529
pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000000
body         0.000000
dtype: float64

In [14]:
varc = list(df.describe())

In [15]:
varc = [x for x in varc if x not in 'survived']

In [16]:
vard = [x for x in df.columns if x not in varc+['survived']]

In [17]:
tgt = 'survived'

In [18]:
df[tgt].value_counts()

0.0    808
1.0    500
Name: survived, dtype: int64

# Modelacion clasificacion

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [20]:
X = df[varc].copy() #TAD "Tabla Analitica de Datos"

In [21]:
y = df[tgt].copy() #variable objetivo

In [22]:
sc = MinMaxScaler()
Xs = pd.DataFrame(sc.fit_transform(X), columns=varc)

In [23]:
y = y.astype(int)

In [24]:
Xt, Xv, yt, yv = train_test_split(Xs,y, train_size=0.7)

In [25]:
def entrenar(param, modelo, X,y):
    grid = RandomizedSearchCV(param_distributions=param,
                             n_jobs=-1,
                             n_iter=10,
                             cv=4,
                             estimator=modelo,
                             error_score='raise')
    grid.fit(X,y)
    return grid, grid.best_estimator_, grid.best_score_, grid.best_params_

In [26]:
def metricas(Xt, Xv, yt, yv, modelo):
    d = {'train':round(roc_auc_score(y_true=yt, y_score=modelo.predict_proba(Xt)[:,1]),3),
         'validate':round(roc_auc_score(y_true=yv, y_score=modelo.predict_proba(Xv)[:,1]),3)
        }
    return d

# Red Neuronal

In [27]:
param_mlpc = dict(hidden_layer_sizes = [(a,b,c,) for a in range(len(varc), len(varc)*2) for b in range(len(varc), len(varc)*2) for c in range(len(varc), len(varc)*2)],
             activation = ['identity', 'logistic', 'tanh', 'relu'],
             solver = ['lbfgs', 'sgd', 'adam'],
             alpha = np.arange(0.0001, 0.001, 0.0001),
             learning_rate = ['constant', 'invscaling', 'adaptive'])

In [28]:
modelo_mlpc = MLPClassifier()
modelo_mlpc, best_estimator_mlpc, score_mlpc, params_mlpc = entrenar(param_mlpc, modelo_mlpc, Xt, yt)
metricas(Xt,Xv,yt,yv,modelo_mlpc)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
import pickle

In [None]:
filename = './redneural.pkl'
with open(filename, 'wb') as file:
    pickle.dump(modelo_mlpc,file)
    
print(f'Modelo guardado en {filename}')

In [None]:
pwd

In [None]:
params_mlpc

In [None]:
score_mlpc

In [None]:
best_estimator_mlpc

In [None]:
modelo_mlpc

# Random Forest

In [None]:
param = dict(n_estimators = range(2,10),
             max_depeth = range(2,6),
             max_features = range(2,len(varc)),
             criterion = ['gini', 'entropy'])

In [None]:
param = dict(n_estimators=list(range(1, 100, 25)),
                                    criterion=['gini', 'entropy'],
                                    max_depth=[x for x in list(range(2, 5))] + [None],
                                    min_samples_split=[x for x in list(range(2, 4))],
                                    min_samples_leaf=[x for x in list(range(2, 4))],
                                    max_features=[None] + [i * .05 for i in list(range(2, 4))],
                                    max_leaf_nodes=list(range(2, 10)) + [None],
                                    min_impurity_decrease=[x * .10 for x in list(range(2, 4))],
                                    oob_score=[True,False],
                                    warm_start=[True, False],
                                    class_weight=[None, 'balanced'],
                                    max_samples=[None],)

In [None]:
modelo = RandomForestClassifier()
modelo, best_estimator, score, params = entrenar(param, modelo, Xt, yt)
metricas(Xt,Xv,yt,yv,modelo)

# Ada Boost

In [None]:
param_adab = dict(n_estimators = range(2,10),
             learning_rate = np.arange(0.1,1,0.1),
             algorithm = ['SAMME.R'])

In [None]:
modelo_adab = AdaBoostClassifier()
modelo_adab, best_estimator, score, params = entrenar(param_adab, modelo_adab, Xt, yt)
metricas(Xt,Xv,yt,yv,modelo_adab)

# Analisis Discriminante

In [None]:
param_lda = dict(solver = ['svd', 'lsqr', 'eigen'])

In [None]:
modelo = LinearDiscriminantAnalysis()
modelo, best_estimator, score, params = entrenar(param_lda, modelo, Xt, yt)
metricas(Xt,Xv,yt,yv,modelo)

# Maquina Vector Soporte

In [None]:
param_svc = dict(C = np.arange(0,2,0.1),
             kernel = ['linear','poly','rbf','sigmoid'],
             degree = range(2,6),
             gamma = ['scale','auto'], 
             probability = [True])

In [None]:
modelo_svc = SVC()
modelo_svc, best_estimator, score, params = entrenar(param_svc, modelo_svc, Xt, yt)
metricas(Xt,Xv,yt,yv,modelo_svc)