In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split

%matplotlib inline

sns.set()

**DATA**

In [7]:
test_PATH = '/content/titanic-test_b6547a1c-55fe-4409-8e04-67b6678214a4.csv'
train_PATH = '/content/titanic-train_4adcc656-193c-46d5-a978-f254fe4f0c22.csv'

test_df = pd.read_csv(test_PATH)
train_df = pd.read_csv(train_PATH)

print(train_df.shape)

train_df.head(3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [0]:
train_df.Sex.value_counts().plot(kind = 'bar' , color = ['b' , 'r'])

plt.title('distribucion de sobrivivientes')

plt

**LIMPIEZA DE DATOS**

In [0]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

encoder_sex = label_encoder.fit_transform(train_df['Sex'])

train_df

In [0]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median()) # filtramos los datos nulos (fillna) en la seccion de Age

train_df['Embarked'] = train_df['Embarked'].fillna(train_df['S'])

In [30]:
train_predictors = train_df.drop(['PassengerId' , 'Survived' , 'Name' , 'Ticket' , 'Cabin'] , axis= 1) # quitamos estos datos que son irrelevantes

categorical_cols = [cname for cname in train_predictors.columns if
                          train_predictors[cname].nunique() < 10 and
                          train_predictors[cname].dtype == 'object'
                    ] # hagarramos las columnas que son de tipo categoricas

categorical_cols            

['Sex', 'Embarked']

In [31]:
numerical_cols = [cname for cname in train_predictors.columns if
                          train_predictors[cname].dtype in ['int64' , 'float64' ]
                    ]# hagarramos las columnas que son de tipo numericas 
                
numerical_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [34]:
my_cols = categorical_cols + numerical_cols

train_predictors = train_predictors[my_cols]

train_predictors.head(3)

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,S,3,22.0,1,0,7.25
1,female,C,1,38.0,1,0,71.2833
2,female,S,3,26.0,0,0,7.925


In [36]:
dummy_encoded_train_predictors = pd.get_dummies(train_predictors)
dummy_encoded_train_predictors.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1


In [37]:
train_df['Pclass'].value_counts() # asi vemos como esta dividida la informacion 

3    491
1    216
2    184
Name: Pclass, dtype: int64

**ALGORITMO**


In [39]:
y_target = train_df['Survived']
x_features_one = dummy_encoded_train_predictors.values

x_features_one

array([[ 3., 22.,  1., ...,  0.,  0.,  1.],
       [ 1., 38.,  1., ...,  1.,  0.,  0.],
       [ 3., 26.,  0., ...,  0.,  0.,  1.],
       ...,
       [ 3., 28.,  1., ...,  0.,  0.,  1.],
       [ 1., 26.,  0., ...,  1.,  0.,  0.],
       [ 3., 32.,  0., ...,  0.,  1.,  0.]])

In [0]:
x_train , x_validation , y_train , y_validation = train_test_split(x_features_one , y_target , test_size = .25 , random_state = 1)

**MODELO**

In [0]:
tree_one = tree.DecisionTreeClassifier()  #  ESTE ES EL MODELO 

tree_one = tree_one.fit(x_features_one , y_target)

In [42]:
tree_one_accuracy = round(tree_one.score(x_features_one , y_target) , 4) # asi vemos que tan bien esta haciendo la prediccion el modelo 
print('Accuracy : %0.4f' %(tree_one_accuracy))

Accuracy : 0.9798
