In [92]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [93]:
def clean_data(data,columns):

    data_clean = data.drop(columns,axis = 1)
    return data_clean

columns = ['Ticket','Cabin','Embarked']

data_clean = clean_data(train_data,columns)
data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05


In [94]:
#Creating the median age for the unknown values 

pd.isnull(data_clean['Age']).sum() #The number of indidivuals with unknown age
data_clean['Age'].fillna(data_clean['Age'].median(),inplace= True) #The median age

In [95]:
pd.isnull(data_clean).sum() == 0 #All the values have been thus assigned

PassengerId    True
Survived       True
Pclass         True
Name           True
Sex            True
Age            True
SibSp          True
Parch          True
Fare           True
dtype: bool

In [98]:
replacement_values = {'male': 1, 
                      'female': 0}
data_clean['Sex'] = data_clean['Sex'].replace(replacement_values)

data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05


In [99]:
titanic_model = DecisionTreeRegressor(random_state=1)

In [112]:
columns = ['Sex','Fare','Age']
X = data_clean[columns]
X.describe()
Y = data_clean['Survived']

titanic_model.fit(X,Y)

In [115]:
print('Survival prediction for the following:')
print(X.head())
titanic_model.predict(X.head(10))

Survival prediction for the following:
   Sex     Fare   Age
0    1   7.2500  22.0
1    0  71.2833  38.0
2    0   7.9250  26.0
3    0  53.1000  35.0
4    1   8.0500  35.0


array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

In [122]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_Y, val_Y = train_test_split(X,Y,random_state=1)

titanic_model_split = DecisionTreeRegressor(random_state= 2)

titanic_model_split.fit(train_X,train_Y)
titanic_model_split.predict(train_X.head())

array([0., 0., 1., 1., 0.])

In [117]:
from sklearn.metrics import mean_absolute_error

predicted_values = titanic_model.predict(X)
mean_absolute_error(Y,predicted_values)

0.026482558469090454

In [114]:
Women = train_data[train_data['Sex']=='female']["Survived"]
rate_women = sum(Women)/len(Women)

women_sentence = "Percentage of women who survived: {}".format(rate_women*100)
print(women_sentence)

Percentage of women who survived: 74.20382165605095


In [79]:
Men = train_data[train_data['Sex']=='male']['Survived']
rate_men = sum(Men)/len(Men)
men_sentence = "Percentage of men who survived: {}".format(rate_men*100)
print(men_sentence)

Percentage of men who survived: 18.890814558058924
