In [354]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

train_data = pd.read_csv('train.csv')


In [355]:
def clean_data(data,columns):
    #First we remove the columns we don't need

    data_clean = data.drop(columns,axis = 1)
    return data_clean

columns = ['Ticket','Cabin','Embarked']

data_clean = clean_data(train_data,columns)
data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05


In [357]:
#Creating the median age for the unknown values 

pd.isnull(data_clean['Age']).sum() #The number of indidivuals with unknown age
data_clean['Age'].fillna(data_clean['Age'].median(),inplace= True) #The median age

In [358]:
pd.isnull(data_clean).sum() == 0 #All the values have been thus assigned

PassengerId    True
Survived       True
Pclass         True
Name           True
Sex            True
Age            True
SibSp          True
Parch          True
Fare           True
dtype: bool

In [359]:
replacement_values = {'male': 1, 
                      'female': 0}
data_clean['Sex'] = data_clean['Sex'].replace(replacement_values)

data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05


In [360]:
titanic_model = DecisionTreeRegressor(random_state=1)

In [361]:
columns = ['Sex','Fare','Age','Pclass']
X = data_clean[columns]
X.describe()
Y = data_clean['Survived']

titanic_model.fit(X,Y)

In [362]:
print('Survival prediction for the following:')
print(X.head())
titanic_model.predict(X.head(10))

Survival prediction for the following:
   Sex     Fare   Age  Pclass
0    1   7.2500  22.0       3
1    0  71.2833  38.0       1
2    0   7.9250  26.0       3
3    0  53.1000  35.0       1
4    1   8.0500  35.0       3


array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

### Splitting the training data into a test set and training set

In [363]:
from sklearn.model_selection import train_test_split

#So in this particular case we're splitting up the training data 
#THis is important as we can train the model and then compare the results to the test half of the data 

train_X, val_X, train_Y, val_Y = train_test_split(X,Y,random_state=1, test_size=0.25, shuffle = True) #Split the model into Training data and test

### Using the split data to work out the perfect number of tree nodes! Finding sweet spot

In [364]:
def get_mae(max_leaf_nodes,train_X,val_X,train_Y,val_Y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state = 1)
    model.fit(train_X,train_Y)
    pred_val = model.predict(val_X)
    mae = mean_absolute_error(val_Y,pred_val)
    return mae

In [365]:
nodes = [5,25,50,100,150,200,300,400,500,1000]
mae_values = []
for node in nodes:
    mae_values.append(get_mae(node,train_X,val_X,train_Y,val_Y))

print(mae_values)

[0.29673990728863237, 0.2503315924492668, 0.23150873618246434, 0.24253498675644924, 0.24170403587443948, 0.24170403587443948, 0.24170403587443948, 0.24170403587443948, 0.24170403587443948, 0.24170403587443948]


## Using random forests

In [366]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=1)

rf_model.fit(train_X,train_Y)

mean_diff = mean_absolute_error(rf_model.predict(val_X),val_Y)
print("The mean error for this Random Forest: {}".format(mean_diff))


The mean error for this Random Forest: 0.2509414969684028


### Applying this to the entire data set

In [367]:
rf_model.fit(X,Y)

survival_predictions = rf_model.predict(X)
#We need to round the predicted_values to the nearest integer

survival_predictions = np.round(survival_predictions)

#I want to present the total values that this model predicts correctly!

def predict_values(true_values,predicted_values):
    correct = 0
    incorrect = 0
    for i in range(len(true_values)):
        if true_values[i] == predicted_values[i]:
         correct += 1
        else:
           incorrect += 1

    return correct, incorrect     



correct, incorrect = predict_values(Y,survival_predictions)
print("The model predicts the correct survival rate with accuracy: {}%".format(round(100* correct/len(Y),2)))

The model predicts the correct survival rate with accuracy: 97.76%


## The test data!

In [346]:
test_data = pd.read_csv('test.csv')

In [347]:

test_data.drop(['Name','SibSp','Ticket','Cabin','Embarked'],axis = 1, inplace = True)
test_data.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,Parch,Fare
0,892,3,male,34.5,0,7.8292
1,893,3,female,47.0,0,7.0
2,894,2,male,62.0,0,9.6875
3,895,3,male,27.0,0,8.6625
4,896,3,female,22.0,1,12.2875


In [348]:

test_data['Sex'] = test_data['Sex'].replace(replacement_values)
test_data['Age'].fillna(test_data['Age'].median(),inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(),inplace = True)

# Now the data has been cleaned we can apply the random tree model

In [349]:
y_predictions_test = rf_model.predict(test_data[columns]).round()

In [350]:
resulting_data = test_data
resulting_data['Survived'] = y_predictions_test
resulting_data.drop(['Sex','Age','Parch','Fare','Pclass'],axis = 1, inplace= True)

In [351]:
resulting_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,1.0
3,895,1.0
4,896,0.0


In [352]:
resulting_data.head()

resulting_data.to_csv('prediction_1.csv', index = False)


In [353]:
resulting_data

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,1.0
3,895,1.0
4,896,0.0
...,...,...
413,1305,0.0
414,1306,1.0
415,1307,0.0
416,1308,0.0
