In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

train_data = pd.read_csv('train.csv')


In [4]:
def clean_data(data,columns):
    #First we remove the columns we don't need

    data_clean = data.drop(columns,axis = 1)
    return data_clean

columns = ['Ticket','Cabin','Embarked']

data_clean = clean_data(train_data,columns)
data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05


In [5]:
#Creating the median age for the unknown values 

pd.isnull(data_clean['Age']).sum() #The number of indidivuals with unknown age
data_clean['Age'].fillna(data_clean['Age'].median(),inplace= True) #The median age

In [6]:
pd.isnull(data_clean).sum() == 0 #All the values have been thus assigned

PassengerId    True
Survived       True
Pclass         True
Name           True
Sex            True
Age            True
SibSp          True
Parch          True
Fare           True
dtype: bool

In [7]:
replacement_values = {'male': 1, 
                      'female': 0}
data_clean['Sex'] = data_clean['Sex'].replace(replacement_values)

data_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05


In [8]:
titanic_model = DecisionTreeRegressor(random_state=1)

In [9]:
columns = ['Sex','Fare','Age']
X = data_clean[columns]
X.describe()
Y = data_clean['Survived']

titanic_model.fit(X,Y)

In [10]:
print('Survival prediction for the following:')
print(X.head())
titanic_model.predict(X.head(10))

Survival prediction for the following:
   Sex     Fare   Age
0    1   7.2500  22.0
1    0  71.2833  38.0
2    0   7.9250  26.0
3    0  53.1000  35.0
4    1   8.0500  35.0


array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

### Splitting the training data into a test set and training set

In [29]:
from sklearn.model_selection import train_test_split

#So in this particular case we're splitting up the training data 
#THis is important as we can train the model and then compare the results to the test half of the data 

train_X, val_X, train_Y, val_Y = train_test_split(X,Y,random_state=1, test_size=0.3, shuffle = True) #Split the model into Training data and test

### Using the split data to work out the perfect number of tree nodes! Finding sweet spot

In [30]:
def get_mae(max_leaf_nodes,train_X,val_X,train_Y,val_Y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state = 1)
    model.fit(train_X,train_Y)
    pred_val = model.predict(val_X)
    mae = mean_absolute_error(val_Y,pred_val)
    return mae

In [31]:
nodes = [5,25,50,100,150,200,300,400,500,1000]
mae_values = []
for node in nodes:
    mae_values.append(get_mae(node,train_X,val_X,train_Y,val_Y))

print(mae_values)

[0.3257355963483461, 0.2953307981116041, 0.2859791031444292, 0.256023420509042, 0.2561567164179105, 0.2561567164179105, 0.2561567164179105, 0.2561567164179105, 0.2561567164179105, 0.2561567164179105]


## Using random forests

In [32]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=1)

rf_model.fit(train_X,train_Y)

mean_diff = mean_absolute_error(rf_model.predict(val_X),val_Y)
print("The mean error for this Random Forest: {}".format(mean_diff))


The mean error for this Random Forest: 0.28598556863948654


### Applying this to the entire data set

In [33]:
'''
rf_model.fit(X,Y)

survival_predictions = rf_model.predict(X)
#We need to round the predicted_values to the nearest integer

survival_predictions = np.round(survival_predictions)

#I want to present the total values that this model predicts correctly!

def predict_values(true_values,predicted_values):
    correct = 0
    incorrect = 0
    for i in range(len(true_values)):
        if true_values[i] == predicted_values[i]:
         correct += 1
        else:
           incorrect += 1

    return correct, incorrect     



correct, incorrect = predict_values(Y,survival_predictions)
print("The model predicts the correct survival rate with accuracy: {}%".format(round(100* correct/len(Y),2)))
'''

#This is NOT GOOD

'\nrf_model.fit(X,Y)\n\nsurvival_predictions = rf_model.predict(X)\n#We need to round the predicted_values to the nearest integer\n\nsurvival_predictions = np.round(survival_predictions)\n\n#I want to present the total values that this model predicts correctly!\n\ndef predict_values(true_values,predicted_values):\n    correct = 0\n    incorrect = 0\n    for i in range(len(true_values)):\n        if true_values[i] == predicted_values[i]:\n         correct += 1\n        else:\n           incorrect += 1\n\n    return correct, incorrect     \n\n\n\ncorrect, incorrect = predict_values(Y,survival_predictions)\nprint("The model predicts the correct survival rate with accuracy: {}%".format(round(100* correct/len(Y),2)))\n'

In [34]:
from sklearn.metrics import accuracy_score
def predict_score(model, train_X, val_X, train_Y, val_Y):
    model.fit(train_X, train_Y)
    preds = model.predict(val_X)

    # Convert predictions to binary values (0 or 1)
    preds_binary = preds.round().astype(int)

    accuracy = accuracy_score(val_Y, preds_binary)
    return accuracy * 100


In [35]:
model_1 = RandomForestRegressor(n_estimators=50, random_state=1)
model_2 = RandomForestRegressor(n_estimators=200, random_state=1)
model_3 = RandomForestRegressor(n_estimators=100, random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

predict_score(model_1, train_X, val_X, train_Y, val_Y)

for model_type in models:
    print(predict_score(model_type, train_X, val_X, train_Y, val_Y))

    

74.25373134328358
75.74626865671642
74.6268656716418
76.49253731343283
77.61194029850746


## The test data!

In [37]:
test_data = pd.read_csv('test.csv')

In [38]:

test_data.drop(['Name','SibSp','Ticket','Cabin','Embarked'],axis = 1, inplace = True)
test_data.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,Parch,Fare
0,892,3,male,34.5,0,7.8292
1,893,3,female,47.0,0,7.0
2,894,2,male,62.0,0,9.6875
3,895,3,male,27.0,0,8.6625
4,896,3,female,22.0,1,12.2875


In [39]:

test_data['Sex'] = test_data['Sex'].replace(replacement_values)
test_data['Age'].fillna(test_data['Age'].median(),inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(),inplace = True)

# Now the data has been cleaned we can apply the random tree model

In [40]:

model_choice = model_5
y_predictions_test = model_choice.predict(test_data[columns]).round().astype(int)

In [41]:
resulting_data = test_data
resulting_data['Survived'] = y_predictions_test
resulting_data.drop(['Sex','Age','Parch','Fare','Pclass'],axis = 1, inplace= True)

In [42]:
resulting_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [43]:
submission = pd.DataFrame({
    'PassengerId': resulting_data['PassengerId'],
    'Survived': y_predictions_test
})



submission.to_csv('submission.csv', index=False)