# Titanic - Machine Learning from Disaster
Predict survival on the Titanic and get familiar with ML basics
https://www.kaggle.com/competitions/titanic/

My best score - 0.77751 (DasikB)

In [278]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [194]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [195]:
test_data = pd.read_csv("test.csv")
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [196]:
# Solution from Kaggle tutorial - score 0.77511

from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!


## Already used columns

In [197]:
train_data.groupby('Pclass')['Survived'].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [198]:
train_data.groupby('Sex')['Survived'].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [199]:
train_data.groupby('SibSp')['Survived'].mean()

SibSp
0    0.345395
1    0.535885
2    0.464286
3    0.250000
4    0.166667
5    0.000000
8    0.000000
Name: Survived, dtype: float64

In [200]:
train_data.groupby('Parch')['Survived'].mean()

Parch
0    0.343658
1    0.550847
2    0.500000
3    0.600000
4    0.000000
5    0.200000
6    0.000000
Name: Survived, dtype: float64

In [201]:
train_data.groupby('Parch')['Survived'].count()

Parch
0    678
1    118
2     80
3      5
4      4
5      5
6      1
Name: Survived, dtype: int64

## Fare column

In [202]:
print(f'Fare < 100:       {train_data[train_data.Fare < 100]["Survived"].mean():.2f}')
print(f'Fare = 100 - 200: {train_data[(train_data.Fare >= 100) & (train_data.Fare < 200)]["Survived"].mean():.2f}')
print(f'Fare = 200 - 300: {train_data[(train_data.Fare >= 200) & (train_data.Fare < 300)]["Survived"].mean():.2f}')
print(f'Fare = 300 - 400: {train_data[(train_data.Fare >= 300) & (train_data.Age < 400)]["Survived"].mean():.2f}')
print(f'Fare > 400:       {train_data[train_data.Fare >= 400]["Survived"].mean():.2f}')

Fare < 100:       0.36
Fare = 100 - 200: 0.76
Fare = 200 - 300: 0.65
Fare = 300 - 400: 1.00
Fare > 400:       1.00


In [203]:
train_data[train_data.Fare.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [204]:
test_data[test_data.Fare.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [205]:
test_data.groupby('Pclass')['Fare'].median()

Pclass
1    60.0000
2    15.7500
3     7.8958
Name: Fare, dtype: float64

In [206]:
test_data.Fare = test_data.Fare.fillna(value=7.8958)
test_data[test_data.Fare.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


## Title column

In [207]:
def get_title(name):
    mylist = name.split()
    title = ''
    for i in range(0,len(mylist)):
        if mylist[i][-1] == ",":
            title = mylist[i+1]
    return title

train_data['Title'] = train_data['Name'].apply(get_title)

test_data['Title'] = test_data['Name'].apply(get_title)

test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs.


In [208]:
train_data.groupby('Title').count()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Capt.,1,1,1,1,1,1,1,1,1,1,1,1
Col.,2,2,2,2,2,2,2,2,2,2,1,2
Don.,1,1,1,1,1,1,1,1,1,1,0,1
Dr.,7,7,7,7,7,6,7,7,7,7,3,7
Jonkheer.,1,1,1,1,1,1,1,1,1,1,0,1
Lady.,1,1,1,1,1,1,1,1,1,1,1,1
Major.,2,2,2,2,2,2,2,2,2,2,2,2
Master.,40,40,40,40,40,36,40,40,40,40,7,40
Miss.,182,182,182,182,182,146,182,182,182,182,47,181
Mlle.,2,2,2,2,2,2,2,2,2,2,2,2


In [209]:
test_data.groupby('Title').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Col.,2,2,2,2,2,2,2,2,2,2,2
Dona.,1,1,1,1,1,1,1,1,1,1,1
Dr.,1,1,1,1,1,1,1,1,1,1,1
Master.,21,21,21,21,17,21,21,21,21,2,21
Miss.,78,78,78,78,64,78,78,78,78,11,78
Mr.,240,240,240,240,183,240,240,240,240,42,240
Mrs.,72,72,72,72,62,72,72,72,72,32,72
Ms.,1,1,1,1,0,1,1,1,1,0,1
Rev.,2,2,2,2,2,2,2,2,2,0,2


In [210]:
train_data.groupby('Title')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Capt.,0.0
Col.,0.5
Don.,0.0
Dr.,0.428571
Jonkheer.,0.0
Lady.,1.0
Major.,0.5
Master.,0.575
Miss.,0.697802
Mlle.,1.0


In [211]:
def simplification(title):
    if title == 'Capt.' or title == 'Dona.':
        return 'Don.'
    elif title == 'Jonkheer.' or title == 'Sir.' or title == 'the':
        return 'Mr.'
    elif title == 'Lady.':
        return 'Mrs.'
    elif title == 'Major.':
        return 'Col.'
    elif title == 'Mlle.' or title == 'Mme.':
        return 'Miss.'
    else:
        return title

train_data['Title'] = train_data['Title'].apply(simplification)
test_data['Title'] = test_data['Title'].apply(simplification)

train_data.groupby('Title')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Col.,0.5
Don.,0.0
Dr.,0.428571
Master.,0.575
Miss.,0.702703
Mr.,0.159615
Mrs.,0.793651
Ms.,1.0
Rev.,0.0


## Age column

In [212]:
print(f'Age < 10:      {train_data[train_data.Age < 10]["Survived"].mean():.2f}')
print(f'Age = 10 - 18: {train_data[(train_data.Age >= 10) & (train_data.Age < 18)]["Survived"].mean():.2f}')
print(f'Age = 18 - 30: {train_data[(train_data.Age >= 18) & (train_data.Age < 30)]["Survived"].mean():.2f}')
print(f'Age = 30 - 40: {train_data[(train_data.Age >= 30) & (train_data.Age < 40)]["Survived"].mean():.2f}')
print(f'Age = 40 - 50: {train_data[(train_data.Age >= 40) & (train_data.Age < 50)]["Survived"].mean():.2f}')
print(f'Age > 50:      {train_data[train_data.Age >= 50]["Survived"].mean():.2f}')

Age < 10:      0.61
Age = 10 - 18: 0.45
Age = 18 - 30: 0.35
Age = 30 - 40: 0.44
Age = 40 - 50: 0.38
Age > 50:      0.36


In [213]:
test_data[test_data.Age.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
10,902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S,Mr.
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S,Mrs.
29,921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C,Mr.
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.4500,,S,Mrs.
36,928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S,Miss.
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q,Miss.
410,1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q,Miss.
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Mr.
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Mr.


In [214]:
train_data[train_data.Age.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,Mr.
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,Mrs.
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,Mr.
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,Miss.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,Mr.
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,Miss.
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,Mr.
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,Mr.


In [215]:
train_mean = train_data.groupby('Sex')['Age'].mean()
train_median = train_data.groupby('Sex')['Age'].median()

print(f'Train data mean: {train_data["Age"].mean()}')
print(train_mean)
print(f'\nTrain data median: {train_data["Age"].median()}')
print(train_median)

# -> not a big difference between age of women and men

Train data mean: 29.69911764705882
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

Train data median: 28.0
Sex
female    27.0
male      29.0
Name: Age, dtype: float64


In [216]:
train_mean = train_data.groupby('Pclass')['Age'].mean()
train_median = train_data.groupby('Pclass')['Age'].median()

print(f'Train data mean: {train_data["Age"].mean()}')
print(train_mean)
print(f'\nTrain data median: {train_data["Age"].median()}')
print(train_median)

# -> bigger difference among classes

Train data mean: 29.69911764705882
Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

Train data median: 28.0
Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64


In [217]:
train_mean = train_data.groupby('Title')['Age'].mean()
train_median = train_data.groupby('Title')['Age'].median()

print(f'Train data mean: {train_data["Age"].mean()}')
print(train_mean)
print(f'\nTrain data median: {train_data["Age"].median()}')
print(train_median)

# -> significant difference among titles

Train data mean: 29.69911764705882
Title
Col.       53.250000
Don.       55.000000
Dr.        42.000000
Master.     4.574167
Miss.      21.818792
Mr.        32.425187
Mrs.       36.009174
Ms.        28.000000
Rev.       43.166667
Name: Age, dtype: float64

Train data median: 28.0
Title
Col.       54.0
Don.       55.0
Dr.        46.5
Master.     3.5
Miss.      21.0
Mr.        30.0
Mrs.       35.0
Ms.        28.0
Rev.       46.5
Name: Age, dtype: float64


In [218]:
train_data['Age'] = train_data.groupby('Title')['Age'].apply(lambda x: x.fillna(x.median()))
test_data['Age'] = test_data.groupby('Title')['Age'].apply(lambda x: x.fillna(x.median()))

# tried both strategies - Age n/a values filled according to median Class and Title values 

In [219]:
test_data[test_data.Age.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
88,980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q,Ms.


In [220]:
test_data['Age'] = test_data['Age'].fillna(28)

In [221]:
test_data[test_data.Age.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title


## Accuracy testing

In [222]:
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

print(train_X.shape)
print(val_X.shape)

(712, 5)
(179, 5)


In [223]:
train_X.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
301,3,2,0,0,1
309,1,0,0,1,0
516,2,0,0,1,0
120,2,2,0,0,1
570,2,0,0,0,1


In [224]:
def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [2,3,4,5,6,7,8,9,10]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))
    
# -> max depth 3 better that 5

Max depth: 2  		 Score:  78
Max depth: 3  		 Score:  78
Max depth: 4  		 Score:  77
Max depth: 5  		 Score:  75
Max depth: 6  		 Score:  77
Max depth: 7  		 Score:  76
Max depth: 8  		 Score:  75
Max depth: 9  		 Score:  75
Max depth: 10  		 Score:  75


In [254]:
# max_depth = 3: 
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# Kaggle score 0.77751 (the best result)

Your submission was successfully saved!


In [225]:
# All columns in features:
 
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age", "Title"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [2,3,4,5,6,7,8,9,10,11,12,13,14,15]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))

Max depth: 2  		 Score:  78
Max depth: 3  		 Score:  79
Max depth: 4  		 Score:  80
Max depth: 5  		 Score:  80
Max depth: 6  		 Score:  78
Max depth: 7  		 Score:  78
Max depth: 8  		 Score:  78
Max depth: 9  		 Score:  78
Max depth: 10  		 Score:  81
Max depth: 11  		 Score:  80
Max depth: 12  		 Score:  79
Max depth: 13  		 Score:  77
Max depth: 14  		 Score:  76
Max depth: 15  		 Score:  78


In [237]:
# different max_leaf_nodes, n_estimators tried:

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=1,max_leaf_nodes=max_leaf_nodes)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_leaf_nodes_list = [5,25,50,100,250,500,1000,5000,10000]
for max_leaf_nodes in max_leaf_nodes_list:
    my_score = get_score(max_leaf_nodes,train_X, val_X, train_y, val_y)
    print("Max_leaf_nodes: %d  \t\t Score:  %d" %(max_leaf_nodes, my_score*100))

Max_leaf_nodes: 5  		 Score:  79
Max_leaf_nodes: 25  		 Score:  81
Max_leaf_nodes: 50  		 Score:  81
Max_leaf_nodes: 100  		 Score:  81
Max_leaf_nodes: 250  		 Score:  81
Max_leaf_nodes: 500  		 Score:  81
Max_leaf_nodes: 1000  		 Score:  81
Max_leaf_nodes: 5000  		 Score:  81
Max_leaf_nodes: 10000  		 Score:  81


In [238]:
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=1, max_leaf_nodes = 50)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# max_leaf_nodes 50, max_depth = 4: Kaggle Score = 0.77272
# max_leaf_nodes 50, max_depth = 4, n_estimators = 200: Kaggle Score = 0.77511
# max_leaf_nodes 50, max_depth = 4, n_estimators = 400: Kaggle Score = 0.77511
# max_leaf_nodes 100, max_depth = 4, n_estimators = 400: Kaggle Score = 0.77511
# max_leaf_nodes 50, max_depth = 3, n_estimators = 200: Kaggle Score = 0.77511
# max_leaf_nodes 100, max_depth = 10: Kaggle Score = 0.76555
# max_leaf_nodes 50, max_depth = 10: Kaggle Score = 0.77033
# max_leaf_nodes 50, max_depth = 8: Kaggle Score = 0.76555

Your submission was successfully saved!


In [240]:
# Chosen columns in features:
 
features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=1)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [2,3,4,5,6,7,8,9,10]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))

Max depth: 2  		 Score:  77
Max depth: 3  		 Score:  80
Max depth: 4  		 Score:  79
Max depth: 5  		 Score:  79
Max depth: 6  		 Score:  81
Max depth: 7  		 Score:  81
Max depth: 8  		 Score:  84
Max depth: 9  		 Score:  83
Max depth: 10  		 Score:  82


In [243]:
# max_leaf_nodes

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1,max_leaf_nodes=max_leaf_nodes)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_leaf_nodes_list = [5,25,50,100,250,500,1000,5000,10000]
for max_leaf_nodes in max_leaf_nodes_list:
    my_score = get_score(max_leaf_nodes,train_X, val_X, train_y, val_y)
    print("Max_leaf_nodes: %d  \t\t Score:  %d" %(max_leaf_nodes, my_score*100))

Max_leaf_nodes: 5  		 Score:  79
Max_leaf_nodes: 25  		 Score:  83
Max_leaf_nodes: 50  		 Score:  84
Max_leaf_nodes: 100  		 Score:  84
Max_leaf_nodes: 250  		 Score:  84
Max_leaf_nodes: 500  		 Score:  84
Max_leaf_nodes: 1000  		 Score:  84
Max_leaf_nodes: 5000  		 Score:  84
Max_leaf_nodes: 10000  		 Score:  84


In [246]:
y = train_data["Survived"]

features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=1, max_leaf_nodes = 50)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# n_estimators=100, max_depth=8, random_state=1, max_leaf_nodes = 50: Kaggle Score = 0.77033
# n_estimators=200, max_depth=4, random_state=1, max_leaf_nodes = 50: Kaggle Score = 0.77511
    # -> better accuracy score does not lead to better Kaggle score

Your submission was successfully saved!


Best accuracy score on validation data, but worse kaggle score on test data - why?
- Accuracy score is not a good choice in this case?
- Overfitting?
- Some other reason?

## Classification reports

In [248]:
# Kaggle score 0.77511
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(classification_report(val_y,predictions))

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       106
           1       0.74      0.63      0.68        73

    accuracy                           0.76       179
   macro avg       0.76      0.74      0.74       179
weighted avg       0.76      0.76      0.76       179



In [249]:
# conditions with improved Kaggle score - 0.77751

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(classification_report(val_y,predictions))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       106
           1       0.80      0.62      0.70        73

    accuracy                           0.78       179
   macro avg       0.79      0.76      0.76       179
weighted avg       0.79      0.78      0.78       179



In [253]:
# all columns - Kaggle score 0.77511
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


model = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=1, max_leaf_nodes=50)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(classification_report(val_y,predictions))

# -> slightly better scores in classification report, but worse Kaggle score

              precision    recall  f1-score   support

           0       0.80      0.90      0.84       106
           1       0.82      0.67      0.74        73

    accuracy                           0.80       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.80      0.80       179



## F1 score

In [257]:
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(f1_score(val_y,predictions))

0.6976744186046512


In [255]:
# all features
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(f1_score(val_y,predictions))

0.7352941176470589


In [256]:
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42, test_size=0.2)


model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42, max_leaf_nodes = 50)
model.fit(train_X, train_y)
predictions = model.predict(val_X)
print(f1_score(val_y,predictions))

0.7832167832167832


In [162]:
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42, max_leaf_nodes = 50)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# kaggle score - 0.77511

Your submission was successfully saved!


## Further classifiers

In [71]:
# DecisionTreeClassifier

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=1, max_leaf_nodes=10)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [2,3,4,5,6,7,8,9,10]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))

Max depth: 2  		 Score:  72
Max depth: 3  		 Score:  80
Max depth: 4  		 Score:  77
Max depth: 5  		 Score:  78
Max depth: 6  		 Score:  78
Max depth: 7  		 Score:  78
Max depth: 8  		 Score:  78
Max depth: 9  		 Score:  78
Max depth: 10  		 Score:  78


In [53]:
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Title","Fare"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = DecisionTreeClassifier(max_depth=6, random_state=1, max_leaf_nodes=10)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# Score = 0.76555 - lower than RandomForestClassifier

Your submission was successfully saved!


In [258]:
# Gradient Boosting Classifier

features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = GradientBoostingClassifier(max_depth=max_depth, random_state=1, max_leaf_nodes = 15)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [1,2,3,4,5,6,7,8,9,10]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))

Max depth: 1  		 Score:  77
Max depth: 2  		 Score:  81
Max depth: 3  		 Score:  79
Max depth: 4  		 Score:  81
Max depth: 5  		 Score:  79
Max depth: 6  		 Score:  81
Max depth: 7  		 Score:  81
Max depth: 8  		 Score:  81
Max depth: 9  		 Score:  80
Max depth: 10  		 Score:  78


In [267]:
# + Parch

features = ["Pclass", "Sex", "Fare", "Age", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data["Survived"]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

def get_score(max_depth,train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(n_estimators=100, max_depth=max_depth, max_leaf_nodes = 100)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    score = accuracy_score(val_y, predictions)
    return score

max_depth_list = [2,3,4,5,6,7,8,9,10,11,12,13,14,15]
for max_depth in max_depth_list:
    my_score = get_score(max_depth,train_X, val_X, train_y, val_y)
    print("Max depth: %d  \t\t Score:  %d" %(max_depth, my_score*100))
    


Max depth: 2  		 Score:  77
Max depth: 3  		 Score:  79
Max depth: 4  		 Score:  79
Max depth: 5  		 Score:  81
Max depth: 6  		 Score:  81
Max depth: 7  		 Score:  79
Max depth: 8  		 Score:  79
Max depth: 9  		 Score:  83
Max depth: 10  		 Score:  81
Max depth: 11  		 Score:  79
Max depth: 12  		 Score:  79
Max depth: 13  		 Score:  79
Max depth: 14  		 Score:  81
Max depth: 15  		 Score:  81


In [259]:
y = train_data["Survived"]

features = ["Pclass", "Sex", "Fare", "Age", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth = 9, random_state=1,max_leaf_nodes = 100)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# kaggle score - 0.77033

Your submission was successfully saved!


In [72]:
# SVC model

y = train_data["Survived"]

features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = SVC(C=4)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# kaggle score: 0.66985

Your submission was successfully saved!
