In [165]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [166]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [167]:
# create a set titles
titles = set()

# iterate over the Name column
for name in  train['Name']:
    
    # split the value at every row on (','), ('.'), then delete the rest
    titles.add(name.split(',')[1].split('.')[0].strip())
print(titles)

{'Major', 'Capt', 'Master', 'Mlle', 'Sir', 'Lady', 'Col', 'the Countess', 'Jonkheer', 'Ms', 'Miss', 'Don', 'Dr', 'Mrs', 'Rev', 'Mr', 'Mme'}


In [168]:
# initialize a dictionary Title_Dictionary 
Title_Dictionary = {"Capt": "Officer",
                    "Col": "Officer",
                    "Major": "Officer",
                    "Jonkheer": "Royalty",
                    "Don": "Royalty",
                    "Sir" : "Royalty",
                    "Dr": "Officer",
                    "Rev": "Officer",
                    "the Countess":"Royalty",
                    "Mme": "Mrs",
                    "Mlle": "Miss",
                    "Ms": "Mrs",
                    "Mr" : "Mr",
                    "Mrs" : "Mrs",
                    "Miss" : "Miss",
                    "Master" : "Master",
                    "Lady" : "Royalty"}

In [169]:
#  Put the titles into a new column Title
train['Title'] = train['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())

# Map the different values of Title_Dictionary to keys Officer, Royalty, Mrs, Mr, Miss, Master
train['Title'] = train.Title.map(Title_Dictionary)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [170]:
# df1 (train dataset) drop columns
df1=train.drop(['Name','Ticket','Cabin','PassengerId',], axis=1)
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,S,Mr
1,1,1,female,38.0,1,0,71.2833,C,Mrs
2,1,3,female,26.0,0,0,7.925,S,Miss
3,1,1,female,35.0,1,0,53.1,S,Mrs
4,0,3,male,35.0,0,0,8.05,S,Mr


In [171]:
# convert categorical variables into numericalvariables

df1.Sex=df1.Sex.map({'female':0, 'male':1})
df1.Embarked=df1.Embarked.map({'S':0, 'C':1, 'Q':2,'nan':'NaN'})
df1.Title=df1.Title.map({'Mr':0, 'Miss':1, 'Mrs':2,'Master':3,'Officer':4,'Royalty':5})
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,2
2,1,3,0,26.0,0,0,7.925,0,1
3,1,1,0,35.0,1,0,53.1,0,2
4,0,3,1,35.0,0,0,8.05,0,0


In [172]:
# compute median for men and women
median_age_men=df1[df1['Sex']==1]['Age'].median()     // 29
median_age_women=df1[df1['Sex']==0]['Age'].median()   // 27

In [173]:
# impute the medians into missing values 
df1.loc[(df1.Age.isnull()) & (df1['Sex']==0),'Age']=median_age_women
df1.loc[(df1.Age.isnull()) & (df1['Sex']==1),'Age']=median_age_men

In [174]:
# drop all null values
df1.dropna(inplace=True)

In [175]:
df1.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64

In [176]:
# normalizing or rescaling the Age and Fare columns to fall between 0 and 1
# this is used commonly with ML models to improve their performance

# normalizing the age column
df1.Age = (df1.Age-min(df1.Age))/(max(df1.Age)-min(df1.Age))

# normalizing the fare column
df1.Fare = (df1.Fare-min(df1.Fare))/(max(df1.Fare)-min(df1.Fare))

# Training the model

In [177]:
# split the datasets
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(['Survived'], axis=1),
    df1.Survived,
    test_size= 0.2,
    random_state=0,
    stratify=df1.Survived
)

In [178]:
# Decision tree with entropy
clf_entropy = DecisionTreeClassifier(
    criterion = "entropy", random_state = 100,
    max_depth = 3, min_samples_leaf = 5)

clf_entropy.fit(X_train, y_train)

Y_pred = clf_entropy.predict(X_test)

accuracy_score(y_test, Y_pred)

0.848314606741573

In [179]:
cm=confusion_matrix(y_test, Y_pred)
cm

array([[98, 12],
       [15, 53]], dtype=int64)

In [180]:
# convert titles into a set
titles = set()

# for loop splitting the Name column in the test dataset
for name in test['Name']:
    titles.add(name.split(',')[1].split('.')[0].strip())
print(titles)

{'Master', 'Col', 'Dona', 'Ms', 'Miss', 'Dr', 'Mrs', 'Rev', 'Mr'}


In [181]:
# using the same dictionary as with the train dataset
# mapping the titles into 
test['Title'] = test['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
test['Title'] = test.Title.map(Title_Dictionary)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [182]:
# drop columns
df2=test.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [183]:
# convert categorical variables into numericalvariables
df2.Sex=df2.Sex.map({'female':0, 'male':1})
df2.Embarked=df2.Embarked.map({'S':0, 'C':1, 'Q':2,'nan':'nan'})
df2.Title=df2.Title.map({'Mr':0, 'Miss':1, 'Mrs':2,'Master':3,'Officer':4,'Royalty':5})
df2.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,1,34.5,0,0,7.8292,2,0.0
1,3,0,47.0,1,0,7.0,0,2.0
2,2,1,62.0,0,0,9.6875,2,0.0
3,3,1,27.0,0,0,8.6625,0,0.0
4,3,0,22.0,1,1,12.2875,0,2.0


In [184]:
# compute medians for men and women 
median_age_men2=df2[df2['Sex']==1]['Age'].median()    // 27
median_age_women2=df2[df2['Sex']==0]['Age'].median()  // 27

In [185]:
# impute the medians where age == 0
df2.loc[(df2.Age.isnull()) & (df2['Sex']==0),'Age']=median_age_women2
df2.loc[(df2.Age.isnull()) & (df2['Sex']==1),'Age']=median_age_men2

In [186]:
# compute median for Fare column and impute in that one spot
df2['Fare']=df2['Fare'].fillna(df2['Fare'].median())

In [187]:
df2.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       1
dtype: int64

In [188]:
# filling out the one spot where Title is not specified with 2 (Mrs)
df2=df2.fillna(2)

In [189]:
df2.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64

In [190]:
# normalizing or rescaling the Age and Fare columns to fall between 0 and 1
# this is used commonly with ML models to improve their performance

# normalizing the age column
df2.Age = (df2.Age-min(df2.Age))/(max(df2.Age)-min(df2.Age))

# normalizing the fare column
df2.Fare = (df2.Fare-min(df2.Fare))/(max(df2.Fare)-min(df2.Fare))

In [191]:
df2.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,1,0.452723,0,0,0.015282,2,0.0
1,3,0,0.617566,1,0,0.013663,0,2.0
2,2,1,0.815377,0,0,0.018909,2,0.0
3,3,1,0.353818,0,0,0.016908,0,0.0
4,3,0,0.287881,1,1,0.023984,0,2.0


In [192]:
pred = clf_entropy.predict(df2)

In [193]:
pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [194]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": pred
    })
# create a csv file 
submission.to_csv('decision-tree.csv', index=False)

In [195]:
pred_df = pd.read_csv('decision-tree.csv')

Official score received: 0.77511