In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
tit = pd.read_csv("titanic_train.csv")
tit.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We will first do some cleaning on our data and then implement both random forest as well as decision tree and see which one does better

In [3]:
tit2 = tit.copy()

In [4]:
del tit2["Ticket"]
del tit2["Cabin"]
del tit2["Name"]
del tit2["Fare"]
del tit2["PassengerId"]

In [5]:
tit2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [6]:
meanSur = tit2[tit2.Survived == 1].Age.mean()
meanSur

28.343689655172415

In [7]:
meanNSur = tit2[tit2.Survived == 0].Age.mean()
meanNSur

30.62617924528302

In [8]:
# Creating new age column where survived nan Ages will be replaced by mean survived age

tit2["age"] = np.where(tit2.Age.isnull() & tit2.Survived == 1, meanSur , tit2["Age"])
tit2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,age
0,0,3,male,22.0,1,0,S,22.0
1,1,1,female,38.0,1,0,C,38.0
2,1,3,female,26.0,0,0,S,26.0
3,1,1,female,35.0,1,0,S,35.0
4,0,3,male,35.0,0,0,S,35.0


In [9]:
# Now new age columns have nan for only not survived candidates

tit2.age.fillna(meanNSur,inplace = True)
tit2.age.isnull().sum()

0

In [10]:
del tit2["Age"]

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [12]:
tit2["surv"] = tit2["Survived"]
del tit2["Survived"]

In [13]:
tit2.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,age,surv
0,3,male,1,0,S,22.0,0
1,1,female,1,0,C,38.0,1
2,3,female,0,0,S,26.0,1
3,1,female,1,0,S,35.0,1
4,3,male,0,0,S,35.0,0


In [14]:
# removing string enteries

def Gender(str):
    if str == "male":
        return 0
    return 1

tit2["Gender"] = tit2.Sex.apply(Gender)
tit2.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,age,surv,Gender
0,3,male,1,0,S,22.0,0,0
1,1,female,1,0,C,38.0,1,1
2,3,female,0,0,S,26.0,1,1
3,1,female,1,0,S,35.0,1,1
4,3,male,0,0,S,35.0,0,0


In [15]:
del tit2["Sex"]

In [16]:
# Embarked strings should also be converted

def emb(str):
    if str == 'S':
        return 0
    elif str == 'C':
        return 1
    else:
        return 2
    
tit2["NewEmbark"] = tit2.Embarked.apply(emb)
tit2.head()

Unnamed: 0,Pclass,SibSp,Parch,Embarked,age,surv,Gender,NewEmbark
0,3,1,0,S,22.0,0,0,0
1,1,1,0,C,38.0,1,1,1
2,3,0,0,S,26.0,1,1,0
3,1,1,0,S,35.0,1,1,0
4,3,0,0,S,35.0,0,0,0


In [17]:
del tit2["Embarked"]

In [18]:
tit2.head()

Unnamed: 0,Pclass,SibSp,Parch,age,surv,Gender,NewEmbark
0,3,1,0,22.0,0,0,0
1,1,1,0,38.0,1,1,1
2,3,0,0,26.0,1,1,0
3,1,1,0,35.0,1,1,0
4,3,0,0,35.0,0,0,0


In [19]:
z = tit2.values
x = z[:,0:-1]
y = z[:,-1]

print(x[0])
print(y)
#x_train , x_test , y_train , y_test = train_test_split()

[  3.   1.   0.  22.   0.   0.]
[ 0.  1.  0.  0.  0.  2.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  2.  0.
  0.  1.  0.  0.  2.  0.  0.  0.  1.  0.  2.  0.  1.  1.  2.  0.  1.  0.
  1.  0.  0.  1.  0.  0.  1.  1.  2.  0.  2.  2.  1.  0.  0.  0.  1.  0.
  1.  0.  0.  1.  0.  0.  1.  2.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  2.  0.  1.  0.  0.  1.  0.  2.  0.  1.  0.  0.  0.  1.  0.  0.  1.
  2.  0.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  2.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  2.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  1.  0.  0.  1.  0.  0.
  0.  1.  0.  0.  0.  0.  2.  0.  2.  0.  0.  0.  0.  0.  1.  1.  2.  0.
  2.  0.  0.  0.  0.  1.  0.  0.  0.  1.  2.  1.  0.  0.  0.  0.  2.  1.
  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0

In [20]:
x_train, x_test , y_train, y_test = train_test_split(x,y,random_state = 1)
print(x_train.shape,x_test.shape,y_train.shape)

(668, 6) (223, 6) (668,)


In [21]:
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier(random_state = 1)

clf1.fit(x_train,y_train)
clf2.fit(x_test,y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [22]:
print("DT Score on Training data :",clf1.score(x_train,y_train))
print("RF Score on Training data :",clf2.score(x_train,y_train))

DT Score on Training data : 0.902694610778
RF Score on Training data : 0.705089820359


In [23]:
print("DT Score on Testing data :",clf1.score(x_test,y_test))
print("RF Score on Testing data :",clf2.score(x_test,y_test))

DT Score on Testing data : 0.681614349776
RF Score on Testing data : 0.874439461883


# Therefore it can be easily seen that DT performs well on Training data because it overfits it (most of the times), while RF does well better on testing data.


So, Rf is generally preffered over DT.