## Titanic Dataset

In [28]:
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn import tree
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
train_df = pd.read_csv("C:/Users/majay/Desktop/Python/titanic.csv")
test_df = pd.read_csv("C:/Users/majay/Desktop/Python/titanic_test.csv")

In [29]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
# Lets see the row and column count 

train_df.shape

(891, 12)

So we have  891 rows and 12 columns in our dataset

You can see 'sex' is an important column and is a categorical data. Hence we need to convert it to numerical since machines cant 
understand categorical data.

In [31]:
train_df["Sex"] = train_df["Sex"].apply(lambda Sex: 0 if Sex == 'male' else 1)

In [32]:
# Now lets check. now we can numerical data

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [33]:
y = targets = labels = train_df["Survived"].values  # output 

columns = ["Fare", "Pclass", "Sex", "Age", "SibSp"] # Input features
features = train_df[list(columns)].values
features

array([[  7.25  ,   3.    ,   0.    ,  22.    ,   1.    ],
       [ 71.2833,   1.    ,   1.    ,  38.    ,   1.    ],
       [  7.925 ,   3.    ,   1.    ,  26.    ,   0.    ],
       ..., 
       [ 23.45  ,   3.    ,   1.    ,      nan,   1.    ],
       [ 30.    ,   1.    ,   0.    ,  26.    ,   0.    ],
       [  7.75  ,   3.    ,   0.    ,  32.    ,   0.    ]])

In [34]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0) # Lets fill the missing values using imputer
X = imp.fit_transform(features)
X

array([[  7.25      ,   3.        ,   0.        ,  22.        ,   1.        ],
       [ 71.2833    ,   1.        ,   1.        ,  38.        ,   1.        ],
       [  7.925     ,   3.        ,   1.        ,  26.        ,   0.        ],
       ..., 
       [ 23.45      ,   3.        ,   1.        ,  29.69911765,   1.        ],
       [ 30.        ,   1.        ,   0.        ,  26.        ,   0.        ],
       [  7.75      ,   3.        ,   0.        ,  32.        ,   0.        ]])

In [35]:
my_tree_one = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
my_tree_one = my_tree_one.fit(X, y)

In [36]:
#The feature_importances_ attribute make it simple to interpret the significance of the predictors you include
print(my_tree_one.feature_importances_) 
print(my_tree_one.score(X, y))

[ 0.12330431  0.18665493  0.5670424   0.09423074  0.02876762]
0.822671156004


Now lets handle test data -

In [37]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [38]:
# Even here we convert categorical to numerical values
test_df["Sex"] = test_df["Sex"].apply(lambda Sex: 0 if Sex == 'male' else 1)

features_test = test_df[list(columns)].values
imp_test = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_test = imp_test.fit_transform(features_test)
X_test

array([[  7.8292    ,   3.        ,   0.        ,  34.5       ,   0.        ],
       [  7.        ,   3.        ,   1.        ,  47.        ,   1.        ],
       [  9.6875    ,   2.        ,   0.        ,  62.        ,   0.        ],
       ..., 
       [  7.25      ,   3.        ,   0.        ,  38.5       ,   0.        ],
       [  8.05      ,   3.        ,   0.        ,  30.27259036,   0.        ],
       [ 22.3583    ,   3.        ,   0.        ,  30.27259036,   1.        ]])

In [39]:
pred = my_tree_one.predict(X_test) # Lets predict the output using the input features

In [40]:
#Print Confusion matrix 
pred = my_tree_one.predict(X)
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion

array([[490,  59],
       [ 99, 243]], dtype=int64)

In [41]:
# Lets see the accuracy of our model.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y, pred)
print(accuracy)

0.822671156004


We see that there is still scope for improvement. 

Lets change the depth and consider the split.

In [42]:
#Setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(X, y)

#Print the score of the new decison tree
print(my_tree_two.score(X, y))

0.901234567901


In [43]:
pred = my_tree_two.predict(X)

In [44]:
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion

array([[533,  16],
       [ 72, 270]], dtype=int64)

In [27]:
# Lets see the accuracy of our model.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y, pred)
print(accuracy)

0.901234567901


We have made a significant improvement. 