In [1]:
# import of libraries
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# read train data
titanic_data = pd.read_csv(r'datasets\train.csv')
titanic_data.head()
# read test data
titanic_data_test = pd.read_csv(r'datasets\test.csv')
titanic_data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# chech missed values
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# drop some useless data, X is an input parameters
X = titanic_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin', 'Fare'], axis=1)
X_final_test = titanic_data_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'], axis=1)
# replace categorial variables by numbers
X = pd.get_dummies(X)
X_final_test = pd.get_dummies(X_final_test)
# fill missed age values by median
X = X.fillna({'Age' : X.Age.median()})
X_final_test = X_final_test.fillna({'Age' : X_final_test.Age.median()})
X.head()
X_final_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,0,1,0,1,0
1,3,47.0,1,0,1,0,0,0,1
2,2,62.0,0,0,0,1,0,1,0
3,3,27.0,0,0,0,1,0,0,1
4,3,22.0,1,1,1,0,0,0,1


In [5]:
# y is an output
y = titanic_data['Survived']
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
#split train dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [7]:
# classifier
clf = tree.DecisionTreeClassifier()
# search ob best parameters
parameters = {'max_depth':range(1, 11), 'min_samples_split':range(2, 11), 'min_samples_leaf':range(1, 11),
             'criterion':['entropy', 'gini']}
search = GridSearchCV(clf, parameters, cv=5)
search.fit(X_train, y_train)
best_clf = search.best_estimator_

In [8]:
# accuracy
print('train data accuracy: ', best_clf.score(X_train, y_train))
print('test data accuracy: ', best_clf.score(X_test, y_test))

train data accuracy:  0.8907185628742516
test data accuracy:  0.8071748878923767


In [9]:
# precision, recall
predictions = best_clf.predict(X_test)
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
print('precision: ', precision, '\nrecall: ', recall)

precision:  0.8071748878923767 
recall:  0.8071748878923767


In [10]:
#X_final_test.isnull().sum()
final_predictions = best_clf.predict(X_final_test)