## Import necessary modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
%matplotlib inline

## Read and understand the training dataset

In [None]:
# Load the Titanic training set from the competition input directory.
traindf = pd.read_csv(r'../input/titanic/train.csv')

# Per-column count of missing entries.
null_values = traindf.isna().sum()

# Print each overview table under its own heading, in a fixed order.
overview = (
    ('Check for null Values :\n', null_values),
    ('Check basic statistics: \n', traindf.describe(include='all')),
    ('Check number of unique entries: \n', traindf.nunique()),
    ('Check for Data Types: \n', traindf.dtypes),
)
for heading, table in overview:
    print(heading, table)

## Data Cleaning
- Dropping unnecessary columns
- Dropping rows with a missing `Embarked` value
- Assigning integers to categorical variables

In [None]:
# Clean the training frame in a single chain:
#   1. remove the columns that are not used as features,
#   2. discard rows that have no port of embarkation,
#   3. encode the two categorical columns as integers.
traindf = (
    traindf
    .drop(columns=['Cabin', 'Name', 'PassengerId', 'Ticket', 'SibSp', 'Parch', 'Age'])
    .dropna(subset=['Embarked'])
    .assign(
        Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}),
        Embarked=lambda frame: frame['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
    )
)
traindf

## Create a correlation matrix to check dependencies

In [None]:
# Pairwise correlation of the columns of the cleaned training frame.
correlation_matrix = traindf.corr()

# Rank every column by its correlation with the target, largest value first.
survival_correlations = correlation_matrix.loc[:, 'Survived']
survival_correlations.sort_values(ascending=False)

## Create a Decision Tree model

In [None]:
# Features are every column except the target; selecting by name (rather than by
# position) keeps this correct even if the column order changes.
featuresdf = traindf.drop(columns=['Survived'])
targetdf = traindf['Survived']
feature_names = list(featuresdf.columns)

# Fix the seed so the fitted tree is reproducible: scikit-learn randomly permutes
# the features at each split, so equally good splits can otherwise be resolved
# differently from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
model = clf.fit(featuresdf, targetdf)

plt.title('Decision Tree')
# `rotate` is no longer passed: it never affected the matplotlib rendering and was
# removed from plot_tree in scikit-learn 1.0, where it raises a TypeError.
# The trailing semicolon suppresses the list-of-annotations repr in the cell output.
tree.plot_tree(model, feature_names=feature_names, class_names=['Died', 'Survived'], filled=True, rounded=True,
               proportion=True);

## Get the testing dataset and clean it

In [None]:
# Keep the raw test frame: PassengerId is needed later to label the predictions.
titanictestdf = pd.read_csv(r'../input/titanic/test.csv')

# Remove the same unused columns that were removed from the training data.
testdf = titanictestdf.drop(columns=['Cabin', 'Name', 'PassengerId', 'Ticket', 'SibSp', 'Parch', 'Age'])

# Rows must NOT be dropped from the test set: the predictions are later written
# back next to every PassengerId in titanictestdf, so both frames need the same
# number of rows. A missing port of embarkation is filled with the most frequent
# one instead (this is a no-op when no value is missing).
testdf['Embarked'] = testdf['Embarked'].fillna(testdf['Embarked'].mode().iloc[0])

# Same integer encoding as used for the training data.
testdf['Sex'] = testdf['Sex'].map({'male': 0, 'female': 1})
testdf['Embarked'] = testdf['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
testdf

## Check for missing values

In [None]:
# Number of missing entries in each column of the cleaned test frame.
testdf.isna().sum()

## Get the details of the row with the missing value

In [None]:
# Show the test rows that have no Fare, selected directly with a boolean mask.
fare_is_missing = testdf['Fare'].isna()
testdf[fare_is_missing]

## Get all the information on index 152

In [None]:
# Look up the full, uncleaned record stored under index label 152.
row_label = 152
titanictestdf.loc[row_label]

## Check whether Pclass can be used to estimate the missing Fare

In [None]:
# Distribution of fares within each ticket class.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
# The trailing semicolon hides the Axes repr in the cell output.
sns.swarmplot(data=traindf, x='Pclass', y='Fare');


## Get the average of fares in each ticket class (in both training and testing datasets)

In [None]:
# Mean fare per ticket class in the training data.
traindf.groupby('Pclass')['Fare'].mean()

In [None]:
# Mean fare per ticket class in the test data, using only rows without missing values.
complete_test_rows = testdf.dropna()
complete_test_rows.groupby('Pclass')['Fare'].mean()

## Assign the mean of the two 3rd-class average fares (training and testing) to the missing value

In [None]:
# Row 152 is the row being imputed. It is excluded explicitly from the test-set
# average so that re-running this cell does not fold the previously imputed value
# back into the mean (which would make the result drift on every execution).
third_class_fare_test = testdf.drop(index=152).dropna().groupby('Pclass')['Fare'].mean().loc[3]
third_class_fare_train = traindf.dropna().groupby('Pclass')['Fare'].mean().loc[3]

# Impute with the mean of the two 3rd-class average fares, then display the value.
testdf.loc[152, 'Fare'] = np.mean([third_class_fare_test, third_class_fare_train])
testdf.loc[152, 'Fare']

## Predict the survival using created model

In [None]:
# Run the fitted tree on the cleaned test features and display the predicted labels.
predicted_labels = model.predict(testdf)
predicted_labels

## Store the predictions in a Survived column, indexed by PassengerId, in a new DataFrame

In [None]:
# Pair each PassengerId from the raw test file with its predicted label and use
# the id as the index of the resulting frame.
survival_prediction = (
    titanictestdf[['PassengerId']]
    .assign(Survived=model.predict(testdf))
    .set_index('PassengerId')
)
survival_prediction

## Save the result

In [None]:
# Write the predictions to disk; the PassengerId index is written as the first column.
survival_prediction.to_csv('survival_prediction.csv', index=True)

***The model turns out to be 77.99% accurate***