In [None]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load train.csv from the data/ directory (path is relative to the working directory)
data = pd.read_csv('data/train.csv')

In [None]:
# Display the loaded DataFrame as the cell output
data

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts
data.info()

<h2>Missing values</h2>
First, we need to check whether there are any missing values.

In [None]:
# Number of missing entries in each column
data.isna().sum()

It looks like the Cabin feature is completely useless because almost every value is missing. The Age column also has many missing values, but not nearly as many as Cabin. If I want to use this feature, there is no other option than to fill in the missing values. I will use the median to do so. <br>

In [None]:
# Impute missing values. Each result is assigned back to its column instead of
# calling fillna(..., inplace=True) on `data[col]`: the in-place form acts on an
# intermediate Series and stops updating `data` under pandas Copy-on-Write
# (the warning pandas emits about this is hidden by the warnings filter).

# Fill Age column with median
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill Embarked with the most frequently occurring value
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Delete Cabin, PassengerId feature (returns a new frame; `data` keeps both columns)
data_droped = data.drop(['Cabin', 'PassengerId'], axis=1)

In [None]:
# Display the frame without the Cabin and PassengerId columns
data_droped

<h2>Single variable visualization</h2>

In [None]:
# Count passengers per outcome, with the 0/1 flag mapped to readable labels
survival_counts = data_droped['Survived'].replace({0: 'Dead', 1: 'Survived'}).value_counts()

# Pie chart showing each outcome's label and share
fig = go.Figure(data=[
    go.Pie(
        labels=survival_counts.index,
        values=survival_counts.values,
        textinfo='label+percent',
    )
])

fig.update_layout(title_text='Survival', template='plotly_dark')

In [None]:
# Count passengers per ticket class, with the numeric code mapped to a readable label
class_names = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}
class_counts = data_droped['Pclass'].replace(class_names).value_counts()

# Pie chart showing each class's label and share
fig = go.Figure(data=[
    go.Pie(
        labels=class_counts.index,
        values=class_counts.values,
        textinfo='label+percent',
    )
])

fig.update_layout(title_text='Ticket class', template='plotly_dark')

In [None]:
# Count passengers per sex, capitalising the labels for display
sex_counts = data_droped['Sex'].replace({'male': 'Male', 'female': 'Female'}).value_counts()

# Pie chart showing each sex's label and share
fig = go.Figure(data=[
    go.Pie(
        labels=sex_counts.index,
        values=sex_counts.values,
        textinfo='label+percent',
    )
])

fig.update_layout(title_text='Gender', template='plotly_dark')

In [None]:
# Box plot of the Age column
age_box = go.Box(y=data_droped['Age'], name='Age')

fig = go.Figure(data=[age_box])
fig.update_layout(title_text='Age box plot', template='plotly_dark')

This plot reveals that the Age column contains a lot of outliers, which in reality aren't that bad: it is entirely possible that a very old lady or a very young boy was on the ship. We definitely don't want to get rid of them. This is why I chose the median rather than the mean to fill in the missing values: the median is much more robust to outliers.

In [None]:
# Box plot of the Fare column (all passengers)
fare_box = go.Box(y=data_droped['Fare'], name='Passenger fare')

fig = go.Figure(data=[fare_box])
fig.update_layout(title_text='Fare box plot', template='plotly_dark')

It looks like we need to do something about the outliers here, but first let's check whether this is some kind of mistake or someone really paid that much.

In [None]:
# Rows whose fare exceeds 500
data_droped.loc[data_droped['Fare'] > 500]

If we search for the names of these people on Google, we can easily find that they really paid £512 for their tickets. Leaving them in could be confusing, which is why I will remove them and consider them separately.

In [None]:
# Delete outliers: keep only the rows whose fare is below 500.
# .copy() makes the result an independent frame, so adding a column to it later
# does not raise pandas' SettingWithCopyWarning.
data_droped_out = data_droped[data_droped['Fare'] < 500].copy()

In [None]:
# Box plot of the Fare column after the rows with fares of 500 or more were removed
fare_box = go.Box(y=data_droped_out['Fare'], name='Passenger fare')

fig = go.Figure(data=[fare_box])
fig.update_layout(title_text='Fare box plot', template='plotly_dark')

It still doesn't look great, but it's better than before, so we will leave it like this.

<h2>Questions</h2>
Now it's time for a couple of interesting questions that we can answer using our data:
<ul>
<li>Does gender have a big impact on survival?</li>
<li>Were people saved by families?</li>
<li>How much money did people spend to take part in the cruise?</li>
</ul>

<h3> Does gender have a big impact on survival? </h3>

In [None]:
# Survival outcome per sex, with the 0/1 flag mapped to readable labels
outcome_labels = {0: 'Dead', 1: 'Survived'}
data_female_toplot = data_droped_out.loc[data_droped_out['Sex'] == 'female', 'Survived'].replace(outcome_labels)
data_male_toplot = data_droped_out.loc[data_droped_out['Sex'] == 'male', 'Survived'].replace(outcome_labels)

# One histogram trace per sex: female first, then male
fig = go.Figure(data=[
    go.Histogram(
        x=data_female_toplot,
        name='Female',
        marker_color='#EB89B5',
        opacity=0.75,
    ),
    go.Histogram(
        x=data_male_toplot,
        name='Male',
        marker_color='#330C73',
        opacity=0.75,
    ),
])

fig.update_layout(title_text='Survivors by Gender', template='plotly_dark')

From the chart we can see that gender was an important factor in survival. It's clear that women were the first to enter the lifeboats.

<h3> Were people saved by families? </h3>
We want to check whether the survival of one family member means that the rest of the family was also saved. I'm assuming that people with the same ticket are from the same family.

In [None]:
# Add family size feature (SibSp + Parch).
# assign() returns a new frame instead of setting a column on the filtered
# selection, which avoids pandas' SettingWithCopyWarning and gives the same
# result every time the cell is re-run.
data_droped_out = data_droped_out.assign(
    Family_size=data_droped_out['SibSp'] + data_droped_out['Parch']
)

In [None]:
# Display the frame, which now includes the Family_size column
data_droped_out

In [None]:
# Non-null counts of every remaining column for each (Family_size, Survived) combination
data_droped_out.groupby(['Family_size', 'Survived']).count()

In [None]:
# Keep only the rows with Family_size above zero, then count rows per
# (Ticket, Survived) pair and show the first ten groups
with_family = data_droped_out[data_droped_out['Family_size'] > 0]
with_family.groupby(['Ticket', 'Survived']).count().head(10)

Maybe my guess that all family members share the same ticket is correct, but usually not every person from a family is listed in our dataset. This means that we don't know whether they survived or died, which is why our question cannot be answered using this dataset.

<h3>How much money did people spend to take part in the cruise?</h3>

In [None]:
# Histogram of the fares in the outlier-free frame.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=data_droped_out['Fare'],
    # The trace covers every row of the frame, not a single sex, so it is named
    # after the plotted column (it was previously mislabelled 'Female').
    name='Fare',
    marker_color='#EB89B5',
    opacity=0.75
))

fig.update_layout(
    xaxis_title_text='Fare',
    yaxis_title_text='Number of people',
    title_text='Fare histogram',
    template='plotly_dark',
)

Most people paid a modest amount of £5–£15. This suggests that most of the passengers travelled in third or second class. We could see this earlier in the pie chart showing the number of people in each class.