In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')

In [3]:
# Display the loaded frame as the cell's rich output.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Print per-column dtypes and non-null counts to spot incomplete columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


<h2>Missing values</h2>
First, we need to check whether there are any missing values.

In [5]:
# Count the missing entries in every column.
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

It looks like the Cabin feature is nearly useless because most of its values are missing (687 of 891). The Age column also has many missing values, but the situation is not as bad as with the Cabin column. If I want to use this feature, there is no option other than to fill in the missing values. I will use the median to do so. <br>

In [6]:
# Fill missing Age values with the column median.
# The result is assigned back to the column instead of calling
# `data['Age'].fillna(..., inplace=True)`: that chained in-place call acts on a
# temporary Series and leaves `data` unchanged when pandas Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill missing Embarked values with the most frequently occurring value.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Drop the Cabin and PassengerId features; `data` itself keeps both columns.
data_droped = data.drop(columns=['Cabin', 'PassengerId'])

In [7]:
# Display the frame after imputation and after dropping Cabin and PassengerId.
data_droped

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


<h2>Single variable visualization</h2>

In [8]:
# Pie chart of the Survived column, with the 0/1 codes shown as readable labels.
survival_counts = (
    data_droped['Survived']
    .replace({0: 'Dead', 1: 'Survived'})
    .value_counts()
)

fig = go.Figure(
    data=[
        go.Pie(
            labels=survival_counts.index,
            values=survival_counts.values,
            textinfo='label+percent',
        )
    ]
)

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Survival', template='plotly_dark')

In [9]:
# Pie chart of the Pclass column, with the 1/2/3 codes shown as class names.
class_names = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}
class_counts = data_droped['Pclass'].replace(class_names).value_counts()

fig = go.Figure(
    data=[
        go.Pie(
            labels=class_counts.index,
            values=class_counts.values,
            textinfo='label+percent',
        )
    ]
)

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Ticket class', template='plotly_dark')

In [10]:
# Pie chart of the Sex column, with capitalised labels.
gender_counts = (
    data_droped['Sex']
    .replace({'male': 'Male', 'female': 'Female'})
    .value_counts()
)

fig = go.Figure(
    data=[
        go.Pie(
            labels=gender_counts.index,
            values=gender_counts.values,
            textinfo='label+percent',
        )
    ]
)

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Gender', template='plotly_dark')

In [11]:
fig = go.Figure()

# Box plot of the Age column of data_droped.
age_box = go.Box(y=data_droped['Age'], name='Age')
fig = go.Figure(data=[age_box])

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Age box plot', template='plotly_dark')

This plot reveals that the Age column contains a lot of outliers, which in reality aren't that bad, mostly because it's entirely possible that a very old lady or a very young boy was on the ship. We definitely don't want to get rid of them. This is the reason why I chose the median instead of the mean to fill the missing values: the median is much more robust to outliers.

In [12]:
# Box plot of the Fare column of data_droped (before any outlier removal).
fare_box = go.Box(y=data_droped['Fare'], name='Passenger fare')
fig = go.Figure(data=[fare_box])

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Fare box plot', template='plotly_dark')

It looks like we need to do something about the outliers here, but first let's check whether they are some kind of mistake or whether someone really paid that much.

In [13]:
# Show the passengers whose fare exceeds 500.
extreme_fare_mask = data_droped['Fare'] > 500
data_droped.loc[extreme_fare_mask]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
258,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,C
679,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,C
737,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,C


If we search for the names of these people on Google, we can easily find that they really paid £512 for their tickets. Leaving them in can lead to confusion, which is why I will remove them and consider them separately.

In [14]:
# Remove the fare outliers: keep only rows whose Fare is below 500.
data_droped_out = data_droped.loc[data_droped['Fare'].lt(500)]

In [15]:
# Box plot of Fare again, this time on the frame with fares >= 500 removed.
trimmed_fare_box = go.Box(y=data_droped_out['Fare'], name='Passenger fare')
fig = go.Figure(data=[trimmed_fare_box])

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Fare box plot', template='plotly_dark')

It still doesn't look great, but it's better than before, so we will leave it like this.

<h2>Questions</h2>
Now it's time for a couple of interesting questions that we can answer using our data:
<ul>
<li>Does gender have a big impact on survival?</li>
<li>Were people saved by families?</li>
<li>How much money did people spend to participate in the cruise?</li>
</ul>

<h3> Does gender have a big impact on survival? </h3>

In [16]:
# Survival outcome (as readable labels) split by sex.
outcome_labels = {0: 'Dead', 1: 'Survived'}
is_female = data_droped_out['Sex'] == 'female'
is_male = data_droped_out['Sex'] == 'male'

data_female_toplot = data_droped_out.loc[is_female, 'Survived'].replace(outcome_labels)
data_male_toplot = data_droped_out.loc[is_male, 'Survived'].replace(outcome_labels)

# One histogram trace per sex, added in the order Female, Male.
fig = go.Figure()
for trace_name, outcomes, colour in (
    ('Female', data_female_toplot, '#EB89B5'),
    ('Male', data_male_toplot, '#330C73'),
):
    fig.add_trace(
        go.Histogram(x=outcomes, name=trace_name, marker_color=colour, opacity=0.75)
    )

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(title_text='Survivors by Gender', template='plotly_dark')

From the chart we can see that gender was an important factor in survival. It's clear that women were the first to enter the lifeboats.

<h3> Were people saved by families? </h3>
We want to check whether, when one family member survived, the rest of the family members were saved as well. I'm assuming that people with the same ticket are from the same family.

In [17]:
# Add family size feature: the sum of the SibSp and Parch counts
# (presumably siblings/spouses and parents/children aboard -- confirm against
# the dataset's data dictionary). The passenger is not counted, so 0 appears
# for rows where both counts are 0.
#
# data_droped_out is a boolean-filtered selection of data_droped, so setting a
# column on it directly raises pandas' SettingWithCopyWarning (hidden here by
# the global warnings filter). .assign() builds a new frame instead, and
# re-running the cell gives the same result.
data_droped_out = data_droped_out.assign(
    Family_size=data_droped_out['SibSp'] + data_droped_out['Parch']
)

In [18]:
# Display the outlier-free frame, now including the Family_size column.
data_droped_out

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family_size
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,0
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,0
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S,3
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,0


In [19]:
# Non-null counts per column for every (Family_size, Survived) combination.
family_survival_groups = data_droped_out.groupby(['Family_size', 'Survived'])
family_survival_groups.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
Family_size,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,374,374,374,374,374,374,374,374,374
0,1,161,161,161,161,161,161,161,161,161
1,0,72,72,72,72,72,72,72,72,72
1,1,88,88,88,88,88,88,88,88,88
2,0,43,43,43,43,43,43,43,43,43
2,1,59,59,59,59,59,59,59,59,59
3,0,8,8,8,8,8,8,8,8,8
3,1,21,21,21,21,21,21,21,21,21
4,0,12,12,12,12,12,12,12,12,12
4,1,3,3,3,3,3,3,3,3,3


In [20]:
# Restrict to passengers with Family_size > 0, then count rows per
# (Ticket, Survived) pair and show the first ten groups.
with_family = data_droped_out.loc[data_droped_out['Family_size'] > 0]
(
    with_family
    .groupby(['Ticket', 'Survived'])
    .count()
    .head(10)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family_size
Ticket,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
110413,0,1,1,1,1,1,1,1,1,1
110413,1,2,2,2,2,2,2,2,2,2
110813,1,1,1,1,1,1,1,1,1,1
111361,1,2,2,2,2,2,2,2,2,2
113503,0,1,1,1,1,1,1,1,1,1
113505,1,2,2,2,2,2,2,2,2,2
113509,0,1,1,1,1,1,1,1,1,1
113760,1,4,4,4,4,4,4,4,4,4
113773,0,1,1,1,1,1,1,1,1,1
113776,0,1,1,1,1,1,1,1,1,1


Maybe my guess that all family members share the same ticket is correct, but usually not every person from a family is listed in our dataset. This simply means that we don't know whether they survived or died, which is why our question cannot be answered using this dataset.

<h3>How much money did people spend to participate in the cruise?</h3>

In [21]:
# Histogram of the fares paid by all passengers in the outlier-free frame.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=data_droped_out['Fare'],
    # The trace covers every passenger, not only women; the previous
    # 'Female' name was a leftover from the gender histogram.
    name='Passenger fare',
    marker_color='#EB89B5',
    opacity=0.75
))

# update_layout returns the figure, so it is rendered as the cell output.
fig.update_layout(
    xaxis_title_text='Fare',
    yaxis_title_text='Number of people',
    title_text='Fare histogram',
    template='plotly_dark',
)

Most people paid a modest amount of £5 - £15. This suggests that most of the passengers travelled in third or second class. We could already see this in the earlier pie chart, which showed the number of people in each class.