In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [4]:
# Read the train and test CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')

# Work on a deep copy so that `train` keeps the data exactly as it was read.
df = train.copy(deep=True)

In [5]:
# Preview the first five rows of the training copy.
df.head(n=5)

In [6]:
# Preview the first five rows of the test split.
test.head(n=5)

In [7]:
# Report the (rows, columns) shape of both frames, train copy first.
for label, frame in (('shape of train data', df), ('shape of test data', test)):
    print(label, frame.shape)

In [8]:
# Print a concise summary of the training copy: column dtypes and non-null counts.
df.info()

1. There are 12 columns, one of which (Survived) is the target.
2. Some of the features have null values which need to be dealt with.
3. Some features are numeric while others are non-numeric.

In [38]:
# Flag rows that exactly repeat an earlier row, then report whether any exist.
duplicate_mask = df.duplicated()
duplicate_mask.any()

There are no duplicate rows

In [9]:
# Percentage of missing values per column, rounded to two decimals.
null_counts = df.isnull().sum()
null_counts.mul(100).div(len(df)).round(2)

About 20% of the values in the Age feature are null, almost 80% of the Cabin values are missing, and Embarked has a mere 0.22% of its values missing.

In [10]:
# Checking summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns of the training copy.
df.describe()

The Fare feature seems to be right-skewed, as its mean is more than double its median (the 50th percentile, shown in the `50%` row).

In [11]:
# Share of each target class (Survived = 0 / 1), expressed as a percentage
# rounded to two decimals.
survived_share = df['Survived'].value_counts(normalize=True)
d_tv = (100 * survived_share).round(2)
d_tv

In [12]:
# Bar chart of the target-class percentages computed in `d_tv`.
# Labels are set on the Axes returned by seaborn rather than via pyplot state.
target_ax = sns.barplot(x=d_tv.index, y=d_tv.values)
target_ax.set_xlabel('Survived')
target_ax.set_ylabel('Percentage Survived')
plt.show()

There is class imbalance in the dataset: passengers who did not survive make up approx. 62%, while survivors are only 38%.

In [13]:
# Separating numeric and categorical features.
# select_dtypes classifies columns by dtype family instead of comparing against
# the object dtype ('O'). The old `dtype != 'O'` test labelled every non-object
# column as numeric, which is wrong for category, datetime, or dedicated
# string dtypes. Column order is preserved in both lists.
df_num = df.select_dtypes(include='number').columns.tolist()
# Everything that is not numeric (strings, and also bool/category if present)
# is treated as categorical.
df_cat = df.select_dtypes(exclude='number').columns.tolist()

In [14]:
# Print how many columns fell into each group, numeric first.
feature_groups = (
    ('Number of numeric features: ', df_num),
    ('Number of categorical features: ', df_cat),
)
for label, features in feature_groups:
    print(label, len(features))

In [15]:
# Print the column names in each group, numeric first.
for label, features in (('Numeric features: ', df_num), ('Categorical features', df_cat)):
    print(label, features)

In [16]:
# Draw one distribution plot per numeric column; plt.show() flushes each
# figure so the plots render one after another.
for num_col in df_num:
    sns.displot(data=df, x=num_col)
    plt.show()

1. PassengerId just uniquely identifies each passenger, which the index can also do, so the PassengerId feature will be dropped.
2. Except for Age and Fare, the other features have just a few distinct values and can be treated as categorical features.
3. Fare, as we expected from the summary statistics, is right-skewed and will be normalized before feeding it to the ML model.
4. Age seems to be fairly normally distributed.

In [17]:
# Discrete numeric features to be treated as categorical: every numeric column
# except the continuous ones (Age, Fare), the identifier and the target.
# NOTE: this is a set, so iteration order is not guaranteed to be stable
# between kernel sessions.
not_discrete = {'Age', 'Fare', 'PassengerId', 'Survived'}
dis_num_cat = set(df_num).difference(not_discrete)
dis_num_cat

In [18]:
# Draw one count plot per categorical column, each in its own figure.
for cat_col in df_cat:
    sns.countplot(data=df, x=cat_col)
    plt.show()

1. Sex and Embarked features seem to be useful

In [19]:
# Percentage breakdown of each category for Sex and Embarked,
# with a 100-character dashed rule after each table.
cat = ['Sex', 'Embarked']
separator = '-' * 100
for cat_col in cat:
    category_pct = (100 * df[cat_col].value_counts(normalize=True)).round(2)
    print(category_pct)
    print(separator)
    

In [20]:
# For each discrete numeric feature: print its percentage breakdown,
# then draw the matching count plot.
for discrete_col in dis_num_cat:
    level_pct = (100 * df[discrete_col].value_counts(normalize=True)).round(2)
    print(level_pct)
    sns.countplot(data=df, x=discrete_col)
    plt.show()

1. 76% of the passengers had 0 parents/children aboard the ship.
2. Almost half of the passengers had a 3rd class ticket.
3. Almost 70% of the passengers had 0 siblings/spouses aboard the Titanic.

## Bivariate Analysis

In [21]:
# Contingency table of each categorical feature against the target.
cat = ['Sex', 'Embarked']
for cat_col in cat:
    survival_table = pd.crosstab(index=df[cat_col], columns=df['Survived'])
    print(survival_table)

In [22]:
# Mean age for every (Sex, Survived) combination.
age_by_sex_survival = df.groupby(by=['Sex', 'Survived'])['Age']
age_by_sex_survival.mean()

In [36]:
# Mean age for every (Embarked, Survived) combination, kept in `es` and displayed.
age_by_port_survival = df.groupby(by=['Embarked', 'Survived'])['Age']
es = age_by_port_survival.mean()
es

In [24]:
# Mean age of each target class.
age_by_survival = df.groupby(by='Survived')['Age']
age_by_survival.mean()

In [25]:
# Mean fare of each target class.
fare_by_survival = df.groupby(by='Survived')['Fare']
fare_by_survival.mean()

1. Those who survived had paid an average fare of 48, which is more than double that of those who didn't survive.

In [26]:
# Median fare of each target class (less sensitive to extreme fares than the mean).
fare_by_survival = df.groupby(by='Survived')['Fare']
fare_by_survival.median()

Even the median Fare of those who survived was more than double that of those who didn't survive.

In [27]:
# Contingency table of each discrete numeric feature against the target,
# followed by a 100-character dashed rule.
separator = '-' * 100
for discrete_col in dis_num_cat:
    survival_table = pd.crosstab(index=df[discrete_col], columns=df['Survived'])
    print(survival_table)
    print(separator)

In [28]:
# Mean age per (feature level, Survived) pair for each discrete numeric feature,
# followed by a 100-character dashed rule.
separator = '-' * 100
for discrete_col in dis_num_cat:
    mean_age = df.groupby(by=[discrete_col, 'Survived'])['Age'].mean()
    print(mean_age)
    print(separator)

In [29]:
# Mean fare per (feature level, Survived) pair for each discrete numeric feature,
# followed by a 100-character dashed rule.
separator = '-' * 100
for discrete_col in dis_num_cat:
    mean_fare = df.groupby(by=[discrete_col, 'Survived'])['Fare'].mean()
    print(mean_fare)
    print(separator)

In [34]:
# Box plot of Age to inspect its spread and any points beyond the whiskers.
sns.boxplot(data=df, x='Age')
plt.show()

In [35]:
# Box plot of Fare to inspect its spread and any points beyond the whiskers.
sns.boxplot(data=df, x='Fare')
plt.show()

There are a substantial number of outliers in the Fare feature, which explains its right skew, and a few outliers in the Age feature too.

In [32]:
# Correlation matrix
# Restrict to numeric columns before calling corr(): on pandas >= 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets string columns. Selecting numeric columns first gives the
# same matrix on older and newer pandas versions.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize = (20, 20))
sns.heatmap(numeric_corr, cbar=True, fmt='.1f', annot=True, cmap='Blues')
plt.show()