# Titanic Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline 

# Read the passenger table from the CSV (relative path, resolved against the
# working directory) and preview the first rows to check that it parsed.
titanic = pd.read_csv("data/titanic.csv")
titanic.head()

# Univariate Analysis (Column)

In [None]:
titanic.columns

In [None]:
titanic.dtypes

### Types
Nominal: Survived, Sex, Embarked

Ordinal: Pclass

Numerical: Age, Fare (continuous); SibSp, Parch (discrete counts)

Other: Name, Ticket, Cabin

In [None]:
#How many people survived in each class
pd.crosstab(titanic["Pclass"], titanic["Survived"])

In [None]:
# Survival rate within each passenger class, highest rate first
class_survival = titanic[["Pclass", "Survived"]]
rate_by_class = class_survival.groupby("Pclass", as_index=False).mean()
rate_by_class.sort_values("Survived", ascending=False)

In [None]:
# Survival rate by sex, highest rate first
sex_survival = titanic[["Sex", "Survived"]]
rate_by_sex = sex_survival.groupby("Sex", as_index=False).mean()
rate_by_sex.sort_values("Survived", ascending=False)

## Missing Value Treatment

In [None]:
titanic.isnull().sum()

In [None]:
# Impute missing values: Age with the column mean, Embarked with its most
# frequent value.
# The filled Series is assigned back to the frame. Calling
# fillna(inplace=True) on `titanic.Age` operates on an intermediate object:
# pandas 2.x warns about it, and with Copy-on-Write enabled the frame is left
# unchanged.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())
titanic["Embarked"] = titanic["Embarked"].fillna(
    titanic["Embarked"].value_counts().idxmax()
)

titanic.head()

In [None]:
titanic.isnull().sum()

## Outlier Treatment

In [None]:
titanic.Age.plot.box()

In [None]:
titanic.SibSp.plot.box()

In [None]:
titanic.Parch.plot.box()

In [None]:
titanic.Fare.plot.box()

In [None]:
# Summary statistics for Fare, followed by its box plot and a 10-bin
# histogram drawn side by side.
fare = titanic["Fare"]
print(fare.describe())

f, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
box_ax, hist_ax = ax

fare.plot.box(ax=box_ax)
fare.plot.hist(bins=10, ax=hist_ax)

In [None]:
from collections import Counter

def detect_outliers(df, n, features):
    """Return index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that column (Tukey's
    rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        Threshold on the number of outlying features. A row is returned only
        when it is an outlier in strictly more than ``n`` of ``features``.
    features : iterable of str
        Names of the numeric columns to test.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_counts = Counter()

    for col in features:
        # nanpercentile ignores missing values; plain percentile would return
        # NaN quartiles for a column containing any NaN, and every comparison
        # below would then be False, hiding that column's outliers.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        step = (q3 - q1) * 1.5
        is_outlier = (df[col] < q1 - step) | (df[col] > q3 + step)
        # One count per feature in which the row is outlying.
        outlier_counts.update(df.index[is_outlier])

    return [label for label, hits in outlier_counts.items() if hits > n]

# With n=2, a row is selected only when it is an IQR outlier in more than two
# of the four listed columns (detect_outliers keeps rows whose count exceeds n).
Outliers_to_drop = detect_outliers(titanic,2,["Age","SibSp","Parch","Fare"])
titanic.loc[Outliers_to_drop] # Show the outliers rows

In [None]:
titanic.shape

In [None]:
titanic = titanic.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

In [None]:
titanic.shape

## Feature Engineering

In [None]:
# Extract the honorific from Name: the text between the first comma and the
# following period (assumes names look like "Surname, Title. Given names").
# Vectorised .str methods replace the row-wise apply. The extracted text is
# kept verbatim, leading space included (e.g. " Mr"), because the
# normalisation step that follows matches on that exact form.
title_values = titanic["Name"].str.split(",").str[1].str.split(".").str[0]
titles = title_values.to_frame(name="Title")
print(pd.Categorical(titles.Title))
# Plain column assignment instead of join: re-running the cell overwrites
# "Title" rather than raising on an overlapping column name.
titanic["Title"] = titles["Title"]

In [None]:
titanic.head()

In [None]:
# Collapse the raw honorifics (which still carry their leading space) into
# five buckets: Mr, Miss, Mrs, Master, Other.
# " Mme" (Madame) is grouped with "Mrs"; it was previously grouped with
# "Miss", which is the bucket for the unmarried forms (" Mlle", " Ms").
# Keys and values never overlap, so a single dict replace is equivalent to
# applying the replacements one after another.
TITLE_GROUPS = {
    " Mr": "Mr",
    " Miss": "Miss",
    " Mrs": "Mrs",
    " Master": "Master",

    " Dona": "Other",
    " Mlle": "Miss",
    " Mme": "Mrs",
    " Ms": "Miss",
    " Capt": "Mr",
    " Col": "Mr",
    " Countess": "Mrs",
    " Don": "Mr",
    " Dr": "Mr",
    " Jonkheer": "Other",
    " Lady": "Mrs",
    " Major": "Mr",
    " Rev": "Other",
    " Sir": "Mr",
    " the Countess": "Mrs",
}
titanic["Title"] = titanic["Title"].replace(TITLE_GROUPS)

In [None]:
titanic.Title.value_counts()

In [None]:
# Calculating family size and adding column...
# FSize = SibSp + Parch (the passenger themself is not added).
# Column arithmetic replaces the row-wise apply, and plain assignment replaces
# join so that re-running the cell overwrites FSize instead of raising on an
# overlapping column name.
fsiz = (titanic["SibSp"] + titanic["Parch"]).to_frame(name="FSize")
titanic["FSize"] = fsiz["FSize"]

In [None]:
titanic.head()

In [None]:
titanic.FSize.value_counts()

In [None]:
print("So if the oldest Age was 80. We just devide this into 5 groups ->",80/5 )


# Five 16-year-wide bands: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2,
# (48, 64] -> 3, >64 -> 4. np.select takes the first condition that holds, so
# the cumulative upper bounds below yield exactly those bands; a missing Age
# matches none of them and stays NaN.
age = titanic["Age"]
band_conditions = [age <= 16, age <= 32, age <= 48, age <= 64, age > 64]
band_codes = [0.0, 1.0, 2.0, 3.0, 4.0]
titanic["Age_group"] = np.select(band_conditions, band_codes, default=np.nan)

titanic.head()

In [None]:
# Passenger counts per age group (table and bar chart), then the same groups
# split by survival.
print(titanic.Age_group.value_counts(ascending = False))

f, ax = plt.subplots(1,2, figsize = (20,5))
titanic.Age_group.value_counts().plot.bar( ax = ax [0])
# x is passed by keyword: from seaborn 0.12 on, `data` is the only positional
# parameter, so a positional column name collides with data=titanic.
sns.countplot(x = "Age_group", hue = "Survived", data = titanic, ax = ax[1])

In [None]:
# Bin Fare into four quantile-based intervals, then show the survival rate of
# each interval from the cheapest band upwards.
titanic["Fare_range"] = pd.qcut(titanic["Fare"], q=4)

fare_survival = titanic[["Fare_range", "Survived"]]
rate_by_fare = fare_survival.groupby("Fare_range", as_index=False).mean()
rate_by_fare.sort_values("Fare_range", ascending=True)

In [None]:
titanic.head()

In [None]:
# Feature selection


# Name, Ticket and Cabin are dropped outright. Parch and SibSp are dropped as
# well because their sum is what gets used from here on.
titanic = titanic.drop(columns=["Name", "Ticket", "Cabin", "Parch", "SibSp"])

In [None]:
titanic.head()

In [None]:
## Data Preparation

In [None]:
# Integer-encode the categorical columns.
# Each encoded Series is assigned back to the frame. Calling
# replace(inplace=True) on `titanic["col"]` mutates an intermediate object:
# pandas 2.x warns about it, and with Copy-on-Write enabled the frame is left
# unchanged.
# Values missing from a mapping are left as they are, so re-running the cell
# is a no-op.
CATEGORY_CODES = {
    "Sex": {"male": 0, "female": 1},
    "Embarked": {"S": 0, "C": 1, "Q": 2},
    "Title": {"Master": 0, "Miss": 1, "Mr": 2, "Mrs": 3, "Other": 4},
}
for column, codes in CATEGORY_CODES.items():
    # infer_objects() yields an integer dtype even where replace() leaves the
    # result as an object column of Python ints.
    titanic[column] = titanic[column].replace(codes).infer_objects()

In [None]:
titanic.head()

In [None]:
# Correlate the numeric columns only. The frame still holds non-numeric
# columns (for example the interval-valued Fare_range), and DataFrame.corr()
# raises on those from pandas 2.0 on instead of silently skipping them.
# select_dtypes also works on older pandas versions.
sns.heatmap(titanic.select_dtypes(include="number").corr(), annot = True)
plt.show()

In [None]:
# Annotated correlation heatmap limited to Survived and four candidate predictors
corr_columns = ["Survived", "FSize", "Age", "Fare", "Pclass"]
g = sns.heatmap(titanic[corr_columns].corr(), annot=True, fmt=".2f", cmap="coolwarm")