# Importing Libraries

In [None]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

from missingno import nullity_filter, nullity_sort

# Visualization 
import matplotlib.pyplot as plt
import missingno as msno
import matplotlib as mpl
from matplotlib import gridspec
import seaborn as sns
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was removed in 3.8. Use whichever spelling this installation
# provides, falling back to the default style if neither exists.
plt.style.use(
    next(
        (
            style_name
            for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
            if style_name in plt.style.available
        ),
        'default',
    )
)

from scipy.cluster import hierarchy

# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

# Data Loading

In [None]:
# Load the Titanic training set, test set and the gender_submission file from
# the Kaggle input directory, in that order.
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    )
)

# Data Inspection
![data](https://i.imgur.com/AC9Bq63.png)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.describe()

# Data Cleaning

## Data Formats

In [None]:
# Encode Sex as an integer flag (male -> 1, female -> 0).
# The dtype guard keeps this cell idempotent: mapping an already-encoded column
# would turn every value into NaN because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(train['Sex']):
    train['Sex'] = train['Sex'].map({'male': 1, 'female': 0})

In [None]:
train.head()

## Handling Missing Values

In [None]:
#Inspecting missing values.
train.isnull().sum()

In [None]:
msno.matrix(train)

In [None]:
# show the missing values in dataset with ratio

def missing_values_table(df):
    """Summarise missing data per column.

    Only columns that contain at least one null value are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with ``n_miss`` (number of null entries) and
        ``ratio`` (percentage of rows that are null, rounded to two decimals),
        each sorted in descending order.
    """
    columns_with_nulls = df.columns[df.isnull().any().to_numpy()].tolist()
    null_counts = df[columns_with_nulls].isnull().sum()

    by_count = null_counts.sort_values(ascending=False)
    by_ratio = (null_counts / df.shape[0] * 100).sort_values(ascending=False)

    return pd.concat([by_count, by_ratio.round(2)], axis=1, keys=['n_miss', 'ratio'])

missing_values_table(train)

In [None]:
# Cabin is dropped entirely; the remaining gaps are imputed:
# Age with the column median and Embarked with its most frequent value.
# errors="ignore" lets the cell be re-run after Cabin has already been removed
# (a plain drop would raise KeyError on the second execution).
train = train.drop(columns=["Cabin"], errors="ignore")
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])

In [None]:
train.isnull().sum()

In [None]:
train.head()

In [None]:
train.duplicated().sum()

## Removing Noise

### Scaling Age

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise Age: fit the shared scaler (default StandardScaler settings) on
# the training ages and overwrite the column with the transformed values.
# The scaler expects a 2-D input, hence the reshape to a single column.
scaler = StandardScaler()
train["Age"] = scaler.fit_transform(train["Age"].to_numpy().reshape(-1, 1))

In [None]:
# Visualise the distribution of the scaled training ages.
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the same histogram + density-curve combination.
plt.figure(figsize=(20,5))
sns.histplot(train.Age.values, kde=True, stat="density", label="train", color="darkorange")
plt.xlabel("Scaled age values")
plt.ylabel("Density")
plt.legend();

### Scaling Fare

In [None]:
# Demonstrate how a single extreme value shifts the mean.
# A seeded generator makes the printed numbers reproducible across re-runs.
rng = np.random.default_rng(42)
# create some normally distributed samples:
original = rng.normal(loc=0, scale=1, size=200)
# add an outlier
shifted = np.append(original, 1000)
# compute the mean
print(np.mean(original))
print(np.mean(shifted))

In [None]:
# Just a method to plot our fare distribution with some statistics
def show_fare_distribution():
    """Plot the Fare density per survival outcome with summary-statistic markers.

    Reads the module-level ``train`` frame. Draws one filled KDE curve for
    ``Survived == 0`` (blue) and one for ``Survived == 1`` (green), then adds
    vertical lines at the minimum and maximum fare (yellow), one standard
    deviation either side of the mean (orange), the mean (red) and the median
    (black). A legend identifies every element.

    Returns
    -------
    module
        ``matplotlib.pyplot``, so the caller can keep working on the figure.
    """
    fares = train.Fare.values
    fare_mean = np.mean(fares)
    fare_std = np.std(fares)

    plt.figure(figsize=(20,5))
    # `fill` is the current spelling of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[train.Survived==0].Fare, color="Blue", fill=True, label="Survived = 0")
    sns.kdeplot(train[train.Survived==1].Fare, color="Green", fill=True, label="Survived = 1")
    # Only the first line of each same-coloured pair is labelled so the legend
    # shows one entry per statistic.
    plt.axvline(np.max(fares), color="Yellow", label="min / max")
    plt.axvline(np.min(fares), color="Yellow")
    plt.axvline(fare_mean + fare_std, color="Orange", label="mean +/- 1 std")
    plt.axvline(fare_mean - fare_std, color="Orange")
    plt.axvline(fare_mean, color="Red", label="mean")
    plt.axvline(np.median(fares), color="Black", label="median")
    plt.xlabel("Fare")
    plt.ylabel("Density")
    plt.legend()
    return plt

In [None]:
show_fare_distribution()

In [None]:
# Apply log(1 + fare) to Fare in both the train and test frames.
# np.log1p is vectorised (no per-row Python lambda) and numerically more
# accurate than np.log(x + 1) for small x.
# NOTE: re-running this cell applies the transform again to already-logged values.
train.Fare = np.log1p(train.Fare)
test.Fare = np.log1p(test.Fare)
show_fare_distribution()

In [None]:
# Standardise Fare: the statistics are learned from the training fares only and
# the same fitted parameters are then applied to both frames.
# NOTE: this re-fits the shared `scaler`, replacing the Age statistics it held.
scaler.fit(train["Fare"].to_numpy().reshape(-1, 1))
for frame in (train, test):
    frame["Fare"] = scaler.transform(frame["Fare"].to_numpy().reshape(-1, 1))

In [None]:
train.head()

## Outliers

In [None]:
#Outlier Analysis Function
#Tukey's method
def tukeys_method(df, variable):
    """Flag outliers in one column using Tukey's fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    variable : str
        Name of the numeric column to analyse.

    Returns
    -------
    tuple[list[int], list[int]]
        ``(outliers_prob, outliers_poss)``: zero-based row *positions* (not
        index labels) of the probable outliers (outside the outer fences,
        3 * IQR beyond the quartiles) and of the possible outliers (outside
        the inner fences, 1.5 * IQR beyond the quartiles). Every probable
        outlier is also listed as a possible one.
    """
    values = df[variable]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    def _positions_outside(fence_width):
        # Row positions of the values lying strictly outside q1/q3 -/+ fence_width * IQR.
        lower_end = q1 - fence_width * iqr
        upper_end = q3 + fence_width * iqr
        # Strict comparisons: a value sitting exactly on a fence is not an
        # outlier. With <= / >= a column whose IQR is 0 (fences collapse onto
        # q1 == q3) would have every single row reported as an outlier.
        # Comparisons with NaN are False, so missing values are never flagged.
        outside = (values < lower_end) | (values > upper_end)
        return np.flatnonzero(outside.to_numpy()).tolist()

    outliers_prob = _positions_outside(3)
    outliers_poss = _positions_outside(1.5)
    return outliers_prob, outliers_poss

In [None]:
# Take an explicit copy of the numeric columns: quant_df is modified in place
# later on, and writing into a bare slice of `train` triggers
# SettingWithCopyWarning and leaves it ambiguous which frame is changed.
quant_df = train[["Age", "SibSp", "Parch", "Fare"]].copy()
quant_df.head()

In [None]:
# For every numeric column, print its name followed by the
# (probable, possible) outlier positions returned by tukeys_method.
for col in quant_df.columns:
    probable_and_possible = tukeys_method(quant_df, col)
    print(col, probable_and_possible)

In [None]:
# One horizontal box plot per numeric column, each on its own wide, flat figure.
for column in quant_df.columns:
    fig, ax = plt.subplots(figsize=(17, 1))
    sns.boxplot(data=quant_df, x=column, orient="h", palette="Set2", ax=ax)

In [None]:
# Cap each numeric column at its Tukey inner fences (quartile -/+ 1.5 * IQR):
# values beyond a whisker are replaced by the whisker value itself.
whisker_width = 1.5
for col in quant_df:
  q1 = quant_df[col].quantile(0.25)
  q3 = quant_df[col].quantile(0.75)
  iqr = q3 - q1
  if iqr == 0:
    # Degenerate spread: both whiskers collapse onto q1 == q3, so capping would
    # overwrite every value in the column with that single number.
    # Leave such a column untouched instead of destroying its information.
    continue
  lower_whisker = q1 - whisker_width * iqr
  upper_whisker = q3 + whisker_width * iqr
  # Series.clip is the built-in equivalent of the nested np.where calls; NaN stays NaN.
  quant_df[col] = quant_df[col].clip(lower=lower_whisker, upper=upper_whisker)

In [None]:
# Redraw the per-column horizontal box plots to inspect the effect of the capping step.
for column in quant_df.columns:
    fig, ax = plt.subplots(figsize=(17, 1))
    sns.boxplot(data=quant_df, x=column, orient="h", palette="Set2", ax=ax)

In [None]:
quant_df.head()

In [None]:
quant_df["SibSp"].value_counts()

In [None]:
# Cast the two count columns back to int64 after the capping step.
# NOTE(review): astype("int64") truncates toward zero, so a fractional cap
# value written by the capping step loses its decimal part here.
for count_col in ("SibSp", "Parch"):
    quant_df[count_col] = quant_df[count_col].astype("int64")

In [None]:
# Copy the processed numeric columns from quant_df back into the training
# frame, then preview the result.
capped_cols = ["Age", "SibSp", "Parch", "Fare"]
train[capped_cols] = quant_df[capped_cols]
train.head()

In [None]:
# Snapshot the processed training frame under its own name (deep copy, so later
# changes to `train` do not affect it) and preview it.
train_clean = train.copy(deep=True)
train_clean.head()

In [None]:
# Persist the cleaned training data. DataFrame.to_csv returns None when it is
# given a path, so its result must not be assigned back to `train_clean`;
# doing so would replace the DataFrame with None.
train_clean.to_csv("train_clean.csv", index=False)