From Local

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("titanic_train.csv")
df.head(6)

Identify numerical and categorical features

In [None]:
df.dtypes

In [None]:
# A cell renders only its last expression, so gather the unique values of all
# three columns into one mapping to inspect them together.
{column: df[column].unique() for column in ['SibSp', 'Parch', 'Embarked']}

In [None]:
# An integer dtype alone does not make a column numerical, so the categorical
# columns are listed by hand.
categorical = [
    'Survived', 'Pclass', 'Name', 'Sex', 'SibSp',
    'Parch', 'Ticket', 'Cabin', 'Embarked',
]
# Every column that is not listed above, kept in its original order.
numerical = df.columns[~df.columns.isin(categorical)].tolist()
numerical

Identify missing values and visualize them

In [None]:
df.isnull().mean()

In [None]:
#function to calculate the percentage of missing values in each feature and plot a heatmap of missing values
import missingno as msno

def missingValPercentage(df):
    """Report the share of missing values per column and draw a nullity heatmap.

    For every column that has at least one missing value (largest share
    first) one line is printed with the percentage of missing values, the
    column name, and the missing count over the total row count.

    Returns the result of ``msno.heatmap(df, figsize=(10, 6))``.
    """
    total_rows = df.shape[0]
    null_counts = df.isnull().sum()
    null_fraction = df.isnull().mean().sort_values(ascending=False)
    for column in null_fraction.index:
        # Columns without any missing value are not reported.
        if null_fraction[column] == 0:
            continue
        print("{:.2f}".format(null_fraction[column]*100), r"% of values missing in", column, null_counts[column],'/', total_rows)
    return msno.heatmap(df,figsize=(10,6))

In [None]:
missingValPercentage(df)

Identifying the outliers

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.plot.box(grid=True)

In [None]:
import seaborn as sns

sns.boxplot(x='Embarked', y='Age', data=df)

Handling the missing values of numerical features

In [None]:
#let us look into the numerical feature with the highest number of missing values, i.e., Age
#we will try to fill the NaN values with mean, median and mode and select the best way of imputation

import matplotlib.pyplot as plt
%matplotlib inline
#function to visualize the distribution (kde) of original feature vs feature imputed in various ways
def kdePlot(*arguments):
    """Overlay the kernel density estimates of several columns on one axes.

    Each positional argument is a column name looked up in the notebook-level
    ``df``. One KDE curve is drawn per name, in the order given, and a legend
    is placed at the 'best' location. Nothing is returned.
    """
    ax = plt.figure().add_subplot(111)
    for column_name in arguments:
        df[column_name].plot(kind='kde', ax=ax)
    handles, names = ax.get_legend_handles_labels()
    ax.legend(handles, names, loc='best')


In [None]:

#mean imputation
df['Age_Mean'] = df['Age'].fillna(df['Age'].mean())
kdePlot('Age','Age_Mean')
df['Age'].mean()

In [None]:
#mode imputation
df['Age_Mode'] = df['Age'].fillna(df['Age'].mode()[0])
kdePlot('Age','Age_Mode')
df['Age'].mode()[0]

In [None]:
#median imputation
df['Age_Median'] = df['Age'].fillna(df['Age'].median())
kdePlot('Age','Age_Median')
df['Age'].median()

In [None]:
#random sample imputation
df.isnull().sum()

In [None]:
def impute_nan(df, variable, median=None, random_state=0):
    """Add ``<variable>_random``: a copy of ``variable`` with NaNs filled at random.

    The missing entries are replaced by values drawn at random from the
    observed (non-null) entries of the same column. ``df`` is modified in
    place; the original column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``variable`` and receives the new column.
    variable : str
        Name of the column to impute.
    median : object, optional
        Unused. Kept only so existing three-argument calls keep working.
    random_state : int, optional
        Seed forwarded to ``Series.sample`` so the imputation is reproducible.

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to draw from.
    """
    target = variable + "_random"
    missing_mask = df[variable].isnull()
    df[target] = df[variable]

    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return

    observed = df[variable].dropna()
    if observed.empty:
        raise ValueError(
            "cannot impute {!r}: the column has no observed values".format(variable)
        )

    # Draw without replacement when possible; fall back to sampling with
    # replacement only when there are more gaps than observed values.
    random_sample = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Assign positionally: the i-th drawn value fills the i-th missing row.
    df.loc[missing_mask, target] = random_sample.to_numpy()

In [None]:
impute_nan(df, 'Age', df['Age'].mean())

In [None]:
# Each column is listed once; a repeated name only draws the same curve twice
# and duplicates its legend entry.
# NOTE(review): 'Age_Mode' is not part of this comparison - confirm whether it should be.
kdePlot('Age_Median', 'Age_random', 'Age', 'Age_Mean')

In [None]:
#Capturing NAN values with a new feature

df['Age_New_Feature'] = np.where(df['Age'].isnull(),1,0)

In [None]:
df['Age_New_Feature'].sum()#sum is equal to number of NaN values as NaN is now 1 and non NaN is 0

In [None]:
kdePlot('Age_New_Feature')

In [None]:
#End of Distribution imputation

df['Age'].hist(bins=50)
extreme_value = df['Age'].mean()+3*df['Age'].std()
df['Age_EndOf_Distribution'] = df['Age'].fillna(extreme_value)

In [None]:
kdePlot('Age_Median', 'Age_EndOf_Distribution')

In [None]:
#w.r.t. end of distribution imputation, let's compare the outliers before and after imputation

# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x='Age', data=df).set_title("Before Imputation")

In [None]:
# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x='Age_EndOf_Distribution', data=df).set_title("After End of Distribution Imputation")

In [None]:
#Arbitrary Value Imputation
#lets take an arbitrary value of 90

df['Age_Arbitrary'] = df['Age'].fillna(90)

In [None]:
kdePlot('Age', 'Age_Arbitrary')

In [None]:
#w.r.t. arbitrary value imputation, lets compare the outliers before and after imputation

# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x='Age', data=df).set_title("Before Imputation")

In [None]:
# The column is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x='Age_Arbitrary', data=df).set_title("After Arbitrary Value Imputation")

In [None]:
#from above plots - though the outliers are treated, the distribution after imputation is not a perfect gaussian distribution

Handling the missing values of categorical features

In [None]:
#frequent value imputation
#lets take the categorical feature - Embarked

df.Embarked.unique()

In [None]:
#let's find the most frequently occurring value of the feature and use it to fill the NaN values
def freqValue(df,feature):
    """Print the most frequent non-null value of ``df[feature]`` and its count.

    ``value_counts`` orders the values by descending frequency, so the first
    entry is the most frequent one. Its count is read by position, which
    works for any kind of value (strings, integers, ...). If the column has
    no non-null value at all, a short notice is printed instead.
    """
    counts = df[feature].value_counts()
    if counts.empty:
        print("No non-null values in {} column".format(feature))
        return
    # .iloc[0] is positional; counts[0] would look up the *label* 0 whenever
    # the column holds integers.
    print("\"", counts.index[0], "\"", "is the most frequent value in {} column with".format(feature), counts.iloc[0], "occurences")

In [None]:
freqValue(df, 'Embarked')

In [None]:
#let's capture the information carried by the NaN values of the Embarked feature in a new feature - Embarked_NaN
#it will help us analyse whether the missingness of these values correlates with any other feature, even after we replace the NaN values

def captureNaN(df, feature):
    """Add a ``<feature>_NaN`` indicator column to ``df`` in place.

    The new column holds 1 where ``df[feature]`` is missing and 0 elsewhere.
    Nothing is returned.
    """
    indicator_column = feature + "_NaN"
    is_missing = df[feature].isnull()
    df[indicator_column] = is_missing.astype(int)

captureNaN(df,'Embarked')

In [None]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])
df['Embarked'].isnull().sum()


In [None]:
#Replacing NAN with a new category
#Suppose if we have more frequent categories, we just replace NAN with a new category (e.g.: replace NaN with 'Missing')
#lets apply this on Cabin as it has 77% of missing values
#creating a new feature Cabin_Class using the cabin category makes more sense than the cabin number

# Take the first character of each Cabin string as the category and label the
# missing entries 'Missing'. The result is assigned back to the column:
# calling fillna(..., inplace=True) on df['Cabin_Class'] acts on an
# intermediate object and does not update df under pandas Copy-on-Write.
df['Cabin_Class'] = df['Cabin'].str[0].fillna('Missing')

# Last expression of the cell, so the resulting categories are displayed.
df['Cabin_Class'].unique()

In [None]:
df.head()

In [None]:
df.isnull().sum()

Checking and handling if the dataset is imbalanced

In [None]:
#check the unique categories in the dependent (target) column and count them to see whether the dataset is balanced
def checkIfBalanced(df, targetFeature):
    """Print how often each value of ``df[targetFeature]`` occurs.

    One line is printed per distinct value, most frequent first, with its
    count and its percentage of the total number of rows in the column.
    """
    target = df[targetFeature]
    n_rows = len(target)
    for label, count in target.value_counts().items():
        share = (count / n_rows) * 100
        print(label, "occurs", count, "times ({:.2f}".format(share), "%)")


In [None]:
checkIfBalanced(df, 'Survived')

In [None]:
#for the time being, we consider this to be a balanced dataset
#An ideally balanced dataset for binary classification is split 50% / 50%

Treating the outliers