# Data prepping: cleaning & features engineering

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import string
from functions.utilities import *

In [2]:
#to be able to have multiple output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
#read the data
trainpath = './data/train.csv'
testpath = './data/test.csv'
traindf = pd.read_csv(trainpath, delimiter=",")
testdf = pd.read_csv(testpath, delimiter=",")
fulldf = traindf.merge(testdf, how="outer")

dfs = [fulldf, testdf, traindf]

#show the data set
fulldf.head()

#data type dictionary
data_type_dict={'PassengerId':'numeric',
                'Survived':'nominal',
                'Pclass':'ordinal',
                'Name':'nominal',
                'Sex':'nominal', 
                'Age':'numeric', 
                'SibSp':'numeric',
                'Parch':'numeric',
                'Ticket':'nominal',
                'Fare':'numeric', 
                'Cabin': 'nominal',
                'Embarked':'nominal'}

# check which columns have NA values
fulldf.isna().sum()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

## Name
### Getting the title from the name

In [4]:
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev','Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']

female_title_list = ['Mrs', 'Miss', 'Ms', 'Mlle', 'Mme', 'Countess']
male_title_list = ['Mr', 'Master', 'Major', 'Rev', 'Col', 'Don', 'Jonkheer', 'Capt']
neutral_title_list = ['Dr']

#TODO remove captain from dataset as it was a choice and not related to its status

def addColumnTitle(df, title_list):
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))
    data_type_dict['Title'] = 'nominal'
    
updateAllDataSets(addColumnTitle, dfs, title_list)

## Fare and category

In [5]:
#Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Unknown'] # i removed T because only in one person who is dead

def addColumnDeck(df, cabin_list):
    df.Cabin = df.Cabin.fillna('Unknown')
    df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    
updateAllDataSets(addColumnDeck, dfs, cabin_list)
#fulldf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck


## Finding the mean and variance

In [6]:
categories = [1, 2, 3]

# Description: compute the mean and std of the fram of each category
# Input: dataframe df, list of categories
# Output: list of tuples giving the mean and the variance for each cat
def statFaresCat(df, categories):
    statInfos = []
    for category in categories:
        dfCat = df.loc[df['Pclass'] == category]
        df_mean = dfCat.loc[:,"Fare"].mean()
        df_std = dfCat.loc[:,"Fare"].std()
        df_min = dfCat.loc[:,"Fare"].min()
        df_max = dfCat.loc[:,"Fare"].max()
        statInfos.append((df_mean, df_std, df_min, df_max))
    return statInfos 
                         
statFaresCat(fulldf, categories)

[(87.50899164086688, 80.44717824890337, 0.0, 512.3292),
 (21.1791963898917, 13.607122055536294, 0.0, 73.5),
 (13.302888700564973, 11.494358453448559, 0.0, 69.55)]

As we can see, some people have a fare of zero. Let's see how many.

In [7]:
for category in categories :
    print("# person whose fare is zero in category " + str(category) + ": " + str(len(fulldf.loc[(fulldf["Fare"]== 0) & (fulldf["Pclass"] == category)])))
    
fulldf.loc[(fulldf["Fare"]== 0) & fulldf["Cabin"].notna()]

# person whose fare is zero in category 1: 7
# person whose fare is zero in category 2: 6
# person whose fare is zero in category 3: 4


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck
179,180,0.0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,Unknown,S,Mr,Unknown
263,264,0.0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S,Mr,B
271,272,1.0,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,Unknown,S,Mr,Unknown
277,278,0.0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,Unknown,S,Mr,Unknown
302,303,0.0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,Unknown,S,Mr,Unknown
413,414,0.0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,Unknown,S,Mr,Unknown
466,467,0.0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,Unknown,S,Mr,Unknown
481,482,0.0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,Unknown,S,Mr,Unknown
597,598,0.0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,Unknown,S,Mr,Unknown
633,634,0.0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,Unknown,S,Mr,Unknown


### Inducing the fare from the category

In [8]:
def addFareGivenCat(df, category, categories, fulldf):
    statInfos = statFaresCat(fulldf,categories)

### Verifying the category from the fare

### Verifying members from a same family have the same category

### Getting the fare per person

## Output

In [9]:
testdf.to_csv("testdf-2.csv", index=False)
traindf.to_csv("traindf-2.csv",index=False)