# Packages

In [1]:
import pandas as pd
import numpy as np
import chardet
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

## Importation

In [2]:
with open('data/train.csv', 'rb') as f:
    result = chardet.detect(f.read())
print(result['encoding'])

data = pd.read_csv('data/train.csv', delimiter = ",",decimal = ".", encoding = result['encoding'])

ascii


## Visualisation

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Dropping PassengerId, Name and Ticket columns
df = data.drop(["PassengerId", "Name", "Ticket"], axis = 1)

In [9]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


In [7]:
print(f"Data Dimension: {df.shape} \nNumber single obervation:\n {df.nunique(axis=0)}")

Data Dimension: (891, 9) 
Number single obervation:
 Survived      2
Pclass        3
Sex           2
Age          88
SibSp         7
Parch         7
Fare        248
Cabin       147
Embarked      3
dtype: int64


So have six qualitatif variables: Survived(dependance variable), Pclass, Sex, SibSp, Parch and Embarked. Then, we're going to numerize these variables. The variable Cabin is so particular, indeed we can't consider it like a quantitatif variable. we can delete it for the first time.

In [10]:
df = df.drop(['Cabin'], axis = 1)

df['Sex'] = df['Sex'].replace({'male': '1', 'female': '0'})
df['Embarked'] = df['Embarked'].replace({'C': 'O', 'S': '1', 'Q':'2'})

In [11]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,1
1,1,1,0,38.0,1,0,71.2833,O
2,1,3,0,26.0,0,0,7.9250,1
3,1,1,0,35.0,1,0,53.1000,1
4,0,3,1,35.0,0,0,8.0500,1
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,1
887,1,1,0,19.0,0,0,30.0000,1
888,0,3,0,,1,2,23.4500,1
889,1,1,1,26.0,0,0,30.0000,O
