# Import des outils / jeu de données

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Seed NumPy's legacy global RNG so any NumPy-based randomness is repeatable.
np.random.seed(0)
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set_theme()

In [3]:
# Column used as the DataFrame index when the CSV is loaded.
ID_COL = "PassengerId"
# Name of the column to predict; it is split off into `y` later on.
TARGET = "Survived"

In [4]:
# Load the training set from a path relative to the notebook's working
# directory, using the identifier column as the index.
df = pd.read_csv(
    "data/train.csv",
    index_col=ID_COL,
)

# Présentation

## Jeu de données

### Lecture des données

### Présentation des variables

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
# Report the dataset dimensions: df.shape is (rows, columns).
print(f"Il y a {df.shape[1]} variables et {df.shape[0]} individus.")

In [27]:
# Number of distinct non-missing values per column.
df.nunique()

In [28]:
# Columns analysed as quantitative (continuous) variables.
var_quanti = [
    "Age",
    "Fare",
]

In [None]:
# Columns analysed as qualitative (categorical) variables; the target is
# included so it is converted and plotted together with them.
var_quali = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Cabin",
    "Embarked",
    TARGET,
]

#### Conversion de type

In [None]:
# Cast the qualitative columns to the pandas "category" dtype with string labels.
# The string cast is masked with `where(col.notna())` so missing values stay NaN:
# a plain `astype(str)` can turn them into the literal label "nan", after which
# `isna()` on these columns would no longer report any missing value.
# TODO: only the integer-coded columns actually need the string conversion.
df[var_quali] = df[var_quali].apply(
    lambda col: col.astype(str).where(col.notna()).astype("category")
)

In [None]:
# Replace the index with a default 0..n-1 range and discard the previous index
# values (drop=True).
# NOTE(review): the original note only says this avoids a bug; the failing
# call is not identified here — confirm which one before removing this line.
df = df.reset_index(drop=True)  # otherwise a bug occurs

#### Variables

In [None]:
# Split features and target: work on a copy so `df` keeps the target column,
# then pop the target out of the copy.
X = df.copy()
y = X.pop(TARGET)

In [None]:
# Preview the feature matrix (target column removed).
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Preview the target series.
y.head()

# Analyse univariée

In [14]:
# Summary statistics (count, mean, std, quartiles, extremes) of the quantitative columns.
df[var_quanti].describe()

Unnamed: 0,Age,Fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


In [15]:
# Summary of the qualitative columns (describe() on the columns listed in var_quali).
df[var_quali].describe()

# Visualisation

## Variables quantitatives

In [16]:
# One figure per quantitative column: box plot on the left, histogram with a
# KDE overlay on the right.
for column in var_quanti:
    series = df[column]
    _, (box_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(8, 2))
    sns.boxplot(x=series, width=0.25, ax=box_ax)
    sns.histplot(series, kde=True, ax=hist_ax)
    plt.show()

In [17]:
# Heatmap of the pairwise correlations between quantitative columns, showing
# only the cells whose absolute correlation exceeds the threshold.
THRESHOLD = 0.5
TRESHOLD = THRESHOLD  # original (misspelled) name kept as an alias for compatibility

# Compute the correlation matrix once and reuse it for both the values and the mask.
corr = df[var_quanti].corr()

_, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr.where(corr.abs() > THRESHOLD),  # cells at or below the threshold become NaN
    annot=True,
    cmap="BrBG",
    linewidths=0.5,
    vmax=1,
    vmin=-1,
    ax=ax,
)
plt.show()

### Valeurs manquantes

In [18]:
# Missing values in the quantitative columns: per-column counts as a bar chart
# (left) and the row-by-column missingness mask as a heatmap (right).
_, (count_ax, mask_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
count_ax.set_title("Nombre de valeurs manquantes")
mask_ax.set_title("Valeurs manquantes")

na_mask = df[var_quanti].isna()
df_na = na_mask.sum()

sns.barplot(x=df_na.values, y=df_na.index, color="C0", ax=count_ax)
sns.heatmap(na_mask, cbar=False, ax=mask_ax)

In [19]:
# Number of missing values in each quantitative column.
df[var_quanti].isna().sum()

## Variables qualitatives

In [20]:
# One count plot per qualitative column. Columns with at most three distinct
# values get a small vertical plot; the others are drawn horizontally on the
# default-sized figure.
for column in var_quali:
    series = df[column]
    if series.nunique() <= 3:
        plt.figure(figsize=(4, 2))
        sns.histplot(series, shrink=0.3)
    else:
        sns.histplot(y=series)
    plt.show()

### Valeurs manquantes

In [21]:
# Missing values in the qualitative columns: per-column counts as a bar chart
# (left) and the row-by-column missingness mask as a heatmap (right).
_, (count_ax, mask_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
count_ax.set_title("Nombre de valeurs manquantes")
mask_ax.set_title("Valeurs manquantes")

na_mask = df[var_quali].isna()
df_na = na_mask.sum()

sns.barplot(x=df_na.values, y=df_na.index, color="C0", ax=count_ax)
sns.heatmap(na_mask, cbar=False, ax=mask_ax)

In [22]:
# Number of missing values in each qualitative column.
df[var_quali].isna().sum()

## Variable cible

In [23]:
# Share of each target class, drawn as horizontal bars; the target is cast to
# str before plotting and stat="probability" normalises the bars to proportions.
sns.histplot(y=y.astype(str), shrink=0.3, stat="probability")
plt.show()

In [24]:
# Side-by-side table of the target classes: absolute counts and relative shares.
class_counts = y.value_counts()
class_shares = y.value_counts(normalize=True)
pd.concat([class_counts, class_shares], axis=1)

## Variable cible en fonction des autres variables

In [25]:
# Compare the distribution of each numeric feature between the target classes:
# box plot per class on the left, per-class normalised histogram with KDE on
# the right.
# Only numeric columns are plotted. Box plots and KDEs are not defined for
# text or categorical columns (e.g. names, ticket numbers), so looping over
# every column of X fails or yields meaningless figures for those.
target_labels = y.astype(str)
for var in X.select_dtypes(include="number").columns:
    _, ax = plt.subplots(1, 2, figsize=(10, 2))
    sns.boxplot(x=X[var], y=target_labels, width=0.25, ax=ax[0])
    sns.histplot(
        x=X[var],
        kde=True,
        ax=ax[1],
        hue=target_labels,
        stat="probability",
        common_norm=False,  # normalise each class separately
    )
    plt.show()

In [None]:
# For each quantitative column: box plot split by target class (left) and
# per-class normalised histogram with KDE overlay (right). Columns are
# referenced by name against `data=df`.
for column in var_quanti:
    _, (box_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

    sns.boxplot(data=df, x=column, y=TARGET, width=0.25, ax=box_ax)
    sns.histplot(
        data=df,
        x=column,
        hue=TARGET,
        kde=True,
        stat="probability",
        common_norm=False,
        ax=hist_ax,
    )
    plt.show()

In [None]:
# For each qualitative column, two dodged count plots: the column's levels
# coloured by target class (left), and the target classes coloured by the
# column's levels (right).
for column in var_quali:
    _, (by_level_ax, by_target_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

    # Options common to both panels.
    dodge_kwargs = dict(multiple="dodge", shrink=0.5)

    sns.histplot(data=df, x=column, hue=TARGET, ax=by_level_ax, **dodge_kwargs)
    sns.histplot(data=df, x=TARGET, hue=column, ax=by_target_ax, **dodge_kwargs)

    plt.show()

# Sauvegarde du DataFrame

In [None]:
# Write the DataFrame to CSV. The index is written as the first column
# (to_csv default), and CSV does not store dtypes.
# NOTE(review): the index was reset earlier with drop=True, so the file holds a
# 0-based row number rather than the original identifier — confirm that
# downstream readers expect this.
df.to_csv("data/data.csv")