# Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from column_names import quali_var, quanti_var, target

In [None]:
# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set_theme()
# Seed NumPy's global random generator so that runs are repeatable.
np.random.seed(0)

In [None]:
# Load the training set; the "id" column is used as the row index.
df = pd.read_csv("data/train.csv", index_col="id")

# Présentation

## Jeu de données

### Lecture des données

### Présentation des variables

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Report the dataset size: df.shape is (rows, columns), i.e. (individuals, variables).
print(f"Il y a {df.shape[1]} variables et {df.shape[0]} individus.")

#### Variables

#### Conversion de type

In [None]:
# Cast the qualitative columns to text, then to pandas' "category" dtype.
# NOTE(review): going through str presumably turns missing values into the
# literal string "nan", which the qualitative missing-value check below
# compares against -- confirm before changing this conversion.
# TODO: only convert the integer-coded variables to category.
df[quali_var] = df[quali_var].astype(str).astype("category")

In [None]:
# Feature table (quantitative columns followed by qualitative ones) and target,
# both detached from df.
# Select first, then copy: the original `df.copy()[...]` duplicated the whole
# DataFrame twice and still returned a selection taken from that temporary
# copy, which can trigger SettingWithCopyWarning when X / y are modified later.
# Assumes quanti_var and quali_var are lists (they are concatenated with `+`).
X = df[quanti_var + quali_var].copy()
y = df[target].copy()

In [None]:
# Preview the quantitative variables.
df[quanti_var].head()

In [None]:
# Preview the qualitative variables (after their conversion to category).
df[quali_var].head()

In [None]:
# Preview the target variable.
df[target].head()

# Découverte des données

## Analyse univariée

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Descriptive statistics of the quantitative variables.
df[quanti_var].describe()

In [None]:
# Descriptive statistics of the qualitative (category) variables.
df[quali_var].describe()

# Visualisation

## Variables quantitatives

In [None]:
# One figure per quantitative variable: box plot on the left, histogram with
# a KDE curve on the right.
for var in quanti_var:
    column = df[var]
    _, (box_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(8, 2))
    sns.boxplot(x=column, width=0.25, ax=box_ax)
    sns.histplot(column, kde=True, ax=hist_ax)
    plt.show()

In [None]:
plt.figure(figsize=(8, 8))

# Only correlations whose absolute value is strictly above this are shown.
THRESHOLD = 0.5

# Compute the correlation matrix once (it was previously computed twice).
corr = df[quanti_var].corr()

sns.heatmap(
    # Entries at or below the threshold become NaN; seaborn leaves those
    # cells blank.
    corr.where(corr.abs() > THRESHOLD),
    annot=True,
    cmap="BrBG",
    linewidths=0.5,
    vmax=1,
    vmin=-1,
)

### Valeurs manquantes

In [None]:
# Boolean mask of missing values in the quantitative columns...
df_na = df[quanti_var].isnull()
# ...and the number of missing values per column.
df_na_sum = df_na.sum(axis=0)

In [None]:
# Left: missing-value count per column; right: row-by-column map of the
# missing-value mask.
_, (count_ax, mask_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

sns.barplot(x=df_na_sum.values, y=df_na_sum.index, color="C0", ax=count_ax)
count_ax.set_title("Nombre de valeurs manquantes")

mask_ax.set_title("Valeurs manquantes")
sns.heatmap(df_na, cbar=False, ax=mask_ax)

In [None]:
# Missing-value count for each quantitative column.
df[quanti_var].isna().sum()

## Variables qualitatives

In [None]:
# One histogram per qualitative variable: variables with at most three
# distinct values get a small vertical chart, the others a horizontal chart
# with the categories on the y axis.
for var in quali_var:
    column = df[var]
    if column.nunique() <= 3:
        plt.figure(figsize=(4, 2))
        sns.histplot(column, shrink=0.3)
    else:
        sns.histplot(y=column)
    plt.show()

### Valeurs manquantes

In [None]:
# Qualitative columns were cast through str, so the mask looks for the
# literal string "nan" rather than using isna().
df_na = df[quali_var].eq("nan")
# Number of "nan" entries per qualitative column.
df_na_sum = df_na.sum(axis=0)

In [None]:
# Left: count of "nan" entries per column; right: map of the "nan" mask.
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
counts_ax, map_ax = axes

counts_ax.set_title("Nombre de valeurs manquantes")
map_ax.set_title("Valeurs manquantes")

sns.barplot(x=df_na_sum.values, y=df_na_sum.index, color="C0", ax=counts_ax)
sns.heatmap(df_na, cbar=False, ax=map_ax)

In [None]:
# Display the per-column counts held in df_na_sum (the qualitative "nan"
# counts when the cells are run in order).
df_na_sum

### Variable cible

In [None]:
# Share of each target value; the target is cast to str before plotting.
sns.histplot(
    y=y.astype(str),
    stat="probability",
    shrink=0.3,
)
plt.show()

In [None]:
# Absolute and relative frequency of each target value, side by side.
pd.concat(
    [y.value_counts(), y.value_counts(normalize=True)],
    axis="columns",
)

## Variable cible en fonction des autres variables

In [None]:
# For each quantitative variable, compare its distribution across the target
# values: box plots on the left, per-class normalised histograms on the right.
for var in quanti_var:
    _, ax = plt.subplots(1, 2, figsize=(10, 3))

    # orient="h" pins the layout (variable on x, one box per target value).
    # Without it seaborn infers the orientation from the dtypes, and a
    # numeric target would make it group by the quantitative variable.
    sns.boxplot(
        data=df,
        x=df[var],
        y=df[target],
        orient="h",
        width=0.25,
        ax=ax[0],
    )

    sns.histplot(
        data=df,
        x=df[var],
        kde=True,
        ax=ax[1],
        hue=df[target],
        stat="probability",
        # Normalise each target class separately so that classes of
        # different sizes remain comparable.
        common_norm=False,
    )

    plt.show()

In [None]:
# For each qualitative variable, draw two dodged histograms: the variable
# split by target (left) and the target split by the variable (right).
for var in quali_var:
    _, (by_var_ax, by_target_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

    # Options shared by both panels.
    bar_style = dict(multiple="dodge", shrink=0.5)

    sns.histplot(df, x=df[var], hue=df[target], ax=by_var_ax, **bar_style)
    sns.histplot(df, x=df[target], hue=df[var], ax=by_target_ax, **bar_style)

    plt.show()

# Sauvegarde du DataFrame

In [None]:
# Write the DataFrame (index included) to data/data.csv.
df.to_csv("data/data.csv")