# Analyse exploratoire univariée des données

**Auteur:** Louis Vanacker

**Date:** 5 janvier 2026

**Objectif:** Analyser chaque variable prise individuellement (distributions, valeurs aberrantes, tendances).

In [None]:
# Import des bibliothèques
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display

# Plot styling: seaborn white grid and a wide default figure size.
sns.set_style("whitegrid")
plt.rc("figure", figsize=(12, 6))

# Load the raw dataset straight from the project's GitHub repository.
url = (
    "https://raw.githubusercontent.com/Dorsumsellae/"
    "Programmation-avancee-Projet-d-examen-Students-Performance-in-Exams/"
    "main/data/raw/StudentsPerformance.csv"
)
df = pd.read_csv(url)

# Quick sanity check: dataset shape and the first ten rows.
print(f"Dimensions du dataset : {df.shape}")
display(df.iloc[:10])

## 1. Analyse des variables catégorielles

In [None]:
# Categorical columns summarised in this section.
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

print('=== VARIABLES CATÉGORIELLES ===')
n_rows = len(df)  # denominator for the share of the most frequent value

for col in categorical_cols:
    print(f"\n{col.upper()}")

    # Frequency table, ordered from most to least frequent.
    counts = df[col].value_counts()
    display(counts.to_frame('Effectif'))

    # Most frequent category, its count and its share of all rows.
    top_label = df[col].mode()[0]
    top_count = counts.iloc[0]
    share = top_count / n_rows * 100
    print(f"Valeur la plus fréquente : {top_label} ({top_count}, {share:.1f}%)")

In [None]:
# One bar chart per categorical column, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    counts = df[col].value_counts()
    positions = range(len(counts))

    ax.bar(positions, counts.values)
    ax.set_xticks(positions)
    # Category names are long, so tilt them to avoid overlap.
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_title(f'Distribution de {col}')
    ax.set_ylabel('Effectif')

# Remove the last cell of the grid, which is left unused.
fig.delaxes(axes[-1])
plt.tight_layout()
plt.show()

## 2. Analyse des variables numériques (scores)

In [None]:
# Numeric score columns analysed in this section.
score_cols = ['math score', 'reading score', 'writing score']

print('=== STATISTIQUES DES SCORES ===')
display(df[score_cols].describe())

# Per-column recap of the main location and spread statistics.
for col in score_cols:
    scores = df[col]
    print(
        f"\n{col.upper()}",
        f"Moyenne : {scores.mean():.2f}",
        f"Médiane : {scores.median():.2f}",
        f"Écart-type : {scores.std():.2f}",
        f"Min / Max : {scores.min()} / {scores.max()}",
        sep="\n",
    )

In [None]:
# Histogram of each score with its mean and median marked.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, score_cols):
    scores = df[col]
    ax.hist(scores, bins=20)
    # Explicit, contrasting colours: without them both markers use the default
    # line colour, which matches the bars and makes them hard to tell apart.
    ax.axvline(scores.mean(), color='red', linestyle='--', label='Moyenne')
    ax.axvline(scores.median(), color='black', linestyle=':', label='Médiane')
    ax.set_title(col)
    ax.set_xlabel('Score')
    ax.set_ylabel('Effectif')
    ax.legend()

plt.tight_layout()
plt.show()

## 3. Détection des valeurs aberrantes

In [None]:
# Side-by-side box plots, one per score column, to eyeball outliers.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, score_cols):
    scores = df[col]
    ax.boxplot(scores)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
print('=== DÉTECTION DES VALEURS ABERRANTES (IQR) ===')

# Count, for each score, the rows lying more than 1.5 * IQR outside the
# first and third quartiles.
for col in score_cols:
    scores = df[col]
    q1 = scores.quantile(0.25)
    q3 = scores.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - 1.5*iqr
    upper_fence = q3 + 1.5*iqr
    is_outlier = (scores < lower_fence) | (scores > upper_fence)
    outliers = df[is_outlier]

    n_outliers = len(outliers)
    pct_outliers = n_outliers/len(df)*100
    print(f"\n{col.upper()} : {n_outliers} valeurs aberrantes ({pct_outliers:.2f}%)")