# Analyse en Composantes Principales (ACP) - Décathlon

Analyse des profils d'athlètes de décathlon pour identifier des groupes similaires.

## Importation des bibliothèques

In [12]:
#import des librairies nécessaires à notre analyse
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Visual configuration: seaborn "whitegrid" style and a default figure size of 12x8
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 8))

## Chargement des données

In [13]:
# Load the decathlon results from a semicolon-delimited CSV file
df = pd.read_csv('decathlon.csv', sep=';')

# Preview the first rows
df.head()

Unnamed: 0,Nom,100m,Longueur,Poids,Hauteur,400m,110m H,Disque,Perche,Javelot,1500m,Classement,Points,Competition
0,Sebrle,10.85,7.84,16.36,2.12,48.36,14.05,48.72,5.0,70.52,280.01,1,8893,JO
1,Clay,10.44,7.96,15.23,2.06,49.19,14.13,50.11,4.9,69.71,282.0,2,8820,JO
2,Karpov,10.5,7.81,15.93,2.09,46.81,13.97,51.65,4.6,55.54,278.11,3,8725,JO
3,Macey,10.89,7.47,15.73,2.15,48.97,14.56,48.34,4.4,58.46,265.42,4,8414,JO
4,Warners,10.62,7.74,14.48,1.97,47.97,14.01,43.73,4.9,55.39,278.05,5,8343,JO


## Exploration des données

In [14]:
# Summary of the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Nom          41 non-null     object 
 1   100m         41 non-null     float64
 2   Longueur     41 non-null     float64
 3   Poids        41 non-null     float64
 4   Hauteur      41 non-null     float64
 5   400m         41 non-null     float64
 6   110m H       41 non-null     float64
 7   Disque       41 non-null     float64
 8   Perche       41 non-null     float64
 9   Javelot      41 non-null     float64
 10  1500m        41 non-null     float64
 11  Classement   41 non-null     int64  
 12  Points       41 non-null     int64  
 13  Competition  41 non-null     object 
dtypes: float64(10), int64(2), object(2)
memory usage: 4.6+ KB


In [15]:
# Descriptive statistics of the numeric columns
df.describe()

Unnamed: 0,100m,Longueur,Poids,Hauteur,400m,110m H,Disque,Perche,Javelot,1500m,Classement,Points
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,10.998049,7.26,14.477073,1.976829,49.616341,14.605854,44.32561,4.762439,58.316585,279.024878,12.121951,8005.365854
std,0.263023,0.316402,0.824428,0.088951,1.153451,0.471789,3.377845,0.278,4.82682,11.673247,7.918949,342.385145
min,10.44,6.61,12.68,1.85,46.81,13.97,37.92,4.2,50.31,262.1,1.0,7313.0
25%,10.85,7.03,13.88,1.92,48.93,14.21,41.9,4.5,55.27,271.02,6.0,7802.0
50%,10.98,7.3,14.57,1.95,49.4,14.48,44.41,4.8,58.36,278.05,11.0,8021.0
75%,11.14,7.48,14.97,2.04,50.3,14.98,46.07,4.92,60.89,285.1,18.0,8122.0
max,11.64,7.96,16.36,2.15,53.2,15.67,51.65,5.4,70.52,317.0,28.0,8893.0


## Préparation des données pour l'ACP

In [16]:
# Keep only the athlete name and the event results: the competition label,
# the total points and the final ranking are excluded from the analysis
colonnes_a_exclure = ['Competition', 'Points', 'Classement']
df_acp = df.drop(colonnes_a_exclure, axis=1)

# Set the athlete names aside as a Series, then use them as the row index
# so that each row can still be matched to its athlete afterwards
noms_athletes = df_acp['Nom']
df_acp = df_acp.set_index(keys='Nom')

# Preview before the PCA; the data is supplied by Joyce,
# so the ETL step is skipped here
df_acp.head()

Unnamed: 0_level_0,100m,Longueur,Poids,Hauteur,400m,110m H,Disque,Perche,Javelot,1500m
Nom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sebrle,10.85,7.84,16.36,2.12,48.36,14.05,48.72,5.0,70.52,280.01
Clay,10.44,7.96,15.23,2.06,49.19,14.13,50.11,4.9,69.71,282.0
Karpov,10.5,7.81,15.93,2.09,46.81,13.97,51.65,4.6,55.54,278.11
Macey,10.89,7.47,15.73,2.15,48.97,14.56,48.34,4.4,58.46,265.42
Warners,10.62,7.74,14.48,1.97,47.97,14.01,43.73,4.9,55.39,278.05


## Vérification des valeurs manquantes

In [17]:
# Sanity check on the supplied data: count the missing values in each column
df_acp.isna().sum()

100m        0
Longueur    0
Poids       0
Hauteur     0
400m        0
110m H      0
Disque      0
Perche      0
Javelot     0
1500m       0
dtype: int64

## Standardisation des données