# Imports

In [53]:
import pandas as pd

# Load data

In [65]:
def load_data(file_path):
    """
    Read a CSV file and return its contents as a pandas DataFrame.

    A status message is printed in both the success and the failure case.

    Parameters:
    file_path (str): Location of the CSV file to read.

    Returns:
    pd.DataFrame or None: The parsed data, or None when anything goes wrong
    (the exception is caught and reported through a printed message).
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Data loaded successfully from {file_path}")
    except Exception as error:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data: {error}")
        frame = None
    return frame

# NOTE(review): absolute, machine-specific path — as written the notebook can only
# run on the author's machine; consider a path relative to a configurable data directory.
file_path = "/Users/bgdu69/Documents/Hackermann/DataMining/DataMining_Project/joueurs_ligue1_2024_2025.csv"

data = load_data(file_path)

# load_data() reports failures by printing a message and returning None. Stop here
# with an explicit error instead of failing below with an opaque AttributeError.
if data is None:
    raise RuntimeError(f"Could not load data from {file_path}")

data.head()

Data loaded successfully from /Users/bgdu69/Documents/Hackermann/DataMining/DataMining_Project/joueurs_ligue1_2024_2025.csv


Unnamed: 0,equipe,player_id,player_name,player_country_code,age,positions,matches_played,starts,min,gls,...,fls_com,fls_drawn,offside,pk_won,pk_conceded,og,ball_recov,air_dual_won,air_dual_lost,pct_air_dual_won
0,Angers,c3e4ecbb,Jordan Lefort,FRA,30.0,DF,34,34,3060.0,0.0,...,14.0,15.0,1.0,0.0,1.0,0.0,110.0,40.0,28.0,58.8
1,Angers,9d420dad,Yahia Fofana,CIV,23.0,GK,33,33,2970.0,0.0,...,0.0,11.0,0.0,0.0,0.0,0.0,36.0,6.0,3.0,66.7
2,Angers,7978cbf0,Himad Abdelli,ALG,24.0,"MF,FW",32,32,2842.0,6.0,...,48.0,32.0,3.0,0.0,0.0,0.0,193.0,13.0,20.0,39.4
3,Angers,6bfb4303,Florent Hanin,ALG,34.0,DF,33,31,2672.0,1.0,...,16.0,35.0,1.0,0.0,0.0,0.0,132.0,15.0,28.0,34.9
4,Angers,48b28bfd,Jean-Eudes Aholou,CIV,30.0,MF,26,26,1959.0,2.0,...,50.0,25.0,1.0,0.0,0.0,0.0,100.0,38.0,23.0,62.3


## Filtrage initial

In [68]:
# Keep only the players who played more than `seuil` minutes.

seuil = 500  # minimum number of minutes played (exclusive lower bound)

col_minutes = 'min'
# Coerce the minutes column to numeric: unparseable values become NaN and the
# corresponding rows are dropped just below. Note that this rewrites the column
# on the shared `data` frame itself.
data[col_minutes] = pd.to_numeric(data[col_minutes], errors='coerce')
data_cleaned = data.dropna(subset=[col_minutes])

# NOTE(review): the threshold is applied to each row before de-duplication, so a
# player appearing on several rows is judged per row, not on the summed minutes —
# confirm this is intended.
data_clean = data_cleaned[data_cleaned[col_minutes] > seuil]

print(f"Number of players with more than {seuil} minutes played: {data_clean.shape[0]}")

# Duplicate detection on player_id.
# The mask is computed once and reused for both the test and the count.
duplicate_mask = data_clean.duplicated(subset=["player_id"])
n_dupes = int(duplicate_mask.sum())

if n_dupes:
    print(f"{n_dupes} doublons trouvés sur 'player_id'.")
    # For each player_id keep the row with the most minutes; sort on the same
    # configurable column used for the filter rather than a hard-coded name.
    data_clean = (
        data_clean
        .sort_values(by=col_minutes, ascending=False)
        .drop_duplicates(subset=["player_id"], keep="first")
    )
else:
    print("Aucun doublon détecté sur player_id ✅")

print(f"Number of players with more than {seuil} minutes played & without duplicates: {data_clean.shape[0]}")

Number of players with more than 500 minutes played: 361
5 doublons trouvés sur 'player_id'.
Number of players with more than 500 minutes played & without duplicates: 356


In [7]:
# Show every column name of the raw frame as a plain Python list
list(data.columns)

['equipe',
 'player_id',
 'player_name',
 'player_country_code',
 'age',
 'positions',
 'matches_played',
 'starts',
 'min',
 'gls',
 'ast',
 'gls_and_ast',
 'non_pen_gls',
 'xg',
 'non_pen_xg',
 'xag',
 'pk_made',
 'pk_att',
 'yellow_cards',
 'red_cards',
 'carries_prog',
 'passes_prog',
 'per90_gls',
 'per90_ast',
 'per90_non_pen_gls',
 'per90_xg',
 'per90_xag',
 'per90_non_pen_xg',
 'sh',
 'sot',
 'pct_sot',
 'per90_sh',
 'per90_sot',
 'gls_per_sh',
 'gls_per_sot',
 'avg_sh_dist',
 'fk_sh',
 'npxg_per_sh',
 'gls_xg_diff',
 'non_pen_gls_xg_diff',
 'pass_cmp',
 'pass_att',
 'pct_pass_cmp',
 'pass_ttl_dist',
 'pass_cmp_s',
 'pass_att_s',
 'pct_pass_cmp_s',
 'pass_cmp_m',
 'pass_att_m',
 'pct_pass_cmp_m',
 'pass_cmp_l',
 'pass_att_l',
 'pct_pass_cmp_l',
 'xa',
 'ast_xag_diff',
 'pass_prog',
 'pass_prog_ttl_dist',
 'key_passes',
 'pass_fthird',
 'pass_opp_box',
 'cross_opp_box',
 'pass_live',
 'pass_dead',
 'pass_fk',
 'through_balls',
 'switches',
 'crosses',
 'pass_offside',
 'pass_blo

## Vérification des colonnes et lignes vides

In [69]:
def search_col_almost_empty(dataframe, threshold=0.4):
    """
    List the columns whose share of missing values reaches a threshold.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to inspect.
    threshold (float): Minimum proportion of missing values (inclusive) for a
        column to be reported.

    Returns:
    list: Names of the columns considered almost empty, in column order.
    """
    n_rows = dataframe.shape[0]

    # A column qualifies when (number of NaN) / (number of rows) >= threshold.
    return [
        name
        for name in dataframe.columns
        if dataframe[name].isna().sum() / n_rows >= threshold
    ]

# Flag the columns of the filtered frame whose share of missing values reaches
# the function's default threshold (0.4).
almost_empty_columns = search_col_almost_empty(data_clean)
print("Columns that are almost empty:", almost_empty_columns)

Columns that are almost empty: []


In [7]:
# Preview of data_cleaned, i.e. the rows with a valid minutes value *before* the
# minutes-threshold filter is applied; head(655) requests up to the first 655 rows.
data_cleaned.head(655)

Unnamed: 0,equipe,player_id,player_name,player_country_code,age,positions,matches_played,starts,min,gls,...,fls_com,fls_drawn,offside,pk_won,pk_conceded,og,ball_recov,air_dual_won,air_dual_lost,pct_air_dual_won
0,Angers,c3e4ecbb,Jordan Lefort,FRA,30.0,DF,34,34,3060.0,0.0,...,14.0,15.0,1.0,0.0,1.0,0.0,110.0,40.0,28.0,58.8
1,Angers,9d420dad,Yahia Fofana,CIV,23.0,GK,33,33,2970.0,0.0,...,0.0,11.0,0.0,0.0,0.0,0.0,36.0,6.0,3.0,66.7
2,Angers,7978cbf0,Himad Abdelli,ALG,24.0,"MF,FW",32,32,2842.0,6.0,...,48.0,32.0,3.0,0.0,0.0,0.0,193.0,13.0,20.0,39.4
3,Angers,6bfb4303,Florent Hanin,ALG,34.0,DF,33,31,2672.0,1.0,...,16.0,35.0,1.0,0.0,0.0,0.0,132.0,15.0,28.0,34.9
4,Angers,48b28bfd,Jean-Eudes Aholou,CIV,30.0,MF,26,26,1959.0,2.0,...,50.0,25.0,1.0,0.0,0.0,0.0,100.0,38.0,23.0,62.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,Toulouse,92976f0d,Noah Edjouma,FRA,18.0,"MF,FW",10,1,171.0,2.0,...,4.0,3.0,0.0,0.0,0.0,0.0,8.0,2.0,3.0,40.0
656,Toulouse,bfb2056e,Logan Costa,CPV,23.0,DF,1,1,90.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,2.0,66.7
657,Toulouse,f240be08,Álex Domínguez,ESP,26.0,GK,1,0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
658,Toulouse,2a74ef47,Edhy Zuliani,FRA,19.0,DF,1,0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [70]:
def search_almost_empty_rows(dataframe, threshold=0.95):
    """
    Identify rows in the DataFrame that are almost empty based on a given threshold.
    Print the number of such rows.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to analyze.
    threshold (float): The proportion of missing values (inclusive) from which a
        row is considered almost empty.

    Returns:
    pd.DataFrame: A DataFrame containing the almost empty rows (possibly empty),
    with the same columns as the input.
    """
    total_cols = dataframe.shape[1]

    # Share of missing values per row, computed for all rows at once instead of
    # iterating with iterrows().
    missing_share = dataframe.isna().sum(axis=1) / total_cols
    is_almost_empty = missing_share >= threshold

    # Report the count, as the message says (not the list of index labels).
    print(f"Number of almost empty rows: {int(is_almost_empty.sum())}")

    # Boolean-mask selection returns each matching row exactly once, even when
    # the index contains duplicate labels.
    return dataframe.loc[is_almost_empty]
    
# Collect the rows of the filtered frame whose share of missing values reaches
# the function's default threshold (0.95).
almost_empty_rows = search_almost_empty_rows(data_clean)

Number of almost empty rows: []


## Traitement des NaN

In [72]:
# Count of NaN values per column (only columns that have at least one)

nan_by_col = (
    data_clean.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

print("📊 Colonnes contenant des NaN :")
display(nan_by_col.to_frame("nb_NaN"))

print(f"\n➡️ Nombre total de colonnes avec au moins un NaN : {nan_by_col.shape[0]}")

# Count of NaN values per player (one count per row)
nan_by_player = data_clean.isna().sum(axis=1)

# Separate table restricted to the players that have at least one NaN;
# missing_count is aligned on the row index.
players_with_nan = (
    data_clean.loc[nan_by_player > 0, ["player_id", "player_name"]]
    .assign(missing_count=nan_by_player[nan_by_player > 0])
)

print(f"\n👤 Nombre de joueurs avec au moins un NaN : {len(players_with_nan)}")
display(players_with_nan.sort_values("missing_count", ascending=False).head(10))

# CONCLUSION: goalkeepers are often the ones affected. In every case identified,
# a NaN means the stat does not exist => replace with 0.
# NOTE(review): data_clean is overwritten here, so re-running this cell reports
# zero NaN; fillna(0) also applies to every column, not only the statistics.
data_clean = data_clean.fillna(0)

📊 Colonnes contenant des NaN :


Unnamed: 0,nb_NaN



➡️ Nombre total de colonnes avec au moins un NaN : 0

👤 Nombre de joueurs avec au moins un NaN : 0


Unnamed: 0,player_id,player_name,missing_count


In [74]:
# Columns that identify / describe a player
colonnes_identifiantes = [
    'equipe',
    'player_id',
    'player_name',
    'player_country_code',
    'age',
    'positions',
]

# Ratio-style statistics
colonnes_ratios = [
    'pct_sot',
    'gls_per_sh',
    'gls_per_sot',
    'avg_sh_dist',
    'npxg_per_sh',
    'pct_take_on_suc',
    'pct_take_on_tkld',
    'pct_tkl_drb_suc',
    'pct_air_dual_won',
]

# Statistics already expressed per 90 minutes
colonnes_per90 = [
    'per90_gls',
    'per90_ast',
    'per90_non_pen_gls',
    'per90_xg',
    'per90_xag',
    'per90_non_pen_xg',
    'per90_sh',
    'per90_sot',
    'per90_sca',
    'per90_gca',
    'per90_plus_minus',
    'per90_x_plus_minus',
    'per90_on_off',
    'per90_x_on_off',
]

# Independent copy holding only the identifier, ratio and per-90 columns
data_clean_relativestats = data_clean[
    [*colonnes_identifiantes, *colonnes_ratios, *colonnes_per90]
].copy()

# Plan: try several approaches — one with PCA, the other with manual selection


        equipe player_id         player_name player_country_code   age  \
557      Saint  b49729cb  Gautier Larsonneur                 FRA  27.0   
0       Angers  c3e4ecbb       Jordan Lefort                 FRA  30.0   
261  Marseille  625c144a      Gerónimo Rulli                 ARG  32.0   
190      Lille  b3d76d84     Lucas Chevalier                 FRA  22.0   
407       Nice  db87a2c4        Marcin Bułka                 POL  24.0   

    positions  pct_sot  gls_per_sh  gls_per_sot  avg_sh_dist  ...  per90_xag  \
557        GK      0.0         0.0          0.0          0.0  ...       0.00   
0          DF     11.1         0.0          0.0         12.9  ...       0.03   
261        GK      0.0         0.0          0.0          0.0  ...       0.00   
190        GK      0.0         0.0          0.0          0.0  ...       0.03   
407        GK      0.0         0.0          0.0          0.0  ...       0.01   

     per90_non_pen_xg  per90_sh  per90_sot  per90_sca  per90_gca  \
557   

In [11]:
def drop_per_90_cols(data, exception_cols=None):
    """
    Return a copy of the DataFrame without the columns whose name contains 'per90'.

    Parameters:
    data (pd.DataFrame): The input DataFrame (left unmodified).
    exception_cols (list, optional): Names of 'per90' columns that must be kept.
        Defaults to None, meaning no exception.

    Returns:
    pd.DataFrame: A new DataFrame with the selected 'per90' columns removed.
    The number of removed columns is printed.
    """
    kept_anyway = [] if exception_cols is None else exception_cols

    per90_to_remove = []
    for name in data.columns:
        # Skip anything that is not a per-90 column or that is explicitly protected.
        if 'per90' not in name or name in kept_anyway:
            continue
        per90_to_remove.append(name)

    print(f"Dropped {len(per90_to_remove)} columns containing 'per90'.")
    return data.drop(columns=per90_to_remove)

def manual_col_drop(data, cols_to_drop):
    """
    Manually drop specified columns from the DataFrame.

    Columns that are not present in the DataFrame are ignored, and the printed
    summary reports how many columns were actually dropped and which requested
    names were not found.

    Parameters:
    data (pd.DataFrame): The input DataFrame (left unmodified).
    cols_to_drop (list or str): Column name(s) to drop.

    Returns:
    pd.DataFrame: A new DataFrame with the specified columns dropped.
    """
    # A bare string is a single column name, not an iterable of characters.
    if isinstance(cols_to_drop, str):
        cols_to_drop = [cols_to_drop]

    # De-duplicate while preserving order so the counts below are accurate.
    requested = list(dict.fromkeys(cols_to_drop))
    present = [col for col in requested if col in data.columns]
    missing = [col for col in requested if col not in data.columns]

    data_dropped = data.drop(columns=present)

    # Report what really happened rather than the length of the request.
    print(f"Dropped {len(present)} specified columns.")
    if missing:
        print(f"Ignored {len(missing)} columns not found in the data: {missing}")
    return data_dropped

In [13]:
# per-90 columns that must survive the bulk removal below
exception_cols = ['per90_on_off', 'per90_x_on_off']

# Start from data_clean, the frame produced by the filtering / de-duplication / NaN
# steps of this notebook. The name previously used here (played_500) is not defined
# anywhere in the visible notebook, so the cell raised NameError on a fresh kernel.
# NOTE(review): confirm data_clean is the intended replacement for played_500.
data_no_per90 = drop_per_90_cols(data_clean, exception_cols=exception_cols)

manual_cols_to_drop = ['min_per_match_played', 'gls_and_ast', 'tkl_plus_int']
data_final = manual_col_drop(data_no_per90, manual_cols_to_drop)

print("Impression de nos données cleans : ")
df = pd.DataFrame(data_final)
# Written relative to the current working directory, without the index column.
df.to_csv("joueurs_ligue1_2024_2025_clean.csv", index=False, encoding="utf-8")
print("✅ Export CSV terminé.")

Dropped 12 columns containing 'per90'.
Dropped 3 specified columns.
Impression de nos données cleans : 
✅ Export CSV terminé.
