# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame and report the outcome on stdout.

    Parameters:
    file_path (str): Location of the CSV file to read.

    Returns:
    pd.DataFrame or None: The parsed table, or None when reading fails
    (the error is printed instead of being raised).
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Data loaded successfully from {file_path}")
    except Exception as err:
        # Best-effort loader: the failure is only reported, so callers have
        # to check for None before using the result.
        print(f"Error loading data: {err}")
        frame = None
    return frame
    
file_path = 'joueurs_ligue1_2024_2025.csv'
data = load_data(file_path)

# load_data() signals failure by printing the error and returning None.
# Stop here with an explicit message instead of failing on the next line
# with an AttributeError on None.
if data is None:
    raise RuntimeError(f"Could not load {file_path!r}; see the error printed above.")

# Preview the first rows of the loaded table.
data.head()

In [None]:
# Column holding the minutes value used for filtering.
col_minutes = 'min'

# Coerce the column to numeric in place; entries that cannot be parsed become
# NaN and their rows are discarded by the dropna() below.
# NOTE(review): text such as "1,234" (thousands separator) would also be
# coerced to NaN and dropped — confirm the CSV does not format minutes that way.
data[col_minutes] = pd.to_numeric(data[col_minutes], errors='coerce')
data_cleaned = data.dropna(subset=[col_minutes])

# Keep only the rows with strictly more than 500 minutes.
played_500 = data_cleaned.loc[data_cleaned[col_minutes] > 500]

print(f"Number of players with more than 500 minutes played: {len(played_500)}")

In [None]:
# List all column names of the loaded DataFrame as a plain Python list.
data.columns.tolist()

In [None]:
def search_col_almost_empty(dataframe, threshold=0.4):
    """
    Identify columns whose proportion of missing values reaches a threshold.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to analyze.
    threshold (float): Minimum proportion (between 0 and 1) of missing values
        for a column to be reported. The comparison is inclusive (>=).

    Returns:
    list: Labels of the almost-empty columns, in their original order.
        Empty when the DataFrame has no rows or no columns.
    """
    # The mean of the boolean missing-value mask is the proportion of missing
    # values, computed for every column at once. Because nothing is looked up
    # by label, this also works when column labels are duplicated.
    missing_ratio = dataframe.isna().mean()

    # With zero rows every ratio is NaN, and NaN >= threshold is False, so
    # nothing is reported.
    return missing_ratio[missing_ratio >= threshold].index.tolist()

# Report which columns of the cleaned data are mostly missing, using the
# function's default threshold.
almost_empty_columns = search_col_almost_empty(data_cleaned)
print("Columns that are almost empty:", almost_empty_columns)

In [None]:
# Show up to the first 655 rows of the cleaned data.
# NOTE(review): 655 looks like a hard-coded row count — confirm it still
# matches the dataset.
data_cleaned.head(655)

In [None]:
def search_almost_empty_rows(dataframe, threshold=0.95):
    """
    Identify rows whose proportion of missing values reaches a threshold.

    The number of such rows is printed to stdout.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to analyze.
    threshold (float): Minimum proportion (between 0 and 1) of missing values
        for a row to be reported. The comparison is inclusive (>=).

    Returns:
    pd.DataFrame: The almost-empty rows, in their original order.
    """
    # Row-wise mean of the boolean missing-value mask is the proportion of
    # missing cells in each row. A frame without columns yields NaN ratios,
    # which never reach the threshold.
    missing_ratio = dataframe.isna().mean(axis=1)
    is_almost_empty = missing_ratio >= threshold

    # Print the count, as the message announces, not the list of index labels.
    print(f"Number of almost empty rows: {int(is_almost_empty.sum())}")

    # Selecting with the boolean mask returns each matching row exactly once,
    # even when index labels are repeated (a label-based lookup would also
    # pull in non-matching rows that share a label).
    return dataframe.loc[is_almost_empty]
# Collect the almost-empty rows of the cleaned data, using the function's
# default threshold.
almost_empty_rows = search_almost_empty_rows(data_cleaned)

In [None]:
def drop_per_90_cols(data, exception_cols=None):
    """
    Drop columns whose names contain 'per90', except those explicitly kept.

    Parameters:
    data (pd.DataFrame): The input DataFrame. It is not modified.
    exception_cols (list, optional): Names of 'per90' columns to keep.
        Defaults to None, meaning every 'per90' column is dropped.

    Returns:
    pd.DataFrame: A new DataFrame without the dropped columns.
    """
    kept = () if exception_cols is None else exception_cols

    # Only string labels can contain 'per90'. The isinstance guard avoids a
    # TypeError when the frame also has non-string labels (e.g. integers).
    cols_to_drop = [
        col
        for col in data.columns
        if isinstance(col, str) and 'per90' in col and col not in kept
    ]
    data_dropped = data.drop(columns=cols_to_drop)

    print(f"Dropped {len(cols_to_drop)} columns containing 'per90'.")
    return data_dropped

def manual_col_drop(data, cols_to_drop):
    """
    Drop the specified columns from the DataFrame, ignoring absent names.

    Parameters:
    data (pd.DataFrame): The input DataFrame. It is not modified.
    cols_to_drop (list): List of column names to drop. Names that are not
        present in the DataFrame are skipped silently.

    Returns:
    pd.DataFrame: A new DataFrame with the specified columns dropped.
    """
    data_dropped = data.drop(columns=cols_to_drop, errors='ignore')

    # Report what was actually removed: with errors='ignore', absent names are
    # skipped, so len(cols_to_drop) could overstate the number of drops.
    dropped_count = data.shape[1] - data_dropped.shape[1]
    print(f"Dropped {dropped_count} specified columns.")
    return data_dropped

In [None]:
# 'per90' columns that must be kept by the bulk drop below.
exception_cols = ['per90_on_off', 'per90_x_on_off']
# Start from played_500, the rows with more than 500 minutes.
data_no_per90 = drop_per_90_cols(played_500, exception_cols=exception_cols)

# Further columns removed by name.
# NOTE(review): presumably redundant/derived statistics — confirm.
manual_cols_to_drop = ['min_per_match_played', 'gls_and_ast', 'tkl_plus_int']
data_final = manual_col_drop(data_no_per90, manual_cols_to_drop)

# Display the resulting DataFrame.
data_final

In [None]:
# Get string col names
def get_string_columns(dataframe):
    """
    List the columns of a DataFrame that are stored with the object dtype.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to inspect.

    Returns:
    list: Names of the object-dtype columns, in DataFrame order.
    """
    # Narrow the frame to its object-dtype columns, then read off the labels.
    object_only = dataframe.select_dtypes(include=['object'])
    return object_only.columns.tolist()
# List the object-dtype columns that remain in the final DataFrame.
string_columns = get_string_columns(data_final)
print("String columns:", string_columns)

In [None]:
def convert_string_to_numeric(dataframe, string_cols):
    """
    Placeholder: not implemented yet.

    The body is a bare `pass`, so both arguments are ignored and the function
    returns None.

    TODO: implement the conversion or remove this stub.
    """
    pass

In [None]:
# # plot correlation matrix
# pearson_corr = data_final.corr(method='pearson')
# plt.figure(figsize=(12, 10))
# sns.heatmap(pearson_corr, annot=True, cmap='coolwarm')
# plt.title('Pearson Correlation Matrix')
# plt.show()
