# TD 3: Focus on biases
In this practical class, we will investigate whether there are biases in the data and/or in the model trained on these data, and try to debias the features. We will focus on the SVM (C = 1, kernel = rbf) classifier you trained in the first practical class on read speech with the eGeMAPS features.\
\
BEFORE DOING ANYTHING ELSE\
**Task n°1. Install the pingouin and statsmodels Python packages.**

Statistical tests in Python\
All the necessary statistical tests can be run directly from the pingouin package, provided that your data is already clean and structured in a suitable pandas DataFrame.\
Using pingouin:
- χ2 [doc]
- Mann-Whitney’s U [doc]

Using `import statsmodels.formula.api as smf`:
- Logistic regression: `smf.logit("a ~ b * c * d", data=data).fit()`
- Linear regression: `smf.ols("a ~ b * c * d", data=data).fit()`

## 1 Biases in data
1.1 Univariate bias (*)
 Task n°2. Describe the sex, age, and level of education depending on the diagnosis.
 Task n°3. Apply statistical tests to identify biases in the data (i.e. differences between the two ‘depressed
 patient’ and ‘healthy control’ classes).
 Pick the relevant statistical tests: χ2 for sex, Mann-Whitney’s U for age, χ2 for level of education
 (cf. box at the end of this file for more details).

In [None]:
import pandas as pd
import re
from scipy.stats import chi2_contingency, mannwhitneyu


def parse_filename(filename):
    """
    Extract subject metadata from a recording name.

    The name is expected to start with:
    {id}_{diagnosis}{sex}{age}_{education}
    e.g., '01_CF56_1' or '01_CF56_01'

    The input is converted to ``str`` and surrounding whitespace is removed
    before matching. The pattern is only anchored at the start of the name,
    so any characters following the education digits are ignored.

    Args:
        filename (str): The name to parse.

    Returns:
        pd.Series: Fields 'id', 'diagnosis', 'sex', 'age' and
                   'education_level' when the name matches, otherwise an
                   empty object-dtype Series.
    """
    candidate = str(filename).strip()

    # Groups, in order: id digits, diagnosis letter (C or P), sex letter
    # (F or M), age (one or two digits), education level digits.
    found = re.match(r'(\d+)_([CP])([FM])(\d{1,2})_(\d+)', candidate)

    # Guard clause: anything that does not follow the pattern yields an
    # empty Series.
    if found is None:
        return pd.Series(dtype='object')

    subject_id, diagnosis_code, sex_code, age_digits, education_digits = found.groups()

    if diagnosis_code == 'P':
        diagnosis_label = 'Patient'
    else:
        diagnosis_label = 'Control'

    if sex_code == 'F':
        sex_label = 'Female'
    else:
        sex_label = 'Male'

    fields = {
        'id': int(subject_id),
        'diagnosis': diagnosis_label,
        'sex': sex_label,
        'age': int(age_digits),
        'education_level': int(education_digits),
    }
    return pd.Series(fields)

# Define the full path to your CSV file
file_path = 'C:/Users/Raver/PycharmProjects/Speech processing/ressources/feautres_android_means/spontaneous_means/directory_means.csv'

# Script overview:
#   1. Read the CSV and parse the subject metadata encoded in its last column.
#   2. Print descriptive statistics (age, education level, sex) per diagnosis.
#   3. Test for differences between the 'Patient' and 'Control' groups
#      (chi-squared for sex and education level, Mann-Whitney U for age).
try:
    print(f"--- Reading and parsing the last column from '{file_path}' ---")

    # Read the entire CSV file without assuming a header. Every row is treated
    # as data, so a cell that does not follow the filename pattern simply
    # fails to parse and is dropped below.
    df = pd.read_csv(file_path, header=None)

    # Select only the very last column
    directory_column = df.iloc[:, -1]

    # Apply the parsing function to the directory column
    parsed_data = directory_column.apply(parse_filename)

    # Drop any rows where parsing failed (which will be all NaN)
    parsed_data = parsed_data.dropna(how='all')

    if not parsed_data.empty:
        # A single unparsable row introduces NaN and upcasts the integer
        # columns to float; dropping that row does not restore the dtype.
        # Every remaining row was fully parsed, so casting back is safe and
        # keeps ids, ages and education levels as integers.
        parsed_data = parsed_data.astype(
            {'id': 'int64', 'age': 'int64', 'education_level': 'int64'}
        )

        print("\nSuccessfully parsed the data. Now performing analysis...")
        print("="*50)

        # --- Descriptive Analysis Section ---
        # Get the unique diagnosis groups to loop through
        unique_diagnoses = parsed_data['diagnosis'].unique()

        for group_name in unique_diagnoses:
            print(f"\n--- Analysis for '{group_name}' Group ---")

            # Filter the DataFrame for the current group
            group_df = parsed_data[parsed_data['diagnosis'] == group_name]

            # 1. Describe Age and Education Level
            print("\n-- Age and Education Statistics --")
            # Keep only the summary rows of interest (quantiles are left out)
            stats = group_df[['age', 'education_level']].describe().loc[['count', 'mean', 'std', 'min', 'max']]
            print(stats)

            # 2. Describe Sex Distribution (as percentages within the group)
            print("\n-- Sex Distribution --")
            sex_distribution = group_df['sex'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%'
            print(sex_distribution)
            print("\n" + "="*50)

        # --- Statistical Bias Analysis Section ---
        print("\n--- Statistical Bias Analysis (Patient vs. Control) ---")

        # Ensure there are two groups to compare
        if len(unique_diagnoses) == 2:
            patient_group = parsed_data[parsed_data['diagnosis'] == 'Patient']
            control_group = parsed_data[parsed_data['diagnosis'] == 'Control']

            # 1. Chi-squared (χ2) test for Sex on the diagnosis x sex table
            print("\n1. Chi-squared test for Sex:")
            contingency_sex = pd.crosstab(parsed_data['diagnosis'], parsed_data['sex'])
            chi2, p, dof, expected = chi2_contingency(contingency_sex)
            print(f"   - Chi2 statistic: {chi2:.4f}")
            print(f"   - p-value: {p:.4f}")
            if p < 0.05:
                print("   - Result: Significant difference in sex distribution between groups.")
            else:
                print("   - Result: No significant difference in sex distribution.")

            # 2. Mann-Whitney's U test for Age (two-sided)
            print("\n2. Mann-Whitney U test for Age:")
            u_stat, p_val = mannwhitneyu(patient_group['age'], control_group['age'], alternative='two-sided')
            print(f"   - U statistic: {u_stat:.4f}")
            print(f"   - p-value: {p_val:.4f}")
            if p_val < 0.05:
                print("   - Result: Significant difference in age between groups.")
            else:
                print("   - Result: No significant difference in age.")

            # 3. Chi-squared (χ2) test for Education Level, treated as a
            #    categorical variable (one table column per level)
            print("\n3. Chi-squared test for Education Level:")
            contingency_edu = pd.crosstab(parsed_data['diagnosis'], parsed_data['education_level'])
            chi2_edu, p_edu, dof_edu, expected_edu = chi2_contingency(contingency_edu)
            print(f"   - Chi2 statistic: {chi2_edu:.4f}")
            print(f"   - p-value: {p_edu:.4f}")
            if p_edu < 0.05:
                print("   - Result: Significant difference in education level distribution.")
            else:
                print("   - Result: No significant difference in education level distribution.")

        else:
            print("\nCould not perform statistical tests: requires exactly two diagnosis groups ('Patient' and 'Control').")

    else:
        print("\nWarning: Could not parse any entries from the 'directory' column.")
        print("Please check that the data format matches the expected pattern (e.g., '01_CF56_1').")


except FileNotFoundError:
    print(f"Error: The file was not found at the specified path: {file_path}")
    print("Please make sure the file path is correct.")
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or has no columns to read.")
except Exception as e:
    # Best-effort boundary for an interactive notebook cell: report the
    # problem instead of raising.
    print(f"An error occurred: {e}")