In [14]:
import pandas as pd
import numpy as np

# Build a small example DataFrame. Each column contains one extreme value
# (age 180, 1042 pages visited) and 'age' also has one missing entry.
data = dict(
    age=[25, 30, 35, 40, 45, 50, 55, 60, 65, 180, 75, np.nan, 85],
    total_pages_visited=[2, 3, 1042, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
)
df = pd.DataFrame(data)


def remove_outliers_IQR(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Drop the rows whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25% and 75% quantiles of the column and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept, and so are rows where the value is
    missing (NaN).

    Args:
        df: Input frame; it is not modified.
        column: Name of the column to inspect.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5.

    Returns:
        A new DataFrame containing only the retained rows, with their
        original index labels.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)

    # between() is inclusive on both ends and evaluates to False for NaN,
    # so missing values are let through explicitly.
    within_fences = values.between(q1 - margin, q3 + margin)
    return df[within_fences | values.isna()]

def remove_outliers_sigma(df: pd.DataFrame, column: str, n_sigma: float = 3) -> pd.DataFrame:
    """Drop the rows whose ``column`` value is too far from the column mean.

    A row is kept when its value lies within ``mean ± n_sigma * std``
    (bounds included), or when the value is missing (NaN). The mean and the
    standard deviation are computed on the column itself, extreme values
    included.

    Args:
        df: Input frame; it is not modified.
        column: Name of the column to inspect.
        n_sigma: Half-width of the accepted band, in standard deviations.
            Defaults to 3.

    Returns:
        A new DataFrame containing only the retained rows, with their
        original index labels.
    """
    values = df[column]
    center = values.mean()
    sigma = values.std()

    # With fewer than two non-missing values the standard deviation is NaN;
    # comparing against NaN bounds would reject every row, so nothing is
    # treated as an outlier in that case.
    if pd.isna(sigma):
        return df.copy()

    half_width = n_sigma * sigma
    # between() is inclusive on both ends and evaluates to False for NaN,
    # so missing values are let through explicitly.
    within_band = values.between(center - half_width, center + half_width)
    return df[within_band | values.isna()]

# method: "IQR" or "sigma"
# TODO: offer a clip option instead of dropping rows, e.g.
# df[col] = df[col].clip(lower=df[col].quantile(0.05), upper=df[col].quantile(0.95))
def remove_outliers(df: pd.DataFrame, column: str, method: str = "sigma") -> pd.DataFrame:
    """Drop outlier rows of ``column`` using the selected detection method.

    Args:
        df: Input frame.
        column: Name of the column to inspect.
        method: ``"IQR"`` to delegate to ``remove_outliers_IQR`` or
            ``"sigma"`` (default) to delegate to ``remove_outliers_sigma``.
            The comparison is case-sensitive.

    Returns:
        The DataFrame returned by the selected helper.

    Raises:
        ValueError: If ``method`` is neither ``"IQR"`` nor ``"sigma"``.
    """
    if method == "IQR":
        return remove_outliers_IQR(df, column)
    if method == "sigma":
        return remove_outliers_sigma(df, column)
    # ValueError is a subclass of Exception, so callers that caught the
    # previous generic Exception keep working.
    raise ValueError("method must be IQR or sigma")
    
    
# Show the frame before and after filtering both columns with the "sigma"
# method, one column after the other.
print(df)
df = remove_outliers(
    remove_outliers(df, 'age', "sigma"),
    'total_pages_visited',
    "sigma",
)
print(df)






      age  total_pages_visited
0    25.0                    2
1    30.0                    3
2    35.0                 1042
3    40.0                    5
4    45.0                    6
5    50.0                    7
6    55.0                    8
7    60.0                    9
8    65.0                   10
9   180.0                   11
10   75.0                   12
11    NaN                   13
12   85.0                   14
      age  total_pages_visited
0    25.0                    2
1    30.0                    3
3    40.0                    5
4    45.0                    6
5    50.0                    7
6    55.0                    8
7    60.0                    9
8    65.0                   10
9   180.0                   11
10   75.0                   12
11    NaN                   13
12   85.0                   14


IQR
age  total_pages_visited
0    25.0                    2
1    30.0                    3
2    35.0                 1042
3    40.0                    5
4    45.0                    6
5    50.0                    7
6    55.0                    8
7    60.0                    9
8    65.0                   10
9   180.0                   11
10   75.0                   12
11    NaN                   13
12   85.0                   14
     age  total_pages_visited
0   25.0                    2
1   30.0                    3
3   40.0                    5
4   45.0                    6
5   50.0                    7
6   55.0                    8
7   60.0                    9
8   65.0                   10
10  75.0                   12
11   NaN                   13
12  85.0                   14


Sigma
age  total_pages_visited
0    25.0                    2
1    30.0                    3
2    35.0                 1042
3    40.0                    5
4    45.0                    6
5    50.0                    7
6    55.0                    8
7    60.0                    9
8    65.0                   10
9   180.0                   11
10   75.0                   12
11    NaN                   13
12   85.0                   14
      age  total_pages_visited
0    25.0                    2
1    30.0                    3
3    40.0                    5
4    45.0                    6
5    50.0                    7
6    55.0                    8
7    60.0                    9
8    65.0                   10
9   180.0                   11
10   75.0                   12
11    NaN                   13
12   85.0                   14