In [1]:
import pandas as pd
df = pd.read_csv("nhanes_ready.csv")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6889 entries, 0 to 6888
Data columns (total 38 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SEQN                        6889 non-null   float64
 1   RIDAGEYR                    6889 non-null   float64
 2   RIAGENDR                    6889 non-null   int64  
 3   LBXTC                       6889 non-null   float64
 4   BMXHT                       6889 non-null   float64
 5   BMXBMI                      6889 non-null   float64
 6   BMXWT                       6889 non-null   float64
 7   BMXWAIST                    6889 non-null   float64
 8   BMXARMC                     6889 non-null   float64
 9   PAD680                      6889 non-null   float64
 10  SLD012                      6889 non-null   float64
 11  DRKCAL                      6889 non-null   float64
 12  DRPROT                      6889 non-null   float64
 13  DRSUGR                      6889 

In [4]:
# Colonnes binaires 0/1
binary_cols = ["smoker", "diabetes", "chol_high", "RIAGENDR"]

# Colonnes booléennes (ethnicité)
bool_cols = [col for col in df.columns if df[col].dtype == "bool"]

# Colonnes log-transformed (on évite de faire une détection d'outliers dessus)
log_cols = [col for col in df.columns if col.endswith("_log")]

# Colonnes numériques continues candidates
num_cols = df.select_dtypes(include=["float64", "int64"]).columns

# On retire les binaires, les booléens, les logs
num_cols_clean = [
    col for col in num_cols
    if col not in binary_cols
    and col not in bool_cols
    and col not in log_cols
]

num_cols_clean


['SEQN',
 'RIDAGEYR',
 'LBXTC',
 'BMXHT',
 'BMXBMI',
 'BMXWT',
 'BMXWAIST',
 'BMXARMC',
 'PAD680',
 'SLD012',
 'DRKCAL',
 'DRPROT',
 'DRSUGR',
 'DRFIBE',
 'DRTFAT',
 'DRTALCO',
 'DRCARB',
 'DRWATER']

In [5]:
outlier_counts = {}

for col in num_cols_clean:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_extreme = Q1 - 3 * IQR
    upper_extreme = Q3 + 3 * IQR
    
    n_extreme = df[(df[col] < lower_extreme) | (df[col] > upper_extreme)].shape[0]
    
    outlier_counts[col] = n_extreme

df_outliers = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["extreme_outliers"])
df_outliers.sort_values("extreme_outliers", ascending=False)


Unnamed: 0,extreme_outliers
DRTALCO,1216
DRSUGR,115
DRWATER,110
DRFIBE,108
DRPROT,102
DRCARB,81
DRTFAT,79
DRKCAL,74
SLD012,44
PAD680,36
