### Handle missing values (median/mode imputation)

In [43]:
import pandas as pd

df = pd.read_csv("../data/raw/assurance-maladie.csv")

df.isnull().values.any()

np.False_

In [41]:

median_value = df['age'].median()
df['age'].fillna(median_value, inplace=True)


mean_value = df['bmi'].mean()
df['bmi'].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(mean_value, inplace=True)


### Remove duplicates

In [44]:
#show the duplicated rows

duplicate_rows = df.duplicated()

df[duplicate_rows]

#delet the duble, in the original file 

df.drop_duplicates(inplace=True)

### Handle outliers (IQR, z-score, boxplot)

In [52]:

Q1 = df['charges'].quantile(0.25)

Q3 = df['charges'].quantile(0.75)

IQR = Q3 - Q1

upper_bound = Q3 + 1.5 * IQR


df_cleaned = df[df['charges'] <= upper_bound]
print(df_cleaned)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1198 rows x 7 columns]


### save the clean copie

In [46]:
file_path = '../data/processed/cleaned_data.csv' 

df_cleaned.to_csv(
    file_path, 
    index=False, 
    encoding='utf-8' 
)

print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to ../data/processed/cleaned_data.csv


### Encode categorical variables

In [55]:
cols_to_encode = ['sex', 'smoker', 'region']

df_encoded = pd.get_dummies(df_cleaned, columns=cols_to_encode, drop_first=True)

print(df_encoded)

file_path = '../data/processed/cleaned_data.csv' 

df_encoded.to_csv(
    file_path, 
    index=False, 
    encoding='utf-8' 
)

      age     bmi  children      charges  sex_male  smoker_yes  \
0      19  27.900         0  16884.92400     False        True   
1      18  33.770         1   1725.55230      True       False   
2      28  33.000         3   4449.46200      True       False   
3      33  22.705         0  21984.47061      True       False   
4      32  28.880         0   3866.85520      True       False   
...   ...     ...       ...          ...       ...         ...   
1333   50  30.970         3  10600.54830      True       False   
1334   18  31.920         0   2205.98080     False       False   
1335   18  36.850         0   1629.83350     False       False   
1336   21  25.800         0   2007.94500     False       False   
1337   61  29.070         0  29141.36030     False        True   

      region_northwest  region_southeast  region_southwest  
0                False             False              True  
1                False              True             False  
2                False  

### Train/test split (80/20)

In [None]:
from sklearn.model_selection import train_test_split

# Features (all columns except 'charges')
X = df_encoded.drop(columns=['charges'])

# Target (just the 'charges' column)
y = df_encoded['charges']

# Split into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)

print(X_test)
print(X_train)
print(y_test)
print(y_train)

      age     bmi  children  sex_male  smoker_yes  region_northwest  \
1045   43  24.700         2     False        True              True   
764    45  25.175         2     False       False             False   
804    23  26.510         0      True       False             False   
152    32  37.145         3     False       False             False   
498    44  23.980         2     False       False             False   
...   ...     ...       ...       ...         ...               ...   
258    51  24.415         4      True       False              True   
1255   42  37.900         0     False       False             False   
347    46  33.345         1      True       False             False   
1114   23  24.510         0      True       False             False   
1153   35  35.815         1     False       False              True   

      region_southeast  region_southwest  
1045             False             False  
764              False             False  
804               

### Normalize/Standardize numeric features

In [58]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

print(X_test)

      age     bmi  children  sex_male  smoker_yes  region_northwest  \
1045   43  24.700         2     False        True              True   
764    45  25.175         2     False       False             False   
804    23  26.510         0      True       False             False   
152    32  37.145         3     False       False             False   
498    44  23.980         2     False       False             False   
...   ...     ...       ...       ...         ...               ...   
258    51  24.415         4      True       False              True   
1255   42  37.900         0     False       False             False   
347    46  33.345         1      True       False             False   
1114   23  24.510         0      True       False             False   
1153   35  35.815         1     False       False              True   

      region_southeast  region_southwest  
1045             False             False  
764              False             False  
804               