### Handle missing values (median/mode imputation)

In [None]:
import pandas as pd

# Load the raw CSV from a path relative to this notebook.
df = pd.read_csv(filepath_or_buffer="../data/raw/assurance-maladie.csv")

# True when at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:

# Fill missing ages with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['age']: that form operates on a column
# selection, which triggers a chained-assignment warning in pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
median_value = df['age'].median()
df['age'] = df['age'].fillna(median_value)

# Fill missing BMI values with the column mean, using the same
# assign-back pattern.
mean_value = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(mean_value)

### Remove duplicates

In [91]:
# Flag every row that repeats an earlier row (the first occurrence is not flagged).
duplicate_rows = df.duplicated()

# Show the duplicated rows. A bare expression is only rendered when it is the
# last line of a cell, so display() (available by default in IPython/Jupyter
# kernels) is needed to show them from the middle of the cell.
display(df[duplicate_rows])

# Remove the duplicates from the in-memory DataFrame, keeping the first
# occurrence of each row. The raw CSV on disk is not modified.
df = df.drop_duplicates()

### Handle outliers (IQR, z-score, boxplot)

In [None]:
# IQR fences on 'charges': values more than 1.5 * IQR below the first
# quartile or above the third quartile are treated as outliers.
Q1 = df['charges'].quantile(0.25)
Q3 = df['charges'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Report how many rows fall outside the fences so the bounds are actually used.
outlier_mask = (df['charges'] < lower_bound) | (df['charges'] > upper_bound)
print(
    f"{int(outlier_mask.sum())} of {len(df)} rows have 'charges' outside "
    f"[{lower_bound:.2f}, {upper_bound:.2f}]"
)

# NOTE(review): every row is kept, so downstream results are unaffected by the
# fences. If outliers are meant to be removed, select the rows where
# outlier_mask is False here instead - confirm the intended handling.
# .copy() gives df_cleaned its own data instead of a second name for df.
df_cleaned = df.copy()

# Preview only the first rows rather than dumping the whole frame.
df_cleaned.head()


### Save the cleaned copy

In [None]:
# Destination of the cleaned dataset, relative to this notebook.
file_path = '../data/processed/cleaned_data.csv'

# Write the frame as UTF-8 CSV without the index column.
df_cleaned.to_csv(file_path, encoding='utf-8', index=False)

print(f"DataFrame successfully saved to {file_path}")

### Encode categorical variables

In [None]:
# One-hot encode the categorical columns. With drop_first=True each
# k-level column yields k-1 dummy columns (the first level is dropped).
cols_to_encode = ['sex', 'smoker', 'region']
df_encoded = pd.get_dummies(data=df_cleaned, columns=cols_to_encode, drop_first=True)

print(df_encoded)

# NOTE(review): this is the same path the un-encoded cleaned frame is written
# to earlier in this notebook, so that file is overwritten by the encoded
# frame - confirm this is intended.
file_path = '../data/processed/cleaned_data.csv'

# Write the encoded frame as UTF-8 CSV without the index column.
df_encoded.to_csv(file_path, encoding='utf-8', index=False)


### Train/test split (80/20)

In [None]:
from sklearn.model_selection import train_test_split

# Target: the 'charges' column.
y = df_encoded['charges']

# Features: every column except 'charges'.
X = df_encoded.drop(columns=['charges'])

# Hold out 20% of the rows for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

# Print each subset: test features, train features, test target, train target.
for subset in (X_test, X_train, y_test, y_train):
    print(subset)

### Normalize/Standardize numeric features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling with the scaler's default settings.
scaler = MinMaxScaler()

# Fit on the training split only, then apply the same fitted transform to the
# test split, so no statistics from the test rows influence the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the scaled test features. X_test itself is not modified by the
# scaler, so printing it would not show the effect of the scaling.
print(X_test_scaled[:5])