In [3]:
import pandas as pd
from src.config import RAW_DATA

In [4]:
# Load the raw dataset from the path configured in src.config (RAW_DATA).
df = pd.read_csv(RAW_DATA)

# Split Data

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate the target column ('charges') from the remaining feature columns.
y = df['charges']
x = df.drop(columns='charges')

# Hold out 20% of the rows as the test split; the fixed random_state makes
# the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


# Encoding & Scaling

In [7]:
# Categorical feature groups, split by the encoding each one receives.
categorical_binary = ["sex", "smoker"]  # two-valued columns: mapped straight to 0/1
categorical_nominal = ["region"]  # unordered categories: one-hot encoded

# Numeric feature groups, split by the scaler each one receives.
# Per the author's note, bmi is the only numeric column treated as
# normally distributed; age and children are handled separately.
num_not_normal = ["age", "children"]
num_normal = ["bmi"]


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# Encode the binary categorical columns as 0/1 before the ColumnTransformer
# runs; the transformer leaves these columns untouched (remainder="passthrough").
binary_mapping = {
    "sex": {"male": 1, "female": 0},
    "smoker": {"yes": 1, "no": 0},
}


def _encode_binary_columns(frame, mappings):
    """Return a copy of ``frame`` with the columns in ``mappings`` encoded.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame containing every column named in ``mappings``.
    mappings : dict[str, dict]
        ``{column: {raw_label: code}}`` lookup tables.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is never modified, which avoids pandas'
        SettingWithCopyWarning on the frames returned by train_test_split.

    Raises
    ------
    ValueError
        If a column holds a non-missing value that is neither a known raw
        label nor an already-encoded code (``Series.map`` with a dict would
        silently turn such values into NaN).

    Notes
    -----
    Values that are already encoded are kept as-is, so re-running the cell
    is harmless instead of replacing every value with NaN. Missing values
    stay missing.
    """
    encoded = frame.copy()
    for col, mapping in mappings.items():
        values = encoded[col]
        accepted = list(mapping) + list(mapping.values())
        unexpected = values[values.notna() & ~values.isin(accepted)]
        if not unexpected.empty:
            raise ValueError(
                f"Column {col!r} has unexpected values: {sorted(map(str, unexpected.unique()))}"
            )
        # Fall back to the value itself so already-encoded entries survive a re-run.
        encoded[col] = values.map(lambda value: mapping.get(value, value))
    return encoded


X_train = _encode_binary_columns(X_train, binary_mapping)
X_test = _encode_binary_columns(X_test, binary_mapping)

# Column-wise preprocessing: scaling for numeric groups and one-hot encoding
# for the nominal group. Each entry is (step name, transformer, columns).
_preprocessing_steps = [
    ("std", StandardScaler(), num_normal),
    ("minmax", MinMaxScaler(), num_not_normal),
    # drop="first" removes one dummy column per encoded feature.
    ("onehot", OneHotEncoder(drop="first"), categorical_nominal),
]

# remainder="passthrough" keeps every column not listed above (sex/smoker,
# already encoded as 0/1) instead of dropping it.
preprocessor = ColumnTransformer(
    transformers=_preprocessing_steps,
    remainder="passthrough",
)


# Fit Data

In [9]:

# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Rebuild readable column labels. ColumnTransformer emits columns in the order
# of its `transformers` list, followed by the passthrough remainder columns.
onehot_columns = preprocessor.named_transformers_["onehot"].get_feature_names_out(categorical_nominal)
final_columns = num_normal + num_not_normal + list(onehot_columns) + categorical_binary

# Guard against silent mislabeling: compare the hand-built labels with the
# names reported by the fitted transformer (minus any "<step>__" prefix).
fitted_columns = [name.split("__", 1)[-1] for name in preprocessor.get_feature_names_out()]
if fitted_columns != final_columns:
    raise ValueError(
        f"Column labels {final_columns} do not match the transformer output order {fitted_columns}"
    )

# Wrap the transformed arrays in DataFrames (fresh default index) so the
# features carry column names.
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=final_columns)
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns=final_columns)


In [10]:
import joblib
from src.config import MODEL_DIR

# Persist the preprocessor so the same transformation can be reloaded later.
# NOTE: the sex/smoker 0/1 mapping is applied outside this ColumnTransformer,
# so anything that loads the pickle must apply that mapping before transform().
# Create the target directory first so a fresh checkout does not fail here
# (assumes MODEL_DIR is a pathlib.Path, as its use with `/` suggests).
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# Trailing ';' suppresses the cell output, which would otherwise echo the
# absolute local path of the written file into the saved notebook.
joblib.dump(preprocessor, MODEL_DIR / "transformer.pkl");

['/home/abdelrahman/hdd/void/GitHub/My-Projects/MCP/src/model/transformer.pkl']

# Save Data

In [11]:
from src.config import PROC_DATA_DIR

# Create the output directory if it is missing so a fresh checkout can run
# this cell (assumes PROC_DATA_DIR is a pathlib.Path, as its use with `/` suggests).
PROC_DATA_DIR.mkdir(parents=True, exist_ok=True)

# File name -> object to write. All four are written without their index, so
# features and targets line up by row position when read back.
processed_outputs = {
    "X_train_preprocessed.csv": X_train_preprocessed,
    "X_test_preprocessed.csv": X_test_preprocessed,
    "y_train.csv": Y_train,
    "y_test.csv": Y_test,
}

for file_name, data in processed_outputs.items():
    data.to_csv(PROC_DATA_DIR / file_name, index=False)
