In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the sleep health and lifestyle dataset from the project's data folder
# and preview the first rows to check that the columns parsed as expected.
csv_path = "../data/Sleep_health_and_lifestyle_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [10]:
# Binary target: 1 when "Quality of Sleep" is at or above the cutoff, else 0.
good_sleep_cutoff = 7
is_good_sleep = df["Quality of Sleep"].ge(good_sleep_cutoff)
df["sleep_quality_bin"] = is_good_sleep.astype(int)

# Display the class balance of the new target column.
df["sleep_quality_bin"].value_counts()

sleep_quality_bin
1    257
0    117
Name: count, dtype: int64

In [11]:
# Rows with no recorded sleep disorder hold missing values; give them an
# explicit 'Healthy' label so the column has no gaps.
disorder_col = "Sleep Disorder"
df[disorder_col] = df[disorder_col].fillna('Healthy')


In [12]:
# Column holding the binary label to predict.
target = "sleep_quality_bin"

# Columns treated as numeric inputs (scaled by the preprocessor).
numeric_features = [
    "Age", "Sleep Duration", "Physical Activity Level",
    "Stress Level", "Heart Rate", "Daily Steps",
]

# Columns treated as categorical inputs (one-hot encoded by the preprocessor).
categorical_features = ["Gender", "Occupation", "BMI Category", "Sleep Disorder"]

# Feature matrix: numeric columns first, then categorical, in list order.
# "Quality of Sleep" (the source of the target), "Blood Pressure" and
# "Person ID" are not part of the selection.
X = df[[*numeric_features, *categorical_features]]
y = df[target]

In [5]:
# Numeric columns: standardise with StandardScaler.
scaling_steps = [("scaler", StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)

# Categorical columns: one-hot encode. handle_unknown="ignore" makes categories
# that were not seen during fit encode as all zeros instead of raising.
encoding_steps = [("onehot", OneHotEncoder(handle_unknown="ignore"))]
categorical_transformer = Pipeline(steps=encoding_steps)

# Route each feature group to its own transformer. The object is only
# constructed here; nothing is fitted in this cell.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [6]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both parts; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Make sure the output directory exists: joblib.dump does not create missing
# parent directories and would otherwise fail with FileNotFoundError on a
# fresh checkout.
Path("../artifacts").mkdir(parents=True, exist_ok=True)

# NOTE: the preprocessor is saved unfitted (no .fit() is called on it in the
# cells shown here), so whoever loads it must fit it on the training split
# before transforming any data.
joblib.dump(preprocessor, "../artifacts/preprocessor.joblib")

# The split is stored as a single tuple, in the order
# (X_train, X_test, y_train, y_test); unpack it in the same order when loading.
joblib.dump((X_train, X_test, y_train, y_test), "../artifacts/splits.joblib")

['../artifacts/splits.joblib']