In [1]:
# Sleep Health Prediction - Clean + Explanations (Colab-ready)
# -----------------------------------------------------

# 0. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

In [2]:


# 1. Config / Path
# Both locations can be overridden without editing the notebook by setting the
# SLEEP_DATA_PATH / SLEEP_ARTIFACT_DIR environment variables before the kernel
# starts. When they are unset, the original local Windows paths are used.
DATA_PATH = os.environ.get(
    "SLEEP_DATA_PATH",
    "D:\\PythonFlask\\notebook\\Sleep_health_and_lifestyle_dataset.csv",
)
ARTIFACT_DIR = os.environ.get("SLEEP_ARTIFACT_DIR", "D:\\PythonFlask\\artifacts")

# Create the artifact folder up front so the later joblib.dump calls cannot
# fail on a missing directory; exist_ok makes re-running this cell harmless.
os.makedirs(ARTIFACT_DIR, exist_ok=True)


In [3]:
# 2. Load data (very small print)
# Read the CSV and discard the 'Person ID' column when the file has one
# (errors='ignore' makes the drop a no-op if the column is absent).
df = pd.read_csv(DATA_PATH).drop(columns=['Person ID'], errors='ignore')

# Minimal sanity output: row count and the column names that were loaded.
print("Loaded rows:", df.shape[0])
print("Columns:", list(df.columns))

Loaded rows: 414
Columns: ['Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps', 'Sleep Disorder']


In [4]:
# 3. Quick cleaning helper: split Blood Pressure if present
if 'Blood Pressure' in df.columns:
    # Split the text on '/' into separate columns; the values are cast to str
    # first so missing entries do not break the .str accessor.
    bp_parts = df['Blood Pressure'].astype(str).str.split('/', expand=True)
    has_both_readings = bp_parts.shape[1] >= 2

    # Unparsable pieces become NaN via errors='coerce'; if no value contained
    # a '/', both new columns are filled with NaN.
    df['Systolic_BP'] = (
        pd.to_numeric(bp_parts[0], errors='coerce') if has_both_readings else np.nan
    )
    df['Diastolic_BP'] = (
        pd.to_numeric(bp_parts[1], errors='coerce') if has_both_readings else np.nan
    )

    # The raw text column is no longer needed once the two numbers exist.
    df = df.drop(columns=['Blood Pressure'], errors='ignore')



In [5]:
# 4. Prepare target & features (target-first!)
TARGET_COL = 'Sleep Disorder'
if TARGET_COL not in df:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataset")

# Missing target values are treated as their own class, "No Sleep Disorder".
# NOTE(review): presumably the NaNs come from a literal "None" label in the CSV
# being parsed as missing by read_csv — confirm against the raw file.
df[TARGET_COL] = df[TARGET_COL].fillna("No Sleep Disorder")

# Labels are kept as strings here; numeric encoding happens in a later step.
y = df[TARGET_COL]

# Everything except the target is a raw (un-encoded) feature.
X_raw = df.drop(TARGET_COL, axis=1)

In [6]:
# 5. Basic imputation (numeric) and categorical fill
num_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_raw.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric columns: replace NaN with the column median.
# NOTE: the imputer is fitted on the whole dataset, i.e. before the train/test
# split, and is not refitted afterwards. The medians therefore also see rows
# that later end up in the test set; this is accepted here for simplicity.
num_imputer = SimpleImputer(strategy='median')
if num_cols:
    # Guard: fitting on a frame with zero columns raises, so skip when the
    # dataset has no numeric features at all.
    X_raw[num_cols] = num_imputer.fit_transform(X_raw[num_cols])

# Categorical columns: replace NaN with the most frequent value.
for c in cat_cols:
    modes = X_raw[c].mode()
    # mode() is empty when every value in the column is NaN; leave such a
    # column untouched instead of failing on .iloc[0].
    if not modes.empty:
        X_raw[c] = X_raw[c].fillna(modes.iloc[0])


In [7]:
# 6. (Optional) Basic feature engineering — keep light and deterministic
# Sleep_Deficit = fixed 8.0 reference value minus the recorded sleep duration.
# The feature is only added when the source column is available.
if 'Sleep Duration' in X_raw:
    sleep_reference_hours = 8.0
    X_raw['Sleep_Deficit'] = sleep_reference_hours - X_raw['Sleep Duration']



In [8]:
# 7. Encode categorical features with one-hot (create feature matrix)
# drop_first=True removes the first dummy column of every categorical feature,
# so that category is represented by all of its remaining dummies being 0.
X_encoded = pd.get_dummies(data=X_raw, drop_first=True)

In [9]:
# 8. Label encode target (for modeling), and save label encoder
# The fitted encoder is kept so predictions can be mapped back to the string
# labels later (it is persisted together with the other artifacts).
le_target = LabelEncoder().fit(y)
y_enc = le_target.transform(y)   # integer codes 0..n-1



In [10]:
# 9. Train-test split
# Hold out 20% of the rows for testing; stratifying on the encoded target
# keeps the class proportions the same in both parts, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_enc,
    stratify=y_enc,
    random_state=42,
    test_size=0.2,
)

In [11]:
# 10. Scaling numeric features
# Every column of X_encoded is numeric after get_dummies, so the scaler is
# applied to the full matrix. It is fitted on the training part only and the
# same fitted statistics are then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# 11. Train model (RandomForest like original)
# Hyper-parameters are collected in one place so they are easy to inspect.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "random_state": 42,          # reproducible forest
    "n_jobs": -1,
    "class_weight": "balanced",  # keep balanced handling
}
model = RandomForestClassifier(**rf_params)

# fit() is the last expression so the notebook displays the fitted estimator.
model.fit(X_train_scaled, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# 12. Evaluate (minimal prints)
# Score the held-out split: overall accuracy first, then per-class metrics
# labelled with the original string class names.
y_pred = model.predict(X_test_scaled)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=le_target.classes_,
    digits=4,
)

print(f"Test Accuracy: {acc:.4f}")
print("\nClassification report:")
print(report_text)

Test Accuracy: 0.9036

Classification report:
                   precision    recall  f1-score   support

         Insomnia     0.9048    0.9048    0.9048        21
No Sleep Disorder     0.9535    0.9318    0.9425        44
      Sleep Apnea     0.7895    0.8333    0.8108        18

         accuracy                         0.9036        83
        macro avg     0.8826    0.8900    0.8860        83
     weighted avg     0.9056    0.9036    0.9044        83



In [14]:
# 13. Save artifacts for API
# Four files are written to ARTIFACT_DIR:
#   sleep_model.pkl    - fitted model
#   scaler.pkl         - fitted scaler
#   label_encoder.pkl  - fitted target label encoder
#   model_features.pkl - training feature column names, in training order
feature_columns = X_train.columns.tolist()  # column order before scaling

for artifact_file, artifact_obj in (
    ("sleep_model.pkl", model),
    ("scaler.pkl", scaler),
    ("label_encoder.pkl", le_target),
):
    joblib.dump(artifact_obj, os.path.join(ARTIFACT_DIR, artifact_file))

# Kept as the final expression so the cell still displays the written path.
joblib.dump(feature_columns, os.path.join(ARTIFACT_DIR, "model_features.pkl"))

['D:\\PythonFlask\\artifacts\\model_features.pkl']

In [15]:
# 14. Minimal function to preprocess a single input and predict (reproducible)
def preprocess_single(input_dict, feature_columns, num_imputer_obj, scaler_obj):
    """
    Turn one raw record into the scaled feature row expected by the model.

    The steps mirror the training cells in the same order: blood-pressure
    split, numeric imputation, Sleep_Deficit feature, one-hot encoding,
    alignment to the training columns, scaling.

    Parameters
    ----------
    input_dict : dict
        Raw values keyed by the original dataset column names (before encoding).
    feature_columns : list of str
        Column names, in order, of the one-hot encoded training matrix.
    num_imputer_obj : fitted imputer or None
        Used only when it exposes ``feature_names_in_`` (set when it was fitted
        on a DataFrame). Its ``transform`` then fills missing numeric values.
        Pass None to skip imputation; NaN values are then handed to the scaler
        unchanged.
    scaler_obj : fitted scaler
        Object with a ``transform`` method; it receives a one-row DataFrame
        whose columns are exactly ``feature_columns``.

    Returns
    -------
    The value returned by ``scaler_obj.transform`` for that one-row frame.
    """
    df_in = pd.DataFrame([input_dict])

    # Handle Blood Pressure like the training step: "120/80" -> two numbers,
    # anything unparsable -> NaN (so the imputer below can fill it).
    if 'Blood Pressure' in df_in.columns:
        bp_split = df_in['Blood Pressure'].astype(str).str.split('/', expand=True)
        if bp_split.shape[1] >= 2:
            df_in['Systolic_BP'] = pd.to_numeric(bp_split[0], errors='coerce')
            df_in['Diastolic_BP'] = pd.to_numeric(bp_split[1], errors='coerce')
        else:
            df_in['Systolic_BP'] = np.nan
            df_in['Diastolic_BP'] = np.nan
        df_in = df_in.drop(columns=['Blood Pressure'], errors='ignore')

    # A raw column whose name appears unchanged in feature_columns was numeric
    # during training (categoricals only survive as prefixed dummy columns).
    # Coerce those to numbers so a value such as "30" is not one-hot encoded.
    trained_names = set(feature_columns)
    for col in df_in.columns:
        if col in trained_names:
            df_in[col] = pd.to_numeric(df_in[col], errors='coerce')

    # Numeric imputation with the statistics learned during training.
    imputer_cols = getattr(num_imputer_obj, 'feature_names_in_', None)
    if imputer_cols is not None:
        imputer_cols = list(imputer_cols)
        for col in imputer_cols:
            if col not in df_in.columns:
                df_in[col] = np.nan  # absent input -> let the imputer fill it
            df_in[col] = pd.to_numeric(df_in[col], errors='coerce')
        df_in[imputer_cols] = num_imputer_obj.transform(df_in[imputer_cols])

    # Same engineered feature as in training; it was previously missing here,
    # which made the model always see Sleep_Deficit == 0 at prediction time.
    if 'Sleep Duration' in df_in.columns:
        df_in['Sleep_Deficit'] = 8.0 - pd.to_numeric(df_in['Sleep Duration'], errors='coerce')

    # Fill categorical NaNs with an empty string; the resulting dummy column is
    # unknown to the model and is discarded by the reindex below.
    for c in df_in.select_dtypes(include=['object', 'category']).columns:
        if df_in[c].isna().any():
            df_in[c] = df_in[c].fillna('')

    # One-hot encode WITHOUT drop_first: a single row has exactly one category
    # per column, so drop_first=True would delete every dummy and each input
    # would silently be treated as the baseline category.
    df_enc = pd.get_dummies(df_in, drop_first=False)

    # Align to the training layout: the baseline dummies (dropped in training)
    # and unseen categories disappear, and missing columns are added as 0.
    df_enc = df_enc.reindex(columns=feature_columns, fill_value=0)

    arr_scaled = scaler_obj.transform(df_enc)
    return arr_scaled


In [16]:
# Show the class labels in encoder order; the position of each label is the
# integer code the model predicts for it.
target_classes = le_target.classes_
print("Target classes:", target_classes)


Target classes: ['Insomnia' 'No Sleep Disorder' 'Sleep Apnea']


In [17]:
# 15. Quick manual test using saved artifacts (in-notebook)
# Round-trip check: read back every artifact written to ARTIFACT_DIR and use
# the reloaded objects (not the in-memory ones) for a sample prediction.
_model, _scaler, _le, _feat_cols = (
    joblib.load(os.path.join(ARTIFACT_DIR, artifact_name))
    for artifact_name in (
        "sleep_model.pkl",
        "scaler.pkl",
        "label_encoder.pkl",
        "model_features.pkl",
    )
)

# Example input (must use original dataset column names)
sample_input = {
    "Gender": "Male",
    "Age": 30,
    "Occupation": "Software Engineer",
    "Sleep Duration": 6.5,
    "Quality of Sleep": 6,
    "Physical Activity Level": 40,
    "Stress Level": 7,
    "BMI Category": "Normal",
    "Blood Pressure": "120/80",
    "Heart Rate": 75,
    "Daily Steps": 6000
}

# The numeric imputer is not among the saved artifacts, so the in-memory
# `num_imputer` from the imputation cell is passed through here.
X_sample = preprocess_single(sample_input, _feat_cols, num_imputer, _scaler)

# predict() returns integer class codes; map the single code back to its label.
pred_label_encoded = _model.predict(X_sample)[0]
pred_label = _le.inverse_transform([pred_label_encoded])[0]

print("Sample prediction:", pred_label)

# Save artefacts also to current dir for download if needed
for local_file, loaded_obj in (
    ("sleep_model.pkl", _model),
    ("scaler.pkl", _scaler),
    ("label_encoder.pkl", _le),
    ("model_features.pkl", _feat_cols),
):
    joblib.dump(loaded_obj, local_file)

print("\nArtifacts saved to", ARTIFACT_DIR, "and local dir (sleep_model.pkl etc).")

Sample prediction: No Sleep Disorder

Artifacts saved to D:\PythonFlask\artifacts and local dir (sleep_model.pkl etc).
