In [7]:
data = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib

# Load dataset
data = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')

# 1. Mengonversi kolom 'Blood Pressure' menjadi dua kolom terpisah
data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True)

# 2. Ubah tipe data kolom 'Systolic' dan 'Diastolic' menjadi numerik
data['Systolic'] = pd.to_numeric(data['Systolic'], errors='coerce')
data['Diastolic'] = pd.to_numeric(data['Diastolic'], errors='coerce')

# 3. Drop kolom 'Blood Pressure' yang sudah tidak diperlukan lagi
data = data.drop(columns=['Blood Pressure'])

# Encoding categorical features
label_encoders = {}
categorical_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))  # Encoding string ke numerik
    label_encoders[column] = le

# Selecting features and target
X = data.drop(['Person ID', 'Stress Level'], axis=1)
y = data['Stress Level']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42))
])

# Training the model
pipeline.fit(X_train, y_train)

# Model performance on training data
train_predictions = pipeline.predict(X_train)
train_r2 = metrics.r2_score(y_train, train_predictions)
train_mse = metrics.mean_squared_error(y_train, train_predictions)

# Model performance on test data
test_predictions = pipeline.predict(X_test)
test_r2 = metrics.r2_score(y_test, test_predictions)
test_mse = metrics.mean_squared_error(y_test, test_predictions)

# Printing performance results
print("Training R^2:", train_r2)
print("Training MSE:", train_mse)
print("Test R^2:", test_r2)
print("Test MSE:", test_mse)

# Saving the pipeline
joblib.dump(pipeline, 'stress_predictor_pipeline.pkl')

# Saving label encoders for categorical columns
joblib.dump(label_encoders, 'label_encoders.pkl')

print("Model and encoders saved successfully!")


Training R^2: 0.9963814015572858
Training MSE: 0.01128026755852843
Test R^2: 0.9901468077842267
Test MSE: 0.03078400000000002
Model and encoders saved successfully!
