# Create model

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read the sleep-efficiency CSV into a DataFrame.
# The path is relative; adjust it if your copy of the file lives elsewhere.
data = pd.read_csv(filepath_or_buffer="./data/Sleep_Efficiency.csv")

In [3]:
# Columns that are left out of the data before modelling.
excluded_attributes = [
    "Alcohol consumption",
    "Awakenings",
    "Caffeine consumption",
    "ID",
    "Wakeup time",
    "Bedtime",
]
# axis="columns" removes the listed labels from the column axis.
data = data.drop(excluded_attributes, axis="columns")

In [11]:
# One-hot encode the categorical columns; drop_first=True asks pandas to omit
# the first level of every column it encodes.
# NOTE(review): the stored output below lists both Gender_* columns and both
# "Smoking status_*" columns, which does not match drop_first=True -- it looks
# like a stale result from an earlier run. Re-run from a fresh kernel and
# confirm which feature set the trained model actually saw.
data = pd.get_dummies(data=data, drop_first=True)
data

Unnamed: 0,Age,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Exercise frequency,Gender_Female,Gender_Male,Smoking status_No,Smoking status_Yes
0,65,6.0,0.88,18,70,12,3.0,True,False,False,True
1,69,7.0,0.66,19,28,53,3.0,False,True,False,True
2,40,8.0,0.89,20,70,10,3.0,True,False,True,False
3,40,6.0,0.51,23,25,52,1.0,True,False,False,True
4,57,8.0,0.76,27,55,18,3.0,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...
447,27,7.5,0.91,22,57,21,5.0,True,False,True,False
448,52,6.0,0.74,28,57,15,3.0,False,True,True,False
449,40,8.5,0.55,20,32,48,0.0,True,False,False,True
450,45,7.0,0.76,18,72,10,3.0,False,True,True,False


In [12]:
# 'Sleep efficiency' is the value to predict; every other column is a feature.
target = 'Sleep efficiency'
y = data[target]
X = data.drop(columns=target)

In [13]:
# Hold out part of the rows so the model is scored on data it was not fit on.
# test_size=0.25 is the value train_test_split uses when no size is given, so
# the split is unchanged; it is spelled out here only for readability.
# To mimic RapidMiner's use of the full data, skip the split and fit on X, y
# directly. NOTE(review): test_size=0 looks to be rejected by current
# scikit-learn -- confirm before using it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2001,
)

In [14]:
# Fit a 100-tree random forest regressor on the training split.
model = RandomForestRegressor(
    criterion='squared_error',  # presumably the counterpart of RapidMiner's 'least_square' -- confirm
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=4,
    n_jobs=-1,  # -1: run on all available cores
    random_state=1992,  # fixed seed for the forest
)
model.fit(X_train, y_train)


In [18]:
# Predict on the held-out rows, then take the square root of the mean squared
# error to obtain the RMSE.
predictions = model.predict(X_test)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions) ** 0.5

In [19]:
# Report the RMSE that was computed on the test split.
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.06843186773641313


# Save model

In [22]:
import pickle
from pathlib import Path

# Persist the fitted random forest regressor (the sleep-efficiency model
# trained above) as a pickle file.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
model_pkl_file = "./models/sleep_quality_model.pkl"

# Create ./models/ when it is missing; open(..., 'wb') would otherwise fail
# with FileNotFoundError on a fresh checkout.
Path(model_pkl_file).parent.mkdir(parents=True, exist_ok=True)

with open(model_pkl_file, 'wb') as file:
    pickle.dump(model, file)