# Create model

In [3]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Read the raw sleep-efficiency table from disk.
# The path is relative to the notebook's working directory; adjust it if the CSV lives elsewhere.
DATA_PATH = '../data/Sleep_Efficiency.csv'
data = pd.read_csv(DATA_PATH)

In [5]:
# Columns that are left out of the modelling frame
excluded_attributes = [
    "Alcohol consumption",
    "Awakenings",
    "Caffeine consumption",
    "ID",
    "Wakeup time",
    "Bedtime",
]
# drop() returns a new frame and `data` is rebound to it.
# NOTE: re-running this cell on the already-reduced frame raises KeyError,
# because the listed columns no longer exist; reload the CSV first.
data = data.drop(excluded_attributes, axis="columns")

In [6]:
# One-hot encode the remaining categorical columns. drop_first=True omits the
# first level of each encoded column, so a k-level column yields k-1 indicator
# columns (the displayed frame shows Gender_Male and Smoking status_Yes).
data = pd.get_dummies(data, drop_first=True)
# Bare last expression: renders the encoded frame as the cell output
data

Unnamed: 0,Age,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Exercise frequency,Gender_Male,Smoking status_Yes
0,65,6.0,0.88,18,70,12,3.0,False,True
1,69,7.0,0.66,19,28,53,3.0,True,True
2,40,8.0,0.89,20,70,10,3.0,False,False
3,40,6.0,0.51,23,25,52,1.0,False,True
4,57,8.0,0.76,27,55,18,3.0,True,False
...,...,...,...,...,...,...,...,...,...
447,27,7.5,0.91,22,57,21,5.0,False,False
448,52,6.0,0.74,28,57,15,3.0,True,False
449,40,8.5,0.55,20,32,48,0.0,False,True
450,45,7.0,0.76,18,72,10,3.0,True,False


In [7]:
# 'Sleep efficiency' is the value to predict; every other column is a feature
target = 'Sleep efficiency'
y = data[target]                       # target series
X = data.drop(target, axis="columns")  # feature frame without the target column

In [8]:
# Hold out part of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible between runs.
# test_size=0.25 is the library default written out explicitly.
# NOTE(review): an earlier note suggested test_size=0 to mimic RapidMiner's
# "use all data" behaviour; train_test_split appears to reject 0, so to train
# on everything, skip the split and fit on X, y directly — confirm before relying on it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2001,
)

In [9]:
# Random-forest hyperparameters, gathered in one place so they are easy to scan and tune
rf_params = {
    "n_estimators": 100,             # number of trees in the forest
    "max_depth": 10,                 # cap on the depth of each tree
    "min_samples_split": 4,          # samples required before a node may split
    "min_samples_leaf": 2,           # samples required in every leaf
    "criterion": "squared_error",    # noted in the original as the 'least_square' equivalent
    "random_state": 1992,            # fixed seed for reproducible forests
    "n_jobs": -1,                    # parallel execution
}
model = RandomForestRegressor(**rf_params)

# Fit on the training split; as the last expression, the fitted estimator is
# also what the cell displays.
model.fit(X_train, y_train)


In [10]:
# Predict the held-out rows and summarise the error as a root mean squared error
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5  # square root of the MSE

In [15]:
# Display the held-out feature rows that were passed to model.predict.
# NOTE(review): this cell's execution count (15) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
X_test

Unnamed: 0,Age,Sleep duration,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Exercise frequency,Gender_Male,Smoking status_Yes
335,44,10.0,28,60,12,1.0,False,True
389,36,8.0,25,55,20,4.0,False,False
263,23,9.0,26,56,18,1.0,False,False
348,57,7.0,22,65,13,1.0,True,True
248,25,7.5,24,63,13,0.0,True,True
...,...,...,...,...,...,...,...,...
269,27,7.0,22,57,21,2.0,True,False
69,40,6.0,20,35,45,3.0,True,True
31,18,8.0,22,30,48,0.0,True,False
266,24,7.0,20,32,48,0.0,True,True


In [11]:
# Display the array returned by model.predict(X_test)
predictions

array([0.82984911, 0.86702163, 0.76977833, 0.8542644 , 0.84547372,
       0.54572636, 0.84822982, 0.86515173, 0.895231  , 0.87731546,
       0.8720162 , 0.85147372, 0.8575952 , 0.83761108, 0.8538437 ,
       0.89355299, 0.89533453, 0.86565387, 0.8747151 , 0.87788309,
       0.88166871, 0.89924244, 0.55078405, 0.83883444, 0.84314026,
       0.78271236, 0.84644624, 0.87354181, 0.83655254, 0.84312409,
       0.86592153, 0.53251381, 0.60099738, 0.6316194 , 0.54171833,
       0.82434232, 0.87334284, 0.84568387, 0.86171581, 0.78039754,
       0.83194447, 0.8576645 , 0.53831232, 0.82352294, 0.82597059,
       0.92470546, 0.86230548, 0.87939815, 0.86148202, 0.85666199,
       0.83392817, 0.87777807, 0.91034847, 0.88279936, 0.84749369,
       0.66440964, 0.84190562, 0.53648012, 0.90257601, 0.81563523,
       0.54300083, 0.56051667, 0.90195103, 0.89505278, 0.63494643,
       0.54572636, 0.86508266, 0.86397694, 0.84566728, 0.61634151,
       0.90368205, 0.80990284, 0.53301893, 0.84528377, 0.88900

In [12]:
# Report the hold-out RMSE computed from mean_squared_error(y_test, predictions)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.06863502301894572


# Save model

In [13]:
# Persist the fitted random-forest regressor so it can be reloaded later
# without retraining. (`pickle` is imported with the other modules in the
# notebook's import cell.)
# NOTE: only unpickle this file from a trusted source — pickle.load can run
# arbitrary code — and load it with the same scikit-learn version used here.
model_pkl_file = "sleep_quality_model.pkl"

# Binary write mode is required by pickle; the `with` block closes the file
# handle even if serialization raises.
with open(model_pkl_file, 'wb') as file:
    pickle.dump(model, file)