# Predict on all merged data

In [1]:
import pandas as pd
import os 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import autosklearn.regression
from tpot import TPOTRegressor

# Move to the project root so the relative data paths used later resolve.
# The DATAMINING_DIR environment variable overrides the original
# machine-specific location, which is kept as the fallback.
os.chdir(os.environ.get("DATAMINING_DIR", "/home/cernerrood246/University/DataMining"))

# Relevant models: Linear Regression, SVM, LightGBM, CatBoost, XGBoost

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Participant identifiers "P1" .. "P17"; each one names a sub-folder of merged_dataset/.
participants_list = [f"P{number}" for number in range(1, 18)]

# Load each participant's merged recording, in the same order as participants_list.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
Merged_all_list = [
    pd.read_pickle(f"merged_dataset/{participant}/ALL.pkl")
    for participant in participants_list
]

# Display the first participant's frame as the cell output.
Merged_all_list[0]

Unnamed: 0_level_0,Time[s],Delta_Time,MET,value__bpm,value__confidence,X,Y,Z,Magnitude
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-12-03 16:58:51,1.0,0 days 00:00:00,1.310987,91.0,1.0,30.0,0.0,55.666667,63.235890
2021-12-03 16:58:52,2.0,0 days 00:00:01,1.310172,91.0,1.0,30.0,0.0,55.750000,63.309261
2021-12-03 16:58:53,3.0,0 days 00:00:02,1.522019,91.0,1.0,30.0,0.0,55.800000,63.353295
2021-12-03 16:58:54,4.0,0 days 00:00:03,1.627943,91.0,1.0,30.0,0.0,55.600000,63.177211
2021-12-03 16:58:55,5.0,0 days 00:00:04,1.563248,88.0,1.0,30.0,0.0,55.800000,63.353295
...,...,...,...,...,...,...,...,...,...
2021-12-03 17:28:48,1798.0,0 days 00:29:57,5.323530,139.0,1.0,20.2,-49.4,36.000000,64.377015
2021-12-03 17:28:49,1799.0,0 days 00:29:58,5.110387,139.0,1.0,18.0,-50.2,33.800000,63.138578
2021-12-03 17:28:50,1800.0,0 days 00:29:59,4.461832,142.0,2.0,17.8,-50.2,34.200000,63.297077
2021-12-03 17:28:51,1801.0,0 days 00:30:00,4.118857,142.0,2.0,15.2,-49.4,36.200000,63.101822


## Add Features

Possible:
- Time[s]
- Lag feature for every value
- Activity
- Demographics

Time-series prediction:
- Classify activity?

## All Data without timestamp

In [3]:
# Stack every participant's frame row-wise, then discard all rows that contain
# a missing value (roughly 6k rows at the time the original note was written).
all_merged_df = pd.concat(Merged_all_list, axis=0).dropna()


In [4]:
# Columns used as model inputs; the MET column is the regression target.
feature_columns = ["value__bpm", "value__confidence", "X", "Y", "Z", "Magnitude"]

# Convert both to plain NumPy arrays for the estimators.
X = all_merged_df.loc[:, feature_columns].to_numpy()
y = all_merged_df.loc[:, "MET"].to_numpy()

In [5]:


# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so consecutive samples
# of the same recording can end up in both splits — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the min-max range from the training rows only, then apply that same
# scaling to both the training and the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Display names and the matching regressors, kept in the same order.
# LightGBM and CatBoost are configured so they do not print training logs.
names = ["Linear Regression", "Support Vector Machine", "LightGBM", "CatBoost", "XGBoost"]
models = [
    LinearRegression(),
    SVR(),
    LGBMRegressor(verbose=-1),
    CatBoostRegressor(verbose=0),
    XGBRegressor(),
]

# Fit each regressor on the scaled training split and report its MSE on the
# scaled test split.
for name, model in zip(names, models):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} Mean Squared Error: {mse}")


Linear Regression Mean Squared Error: 2.3550734383594145
Support Vector Machine Mean Squared Error: 1.6395782911619274
LightGBM Mean Squared Error: 0.9056028958261498
CatBoost Mean Squared Error: 0.7788736024328365
XGBoost Mean Squared Error: 0.7454982801858212


In [9]:
# Search budget for auto-sklearn: 120 for the whole task, 30 per single run
# (auto-sklearn documents both limits as seconds).
# NOTE(review): the saved run shows a single-entry leaderboard and a much higher
# MSE than the other models — the budget may be too small for any candidate to
# finish on this data; confirm before relying on this number.
automl_settings = {
    "time_left_for_this_task": 120,
    "per_run_time_limit": 30,
}
automl = autosklearn.regression.AutoSklearnRegressor(**automl_settings)

# Run the search on the scaled training split.
automl.fit(X_train_scaled, y_train)

# Score the resulting ensemble on the scaled test split.
y_pred = automl.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"AutoSklearn Mean Squared Error: {mse}")

AutoSklearn Mean Squared Error: 5.15659864263747


In [10]:
# Show the auto-sklearn leaderboard (rank, ensemble weight, type, cost, duration)
# as the cell's rich table output rather than a plain-text print of the frame.
automl.leaderboard()


          rank  ensemble_weight  type  cost duration
model_id                                            
1            1              1.0  <NA>  <NA>     <NA>


In [12]:
# TPOT pipeline search: 3 generations with a population of 10, progress output
# enabled (verbosity=2) and a fixed seed for repeatability.
tpot_settings = {
    "generations": 3,
    "population_size": 10,
    "verbosity": 2,
    "random_state": 42,
}
tpot = TPOTRegressor(**tpot_settings)

# Run the search on the scaled training split.
tpot.fit(X_train_scaled, y_train)

# Score the best pipeline found on the scaled test split.
y_pred = tpot.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"TPOT Mean Squared Error: {mse}")


                                                                            
Generation 1 - Current best internal CV score: -0.7812734777455226
                                                                            
Generation 2 - Current best internal CV score: -0.7812734777455226
                                                                            
Generation 3 - Current best internal CV score: -0.7099590661212408
                                                                            
Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.35000000000000003, min_samples_leaf=7, min_samples_split=13, n_estimators=100)
TPOT Mean Squared Error: 0.786400666250984
