# Predict on all merged data

In [1]:
import pandas as pd
import os 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import autosklearn.regression
from tpot import TPOTRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import mean_squared_error

os.chdir("/home/cernerrood246/University/DataMining")

#Relevant Models: LR, SVM, LightGBM, CatBoost, XGBoost 

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Feature engineering

In [2]:
participants_list = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17'] 

Merged_all_list = []
for participant in participants_list:
    df = pd.read_pickle(f"merged_dataset/{participant}/ALL.pkl")
    df['Participant'] = participant
    Merged_all_list.append(df)


for participant in Merged_all_list:
    #add lag features
    relevant_columns = ["value__bpm", "value__confidence", "X", "Y", "Z", "Magnitude"]
    X_columns = relevant_columns.copy()
    X_columns.extend(['Activity','BMI', 'Fat', 'Muscle',
       'Water', 'Bone', 'Weight', 'Height', 'Age', 'Gender_F', 'Gender_M'])
    #fill missing values 
    participant.ffill(inplace=True)
    participant.bfill(inplace=True)
    for column in relevant_columns:
        for i in range(1, 10, 2):
            participant[f"{column}_lag_{i}"] = participant[column].shift(i)
            X_columns.append(f"{column}_lag_{i}")
            participant.bfill(inplace=True)
            participant.ffill(inplace=True)
    

## Add Features

Possible: 
-Time[s]
-Lag feature for every value
-activity
-demographics 

Timeseries prediction 
-classify activity?

## All Data without timestamp

In [3]:
n_splits = 4

parameters = dict([('colsample_bytree', 0.6174372236293935),
             ('learning_rate', 0.03335590817699578),
             ('max_depth', 3),
             ('min_child_weight', 2.8157820836606042),
             ('subsample', 0.5)])

all_merged_df = pd.concat(Merged_all_list, axis=0)

#all_merged_df = all_merged_df.dropna() # drops roughly 6k atm
print(len(all_merged_df))
X = all_merged_df[X_columns].to_numpy()
y = all_merged_df["MET"].to_numpy()

# Get the list of participants
participants = all_merged_df['Participant']

# Initialize the GroupKFold object
group_kfold = GroupKFold(n_splits=n_splits)
model = XGBRegressor(**parameters)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)


27656


In [4]:
# Perform cross-validation and calculate mean squared error
scores = cross_val_score(model, X, y, groups=participants, cv=group_kfold, scoring='neg_mean_squared_error', n_jobs=-1)
print("Cross-validation scores:", -scores)
mse = -scores.mean()

print("Mean squared error:", mse)

Cross-validation scores: [1.52515207 2.10864327 1.48383146 1.08943367]
Mean squared error: 1.551765117732363


In [82]:
for i,group in enumerate(group_kfold.split(X, y, participants)):
    print(f"Fold {i}")
    print(f"Train size: {group[0].size}")
    print(f"Test size: {group[1].size}")
    print(f"Train participants: {participants.iloc[group[0]].unique()}")
    print(f"Test participants: {participants.iloc[group[1]].unique()}")

Fold 0
Train size: 20991
Test size: 6665
Train participants: ['P1' 'P3' 'P4' 'P6' 'P7' 'P8' 'P9' 'P11' 'P12' 'P13' 'P14' 'P16' 'P17']
Test participants: ['P2' 'P5' 'P10' 'P15']
Fold 1
Train size: 20861
Test size: 6795
Train participants: ['P1' 'P2' 'P5' 'P6' 'P7' 'P8' 'P9' 'P10' 'P13' 'P14' 'P15' 'P16' 'P17']
Test participants: ['P3' 'P4' 'P11' 'P12']
Fold 2
Train size: 20848
Test size: 6808
Train participants: ['P2' 'P3' 'P4' 'P5' 'P7' 'P8' 'P10' 'P11' 'P12' 'P13' 'P14' 'P15' 'P16']
Test participants: ['P1' 'P6' 'P9' 'P17']
Fold 3
Train size: 20268
Test size: 7388
Train participants: ['P1' 'P2' 'P3' 'P4' 'P5' 'P6' 'P9' 'P10' 'P11' 'P12' 'P15' 'P17']
Test participants: ['P7' 'P8' 'P13' 'P14' 'P16']


In [65]:

#parameters chosen from https://datascience.stackexchange.com/questions/108233/recommendations-for-tuning-xgboost-hyperparams
#https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV

group_kfold = GroupKFold(n_splits=n_splits)

opt = BayesSearchCV(
    XGBRegressor(),
    {
        'max_depth': Integer(3, 10),
        'min_child_weight': Real(0.5, 5),
        'subsample': Real(0.5, 1),
        'colsample_bytree': Real(0.01, 1),
        'learning_rate':Real(0.01, 1),
    },
    n_iter=50,
    cv=group_kfold.get_n_splits(X, y, participants),
    scoring = 'neg_mean_squared_error',
    verbose = 2,
    n_jobs=-1

)

opt.fit(X, y)

print("val. score: %s" % opt.best_score_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] END colsample_bytree=0.03064363546483409, learning_rate=0.6170321230402298, max_depth=4, min_child_weight=4.825734617953395, subsample=0.9340952781120233; total time=   0.6s
[CV] END colsample_bytree=0.03064363546483409, learning_rate=0.6170321230402298, max_depth=4, min_child_weight=4.825734617953395, subsample=0.9340952781120233; total time=   0.7s
[CV] END colsample_bytree=0.03064363546483409, learning_rate=0.6170321230402298, max_depth=4, min_child_weight=4.825734617953395, subsample=0.9340952781120233; total time=   0.7s
[CV] END colsample_bytree=0.03064363546483409, learning_rate=0.6170321230402298, max_depth=4, min_child_weight=4.825734617953395, subsample=0.9340952781120233; total time=   0.7s
Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] END colsample_bytree=0.9499807530871895, learning_rate=0.4854750514356732, max_depth=9, min_child_weight=0.9910330054928767, subsample=0.5553373551551177; total 

In [None]:
-2.080954642766562
OrderedDict([('colsample_bytree', 0.01),
             ('learning_rate', 0.09154135084696965),
             ('max_depth', 3),
             ('min_child_weight', 5.0),
             ('subsample', 0.5)])

In [66]:
opt.best_params_

OrderedDict([('colsample_bytree', 0.6174372236293935),
             ('learning_rate', 0.03335590817699578),
             ('max_depth', 3),
             ('min_child_weight', 2.8157820836606042),
             ('subsample', 0.5)])

## Archive

In [18]:
# List of models to evaluate, set LGMB to not output info

models = [XGBRegressor()] #LinearRegression(), SVR(), LGBMRegressor(verbose=-1), CatBoostRegressor(verbose=0),
names = ["XGBoost"] #"Linear Regression", "Support Vector Machine", "LightGBM", "CatBoost", 
models_tuned = dict()

# Perform operations for each model
for model,name in zip(models, names):
    model.fit(X_train_scaled, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test_scaled)
    
    # Calculate mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} Mean Squared Error: {mse}")
    models_tuned[name] = model


XGBoost Mean Squared Error: 0.11158988484317221


In [55]:
participant = "P3"

X_train, X_test, y_train, y_test = all_merged_df[all_merged_df["Participant"] != participant][X_columns].to_numpy(), all_merged_df[all_merged_df["Participant"] == participant][X_columns].to_numpy(), all_merged_df[all_merged_df["Participant"] != participant]["MET"].to_numpy(), all_merged_df[all_merged_df["Participant"] == participant]["MET"].to_numpy()
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = XGBRegressor()
model.fit(X_train_scaled, y_train)
    
# Predict on test data
y_pred = model.predict(X_test_scaled)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"{name} Mean Squared Error: {mse}")


XGBoost Mean Squared Error: 3.676475017709935


In [58]:
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=120,
    per_run_time_limit=30,
)
# Fit the AutoSklearnRegressor
automl.fit(X_train_scaled, y_train)

# Get the predicted values
y_pred = automl.predict(X_test_scaled)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"AutoSklearn Mean Squared Error: {mse}")



In [97]:
# Define the TPOTRegressor
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=2, random_state=42, cv = group_kfold.get_n_splits(X, y, participants))

# Fit the TPOTRegressor
tpot.fit(X, y)

# # Get the predicted values
# y_pred = tpot.predict(X_test_scaled)

# # Calculate the mean squared error
# mse = mean_squared_error(y_test, y_pred)
# print(f"TPOT Mean Squared Error: {mse}")


                                                                              
Generation 1 - Current best internal CV score: -2.274544481594872
                                                                            
Generation 2 - Current best internal CV score: -2.2734384906592924
                                                                            
Generation 3 - Current best internal CV score: -2.2656305712578018
                                                                            
Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.05, min_samples_leaf=7, min_samples_split=16, n_estimators=100)
