# Predict on all merged data

In [1]:
import pandas as pd
import os 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import autosklearn.regression
from tpot import TPOTRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

os.chdir("/home/cernerrood246/University/DataMining")

#Relevant Models: LR, SVM, LightGBM, CatBoost, XGBoost 

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Feature engineering

In [2]:
participants_list = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17'] 

Merged_all_list = []
for participant in participants_list:
    df = pd.read_pickle(f"merged_dataset/{participant}/ALL.pkl")
    df['Participant'] = participant
    Merged_all_list.append(df)


for participant in Merged_all_list:
    #add lag features
    relevant_columns = ["value__bpm", "value__confidence", "X", "Y", "Z", "Magnitude"]
    X_columns = relevant_columns.copy()
    #fill missing values 
    participant.ffill(inplace=True)
    participant.bfill(inplace=True)
    for column in relevant_columns:
        for i in range(-10, 10, 2):
            participant[f"{column}_lag_{i}"] = participant[column].shift(i)
            X_columns.append(f"{column}_lag_{i}")
            participant.bfill(inplace=True)
            participant.ffill(inplace=True)
    

## Add Features

Possible: 
-Time[s]
-Lag feature for every value
-activity
-demographics 

Timeseries prediction 
-classify activity?

## All Data without timestamp

In [3]:
all_merged_df = pd.concat(Merged_all_list, axis=0)

#all_merged_df = all_merged_df.dropna() # drops roughly 6k atm
print(len(all_merged_df))
X = all_merged_df[X_columns].to_numpy()
y = all_merged_df["MET"].to_numpy()
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import mean_squared_error
# Get the list of participants
participants = all_merged_df['Participant']

# Initialize the GroupKFold object
group_kfold = GroupKFold(n_splits=2)

model = XGBRegressor()
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Perform cross-validation and calculate mean squared error
scores = cross_val_score(model, X, y, groups=participants, cv=group_kfold, scoring='neg_mean_squared_error')
print("Cross-validation scores:", -scores)
mse = -scores.mean()

print("Mean squared error:", mse)

27656
Cross-validation scores: [2.52608937 3.5918013 ]
Mean squared error: 3.058945333462028


In [6]:

#parameters chosen from https://datascience.stackexchange.com/questions/108233/recommendations-for-tuning-xgboost-hyperparams
#https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV

group_kfold = GroupKFold(n_splits=5)

opt = BayesSearchCV(
    XGBRegressor(),
    {
        'max_depth': Integer(3, 10),
        'min_child_weight': Real(0.5, 5),
        'subsample': Real(0.5, 1),
        'colsample_bytree': Real(0.01, 1),
        'learning_rate':Real(0.01, 1),
    },
    n_iter=30,
    cv=group_kfold.get_n_splits(X, y, participants),
    scoring = 'neg_mean_squared_error',
    verbose = 2
)

opt.fit(X, y)

print("val. score: %s" % opt.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END colsample_bytree=0.19448905521263748, learning_rate=0.4131629110854415, max_depth=8, min_child_weight=2.3169967068192054, subsample=0.8948130250022777; total time=  15.6s
[CV] END colsample_bytree=0.19448905521263748, learning_rate=0.4131629110854415, max_depth=8, min_child_weight=2.3169967068192054, subsample=0.8948130250022777; total time=  10.8s
[CV] END colsample_bytree=0.19448905521263748, learning_rate=0.4131629110854415, max_depth=8, min_child_weight=2.3169967068192054, subsample=0.8948130250022777; total time=  14.0s
[CV] END colsample_bytree=0.19448905521263748, learning_rate=0.4131629110854415, max_depth=8, min_child_weight=2.3169967068192054, subsample=0.8948130250022777; total time=  15.4s
[CV] END colsample_bytree=0.19448905521263748, learning_rate=0.4131629110854415, max_depth=8, min_child_weight=2.3169967068192054, subsample=0.8948130250022777; total time=  12.1s
Fitting 5 folds for each of 1 candidates,

In [17]:
# Perform test-train split
#leave one out validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data with MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
# List of models to evaluate, set LGMB to not output info

models = [XGBRegressor()] #LinearRegression(), SVR(), LGBMRegressor(verbose=-1), CatBoostRegressor(verbose=0),
names = ["XGBoost"] #"Linear Regression", "Support Vector Machine", "LightGBM", "CatBoost", 
models_tuned = dict()

# Perform operations for each model
for model,name in zip(models, names):
    model.fit(X_train_scaled, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test_scaled)
    
    # Calculate mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} Mean Squared Error: {mse}")
    models_tuned[name] = model


XGBoost Mean Squared Error: 0.11158988484317221


In [55]:
participant = "P3"

X_train, X_test, y_train, y_test = all_merged_df[all_merged_df["Participant"] != participant][X_columns].to_numpy(), all_merged_df[all_merged_df["Participant"] == participant][X_columns].to_numpy(), all_merged_df[all_merged_df["Participant"] != participant]["MET"].to_numpy(), all_merged_df[all_merged_df["Participant"] == participant]["MET"].to_numpy()
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = XGBRegressor()
model.fit(X_train_scaled, y_train)
    
# Predict on test data
y_pred = model.predict(X_test_scaled)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"{name} Mean Squared Error: {mse}")


XGBoost Mean Squared Error: 3.676475017709935


In [58]:
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=120,
    per_run_time_limit=30,
)
# Fit the AutoSklearnRegressor
automl.fit(X_train_scaled, y_train)

# Get the predicted values
y_pred = automl.predict(X_test_scaled)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"AutoSklearn Mean Squared Error: {mse}")



In [12]:
# Define the TPOTRegressor
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=2, random_state=42)

# Fit the TPOTRegressor
tpot.fit(X_train_scaled, y_train)

# Get the predicted values
y_pred = tpot.predict(X_test_scaled)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"TPOT Mean Squared Error: {mse}")


                                                                            
Generation 1 - Current best internal CV score: -0.7812734777455226
                                                                            
Generation 2 - Current best internal CV score: -0.7812734777455226
                                                                            
Generation 3 - Current best internal CV score: -0.7099590661212408
                                                                            
Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.35000000000000003, min_samples_leaf=7, min_samples_split=13, n_estimators=100)
TPOT Mean Squared Error: 0.786400666250984
