In [None]:
# Mount Google Drive at /content/drive; later cells read their CSV files
# from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')  

Mounted at /content/drive


----

------------






### Random Forest + SelectKBest

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif


# Load the training split from the mounted Drive folder. The previous
# relative path ("../gender_tsfel_merged/...") does not resolve once the
# notebook runs from Colab's working directory; the other model sections
# already read from this Drive location.
train_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/train_filled.csv")

# Label, bookkeeping and subject-metadata columns that must not be used as
# model inputs. Names missing from the CSV are simply ignored by the filter.
exclude_cols = ["Activity", "Subject", "Window", "set", "trial_id",
                "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
                "age", "height_(cm)", "weight_(kg)", "most_affected_side",
                "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
                "filename", "subject"]

feature_cols = [col for col in train_df.columns if col not in exclude_cols]

X_raw = train_df[feature_cols]
y_train = train_df["Activity"]

# Pipeline: drop constant features -> standardise -> keep the k highest
# scoring features (f_classif) -> class-balanced random forest.
pipeline_new = Pipeline([
    ("vt", VarianceThreshold(threshold=0.0)),
    ("scaler", StandardScaler()),
    ("feature_selection", SelectKBest(score_func=f_classif)),
    ("classifier", RandomForestClassifier(class_weight="balanced", random_state=42))
])

# Hyperparameter grid; shallow trees and leaf-size limits are used to keep
# the forest from overfitting.
param_grid3 = {
    "feature_selection__k": [30, 50, 70, 100],
    "classifier__n_estimators": [100, 150, 200],
    "classifier__max_depth": [2, 3, 4],
    "classifier__min_samples_split": [2, 5],
    "classifier__min_samples_leaf": [2, 3],
    "classifier__max_features": ["sqrt", "log2"]
}

# NOTE(review): cv=3 splits rows without regard to the "Subject" column. If
# windows of one subject end up in both the training and validation folds the
# CV score will be optimistic -- consider GroupKFold with
# groups=train_df["Subject"]; confirm before changing.
grid_search = GridSearchCV(
    estimator=pipeline_new,
    param_grid=param_grid3,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_raw, y_train)

# Report the winning hyperparameter combination.
print("Best Hyperparameters:")
print(grid_search.best_params_)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Best pipeline found by the grid search in the previous cell.
best_model_tsfel = grid_search.best_estimator_

# Load the validation split from the mounted Drive folder. The previous
# relative path ("../gender_tsfel_merged/...") does not resolve from Colab's
# working directory; the other sections already use this Drive location.
val_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/val_filled.csv")
X_val_raw = val_df[feature_cols]
y_val = val_df["Activity"]

# Predict on the raw validation features; all preprocessing happens inside
# the fitted pipeline.
y_pred = best_model_tsfel.predict(X_val_raw)

print("Classification Report:")
print(classification_report(y_val, y_pred))

# Compare training and validation accuracy to gauge overfitting.
train_score = best_model_tsfel.score(X_raw, y_train)
val_score = best_model_tsfel.score(X_val_raw, y_val)

print(f"Training accuracy: {train_score:.4f}")
print(f"Validation accuracy: {val_score:.4f}")


----------

### HistGradientBoosting + PCA

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.decomposition import PCA

# Load the training split.
train_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/train_filled.csv")

# Label, bookkeeping and subject-metadata columns that must not be used as
# model inputs. Names missing from the CSV are simply ignored by the filter.
exclude_cols = ["Activity", "Subject", "Window", "set", "trial_id",
        "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
        "age", "height_(cm)", "weight_(kg)", "most_affected_side",
        "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
        "filename", "subject"]

feature_cols = [col for col in train_df.columns if col not in exclude_cols]

X_raw = train_df[feature_cols]
y_train = train_df["Activity"]

# Pipeline: drop constant features -> standardise -> PCA -> gradient boosting.
pipeline_gbt = Pipeline([
    ("vt", VarianceThreshold(threshold=0.0)),  # remove constant features
    ("scaler", StandardScaler()),
    # Seed PCA: with an integer n_components the solver may be the randomized
    # one, which would otherwise make the search results vary between runs.
    ("pca", PCA(random_state=42)),
    ("classifier", HistGradientBoostingClassifier(
        early_stopping=True,
        random_state=42,
        validation_fraction=0.1,
        n_iter_no_change=10
    ))
])

# Hyperparameter grid. n_components accepts either a component count or, as
# a float in (0, 1), the fraction of variance to retain.
param_grid_gbt = {
    "pca__n_components": [20, 25, 50, 0.95],
    "classifier__max_iter": [50, 100],
    "classifier__learning_rate": [0.01, 0.05],
    "classifier__max_depth": [2, 3],
    "classifier__l2_regularization": [70.0, 100.0]
}

# NOTE(review): cv=3 splits rows without regard to the "Subject" column. If
# windows of one subject end up in both the training and validation folds the
# CV score will be optimistic -- consider GroupKFold with
# groups=train_df["Subject"]; confirm before changing.
grid_search = GridSearchCV(
    estimator=pipeline_gbt,
    param_grid=param_grid_gbt,
    cv=3,
    scoring="balanced_accuracy",
    n_jobs=-1
)

# Per-sample weights that compensate for class imbalance; they are routed to
# the pipeline's "classifier" step through the step-prefixed fit parameter.
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
grid_search.fit(X_raw, y_train, classifier__sample_weight=sample_weights)

# Report the winning hyperparameter combination.
print("Best Hyperparameters:")
print(grid_search.best_params_)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import balanced_accuracy_score

# Get the best pipeline found by the grid search.
best_model = grid_search.best_estimator_

# Load the validation split. Constant-feature removal, scaling and PCA all
# happen inside the fitted pipeline, so the raw feature columns are used.
val_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/val_filled.csv")
X_val_raw = val_df[feature_cols]
y_val = val_df["Activity"]

# Predictions on both splits so train/validation gaps can be compared.
y_pred = best_model.predict(X_val_raw)
train_preds = best_model.predict(X_raw)

# Accuracy and macro F1.
train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, y_pred)
train_f1 = f1_score(y_train, train_preds, average="macro")
val_f1 = f1_score(y_val, y_pred, average="macro")

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Val Accuracy: {val_acc:.4f}")
print(f"Train Macro F1: {train_f1:.4f}")
print(f"Val Macro F1: {val_f1:.4f}")

# Balanced accuracy (the metric the grid search optimised).
val_bal_acc = balanced_accuracy_score(y_val, y_pred)
train_bal_acc = balanced_accuracy_score(y_train, train_preds)

print(f"\nTrain Balanced Accuracy: {train_bal_acc:.4f}")
print(f"Val Balanced Accuracy:   {val_bal_acc:.4f}")

# Per-class precision / recall / F1 on the validation set.
print("\nClassification Report: ")
print(classification_report(y_val, y_pred))

# Confusion matrix. Labels are the union of true and predicted classes:
# confusion_matrix drops every sample whose true or predicted label is not
# listed, so taking labels from y_val alone would silently hide predictions
# of a class that never occurs in the validation set.
labels = sorted(set(y_val) | set(y_pred))
cm = confusion_matrix(y_val, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix (Validation Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

------------------------

----------------------



### HistGradientBoosting + SelectKBest

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.decomposition import PCA


# ---- Data -------------------------------------------------------------------
train_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/train_filled.csv")

# Columns that carry the label, bookkeeping ids or subject metadata rather
# than signal features.
exclude_cols = [
    "Activity", "Subject", "Window", "set", "trial_id",
    "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
    "age", "height_(cm)", "weight_(kg)", "most_affected_side",
    "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
    "filename", "subject",
]

# Every remaining column, in file order, is a model input.
feature_cols = list(filter(lambda name: name not in exclude_cols, train_df.columns))

X_raw, y_train = train_df[feature_cols], train_df["Activity"]

# ---- Model ------------------------------------------------------------------
# Constant-feature removal -> standardisation -> univariate selection
# (f_classif) -> histogram gradient boosting with early stopping.
pipeline_gbt = Pipeline(steps=[
    ("vt", VarianceThreshold(threshold=0.0)),
    ("scaler", StandardScaler()),
    ("feature_selection", SelectKBest(score_func=f_classif)),
    (
        "classifier",
        HistGradientBoostingClassifier(
            random_state=42,
            early_stopping=True,
            n_iter_no_change=10,
            validation_fraction=0.1,
        ),
    ),
])

# ---- Search space -----------------------------------------------------------
param_grid_gbt = {
    "classifier__l2_regularization": [5.0, 10.0, 30.0, 70.0],
    "classifier__learning_rate": [0.01, 0.05],
    "classifier__max_depth": [2, 3, 4],
    "classifier__max_iter": [50, 100, 200],
    "feature_selection__k": [70, 100],
}

# NOTE(review): the 3-fold split ignores the "Subject" column; if windows of
# one subject can land in both sides of a fold the CV score is optimistic --
# confirm whether a group-aware splitter is needed.
grid_search = GridSearchCV(
    pipeline_gbt,
    param_grid_gbt,
    scoring="balanced_accuracy",
    cv=3,
    n_jobs=-1,
)

# ---- Fit --------------------------------------------------------------------
# Class-balancing weights, forwarded to the "classifier" step of the pipeline.
sample_weights = compute_sample_weight("balanced", y_train)
grid_search.fit(X_raw, y_train, **{"classifier__sample_weight": sample_weights})

# ---- Report -----------------------------------------------------------------
print("Best Hyperparameters:", grid_search.best_params_, sep="\n")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import balanced_accuracy_score


# Get the best pipeline found by the grid search.
best_model = grid_search.best_estimator_

# Load the validation split. Constant-feature removal, scaling and feature
# selection all happen inside the fitted pipeline, so the raw feature columns
# are used.
val_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/val_filled.csv")
X_val_raw = val_df[feature_cols]
y_val = val_df["Activity"]

# Predictions on both splits so train/validation gaps can be compared.
y_pred = best_model.predict(X_val_raw)
train_preds = best_model.predict(X_raw)

# Accuracy and macro F1.
train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, y_pred)
train_f1 = f1_score(y_train, train_preds, average="macro")
val_f1 = f1_score(y_val, y_pred, average="macro")

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Val Accuracy: {val_acc:.4f}")
print(f"Train Macro F1: {train_f1:.4f}")
print(f"Val Macro F1: {val_f1:.4f}")

# Balanced accuracy (the metric the grid search optimised).
val_bal_acc = balanced_accuracy_score(y_val, y_pred)
train_bal_acc = balanced_accuracy_score(y_train, train_preds)

print(f"\n Train Balanced Accuracy: {train_bal_acc:.4f}")
print(f"Val Balanced Accuracy:   {val_bal_acc:.4f}")

# Per-class precision / recall / F1 on the validation set.
print("\nClassification Report: ")
print(classification_report(y_val, y_pred))

# Confusion matrix. Labels are the union of true and predicted classes:
# confusion_matrix drops every sample whose true or predicted label is not
# listed, so taking labels from y_val alone would silently hide predictions
# of a class that never occurs in the validation set.
labels = sorted(set(y_val) | set(y_pred))
cm = confusion_matrix(y_val, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix (Validation Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

----------------------

--------------------


### LightGBM + PCA

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight

# Load the training split.
train_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/train_filled.csv")

# Label, bookkeeping and subject-metadata columns that must not be used as
# model inputs. Names missing from the CSV are simply ignored by the filter.
exclude_cols = ["Activity", "Subject", "Window", "set", "trial_id",
                "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
                "age", "height_(cm)", "weight_(kg)", "most_affected_side",
                "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
                "filename", "subject"]

feature_cols = [col for col in train_df.columns if col not in exclude_cols]

X_raw = train_df[feature_cols]
y_train = train_df["Activity"]

# Pipeline: drop constant features -> standardise -> PCA -> LightGBM.
pipeline_lgbm = Pipeline([
    ("vt", VarianceThreshold(threshold=0.0)),
    ("scaler", StandardScaler()),
    # Seed PCA: with an integer n_components the solver may be the randomized
    # one, which would otherwise make the search results vary between runs.
    ("pca", PCA(random_state=42)),
    ("classifier", LGBMClassifier(
        objective='multiclass',
        class_weight='balanced',
        random_state=42,
        # GridSearchCV below already fans candidates out over all cores
        # (n_jobs=-1); keeping each LightGBM fit single-threaded avoids
        # oversubscribing the CPU with nested thread pools.
        n_jobs=1
    ))
])

# Hyperparameter grid. reg_lambda is the L2 regularisation strength;
# n_components accepts a component count or a retained-variance fraction.
param_grid_lgbm = {
    "pca__n_components": [0.95, 50, 30],
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [2, 3, 4],
    "classifier__learning_rate": [0.01, 0.05],
    "classifier__reg_lambda": [1.0, 5.0, 10.0],
    "classifier__min_child_samples": [20, 50, 80]
}

# NOTE(review): cv=3 splits rows without regard to the "Subject" column. If
# windows of one subject end up in both the training and validation folds the
# CV score will be optimistic -- consider GroupKFold with
# groups=train_df["Subject"]; confirm before changing.
grid_search = GridSearchCV(
    estimator=pipeline_lgbm,
    param_grid=param_grid_lgbm,
    cv=3,
    scoring="f1_macro",
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_raw, y_train)

# Report the winning hyperparameter combination.
print("Best Hyperparameters:")
print(grid_search.best_params_)

In [None]:
# results
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import balanced_accuracy_score

# Get the best pipeline found by the grid search.
best_model = grid_search.best_estimator_

# Load the validation split. Constant-feature removal, scaling and PCA all
# happen inside the fitted pipeline, so the raw feature columns are used.
val_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/val_filled.csv")

X_val_raw = val_df[feature_cols]
y_val = val_df["Activity"]

# Predictions on both splits so train/validation gaps can be compared.
y_pred = best_model.predict(X_val_raw)
train_preds = best_model.predict(X_raw)

# Accuracy and macro F1.
train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, y_pred)
train_f1 = f1_score(y_train, train_preds, average="macro")
val_f1 = f1_score(y_val, y_pred, average="macro")

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Val Accuracy: {val_acc:.4f}")
print(f"Train Macro F1: {train_f1:.4f}")
print(f"Val Macro F1: {val_f1:.4f}")

# Balanced accuracy.
val_bal_acc = balanced_accuracy_score(y_val, y_pred)
train_bal_acc = balanced_accuracy_score(y_train, train_preds)

print(f"\nTrain Balanced Accuracy: {train_bal_acc:.4f}")
print(f"Val Balanced Accuracy: {val_bal_acc:.4f}")

# Per-class precision / recall / F1 on the validation set.
print("\nClassification Report: ")
print(classification_report(y_val, y_pred))

# Confusion matrix. Labels are the union of true and predicted classes:
# confusion_matrix drops every sample whose true or predicted label is not
# listed, so taking labels from y_val alone would silently hide predictions
# of a class that never occurs in the validation set.
labels = sorted(set(y_val) | set(y_pred))
cm = confusion_matrix(y_val, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix (Validation Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

---------------------------------------


-----------------------------


### LightGBM + SelectKBest

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils.class_weight import compute_sample_weight

from functools import partial

# Load the training split.
train_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/train_filled.csv")


# Label, bookkeeping and subject-metadata columns that must not be used as
# model inputs. Names missing from the CSV are simply ignored by the filter.
exclude_cols = ["Activity", "Subject", "Window", "set", "trial_id",
                "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
                "age", "height_(cm)", "weight_(kg)", "most_affected_side",
                "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
                "filename", "subject"]
feature_cols = [col for col in train_df.columns if col not in exclude_cols]

X_raw = train_df[feature_cols]
y_train = train_df["Activity"]

# Pipeline: drop constant features -> standardise -> keep the k features with
# the highest mutual information -> LightGBM.
pipeline_lgbm = Pipeline([
    ("vt", VarianceThreshold(threshold=0.0)),
    ("scaler", StandardScaler()),
    # mutual_info_classif adds random noise to continuous features, so it is
    # seeded here; unseeded, the selected features (and therefore the search
    # result) can change from run to run.
    ("feature_selection", SelectKBest(score_func=partial(mutual_info_classif, random_state=42))),
    ("classifier", LGBMClassifier(
        objective='multiclass',
        class_weight='balanced',
        random_state=42,
        # GridSearchCV below already fans candidates out over all cores
        # (n_jobs=-1); keeping each LightGBM fit single-threaded avoids
        # oversubscribing the CPU with nested thread pools.
        n_jobs=1
    ))
])

# Hyperparameter grid (strong regularisation and few, shallow trees to limit
# overfitting). reg_lambda is the L2 regularisation strength.
param_grid_lgbm = {
    "feature_selection__k": [20, 30, 40],
    "classifier__n_estimators": [80],
    "classifier__max_depth": [2, 3],
    "classifier__learning_rate": [0.01, 0.05],
    "classifier__reg_lambda": [30.0, 50.0, 70.0],
    "classifier__min_child_samples": [100]
}

# NOTE(review): cv=3 splits rows without regard to the "Subject" column. If
# windows of one subject end up in both the training and validation folds the
# CV score will be optimistic -- consider GroupKFold with
# groups=train_df["Subject"]; confirm before changing.
grid_search = GridSearchCV(
    estimator=pipeline_lgbm,
    param_grid=param_grid_lgbm,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1
)

grid_search.fit(X_raw, y_train)

# Report the winning hyperparameter combination.
print("Best Hyperparameters:")
print(grid_search.best_params_)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import balanced_accuracy_score


# Load the validation split; all preprocessing happens inside the fitted
# pipeline, so the raw feature columns are used.
val_df = pd.read_csv("/content/drive/My Drive/final_project/gender_tsfel_merged/val_filled.csv")
X_val_raw = val_df[feature_cols]
y_val = val_df["Activity"]

# Predictions of the best pipeline on both splits.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val_raw)
train_preds = best_model.predict(X_raw)

# Accuracy, macro F1 and balanced accuracy on train vs. validation.
print(f"\nTrain Accuracy: {accuracy_score(y_train, train_preds):.4f}")
print(f"Val Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"Train Macro F1: {f1_score(y_train, train_preds, average='macro'):.4f}")
print(f"Val Macro F1: {f1_score(y_val, y_pred, average='macro'):.4f}")
print(f"Train Balanced Accuracy: {balanced_accuracy_score(y_train, train_preds):.4f}")
print(f"Val Balanced Accuracy: {balanced_accuracy_score(y_val, y_pred):.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix. Labels are the union of true and predicted classes:
# confusion_matrix drops every sample whose true or predicted label is not
# listed, so taking labels from y_val alone would silently hide predictions
# of a class that never occurs in the validation set.
labels = sorted(set(y_val) | set(y_pred))
cm = confusion_matrix(y_val, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix (Validation Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### LOSO-CV
Random Forest evaluated with leave-one-subject-out cross-validation.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA


import warnings
warnings.filterwarnings("ignore")


# --- Read the data and merge train + val ---
train_df = pd.read_csv("/content/drive/My Drive/final_project/feature_engineering/gender_merged/train_filled.csv")
val_df   = pd.read_csv("/content/drive/My Drive/final_project/feature_engineering/gender_merged/val_filled.csv")
comb_df  = pd.concat([train_df, val_df], ignore_index=True)

# --- Features / labels / groups ---
drop_cols = ["Activity", "Subject", "Window", "set", "trial_id",
             "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
             "age", "height_(cm)", "weight_(kg)", "most_affected_side",
             "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
             "filename", "subject"]

# errors="ignore" keeps this cell working when some of the listed columns are
# absent from the CSV, matching the tolerant column filter used by the other
# sections of the notebook.
X = comb_df.drop(columns=drop_cols, errors="ignore")
y = comb_df["Activity"]
groups = comb_df["Subject"]  # one CV fold per subject

# --- RF-only pipeline (tree models do not need a scaler / PCA) ---
pipeline_rf = Pipeline([
    ("vt", VarianceThreshold(threshold=0.0)),
    ("rf", RandomForestClassifier(
        n_estimators=200,        # replace with your tuned value if needed
        max_depth=3,
        min_samples_leaf=2,
        min_samples_split=2,
        max_features="sqrt",
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ))
])

# --- LOSO CV: hold out every subject once, train on all the others ---
logo = LeaveOneGroupOut()
y_true_all, y_pred_all = [], []

for tr, va in logo.split(X, y, groups):
    X_tr, X_va = X.iloc[tr], X.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    # fit() retrains the whole pipeline from scratch on this fold's subjects.
    pipeline_rf.fit(X_tr, y_tr)
    y_pred = pipeline_rf.predict(X_va)

    # Pool predictions across folds; metrics are computed once at the end.
    y_true_all.extend(y_va)
    y_pred_all.extend(y_pred)

# --- Results pooled over all held-out subjects ---
print("LOSO Evaluation:")
print(f"Accuracy: {accuracy_score(y_true_all, y_pred_all):.4f}")
print(f"Macro F1 Score: {f1_score(y_true_all, y_pred_all, average='macro'):.4f}")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_true_all, y_pred_all):.4f}")

print("\nClassification Report:")
print(classification_report(y_true_all, y_pred_all))

# Confusion matrix. The class order is passed explicitly so the matrix rows /
# columns are guaranteed to line up with the tick labels.
class_labels = np.unique(y)
conf_mat = confusion_matrix(y_true_all, y_pred_all, labels=class_labels)
sns.heatmap(conf_mat, annot=True, fmt='d', cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (LOSO)")
plt.show()


----

#### LOSO 
less overfitting  
LightGBM + SelectKBest (Ver3)  

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import LeaveOneGroupOut
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, balanced_accuracy_score
# import matplotlib.pyplot as plt
# import seaborn as sns
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# import warnings
# warnings.filterwarnings("ignore")

# # get data set
# train_df = pd.read_csv("../gender_tsfel_merged/train_filled.csv")
# val_df = pd.read_csv("../gender_tsfel_merged/val_filled.csv")
# comb_df = pd.concat([train_df, val_df], ignore_index=True)

# # drop the unnecessary cols
# drop_cols = ["Activity", "Subject", "Window", "set", "trial_id",
#              "sex", "most_affected_wrist", "most_affected_ankle", "dominant_side",
#              "age", "height_(cm)", "weight_(kg)", "most_affected_side",
#              "hoehn_and_yahr_(stage)", "years_since_diagnosis", "cit", "updrs",
#              "filename", "subject"]

# X = comb_df.drop(columns=drop_cols)
# y = comb_df["Activity"]
# groups = comb_df["Subject"]

# # pipeline with best model and hyperparameters
# pipeline_lgbm_kbest = Pipeline([
#     ("vt", VarianceThreshold(threshold=0.0)),
#     ("select", SelectKBest(score_func=f_classif, k=40)),
#     ("scaler", StandardScaler()),
#     ("clf", LGBMClassifier(
#         objective="multiclass",
#         learning_rate=0.01,
#         max_depth=2,
#         n_estimators=100,
#         reg_lambda=70.0,
#         min_child_samples=100,
#         class_weight="balanced",
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# # LOSO CV
# logo = LeaveOneGroupOut()
# y_true_all, y_pred_all = [], []

# for train_idx, val_idx in logo.split(X, y, groups):
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     pipeline_lgbm_kbest.fit(X_train, y_train)
#     y_pred = pipeline_lgbm_kbest.predict(X_val)

#     y_true_all.extend(y_val)
#     y_pred_all.extend(y_pred)

# # resluts:
# print("LOSO Evaluation:")
# print(f"Accuracy:  {accuracy_score(y_true_all, y_pred_all):.4f}")
# print(f"Macro F1 Score: {f1_score(y_true_all, y_pred_all, average='macro'):.4f}")
# print(f"Balanced Accuracy: {balanced_accuracy_score(y_true_all, y_pred_all):.4f}")

# print("\nClassification Report:")
# print(classification_report(y_true_all, y_pred_all))

# # confusion matrix
# conf_mat = confusion_matrix(y_true_all, y_pred_all)
# sns.heatmap(conf_mat, annot=True, fmt='d', cmap="Blues", xticklabels=np.unique(y), yticklabels=np.unique(y))
# plt.xlabel("Predicted")
# plt.ylabel("True")
# plt.title("Confusion Matrix - LOSO")
# plt.show()