# trying out more features 

In [5]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [10]:
# File paths (to be restored if files are re-uploaded)
file_paths = [
    "../data/gukil/data1.csv",
    "../data/gukil/data3.csv",
    "../data/gukil/data2.csv",
    "../data/gukil/data4.csv",
    "../data/gukil/data5.csv",
    "../data/gukil/data6.csv",
]

In [11]:
# Advanced feature extractor for a heart rate segment
def extract_features(hr_series):
    hr_series = hr_series.dropna().values
    if len(hr_series) < 2:
        return {}

    hr_diff = np.diff(hr_series)
    return {
        'mean': np.mean(hr_series),
        'std': np.std(hr_series),
        'min': np.min(hr_series),
        'max': np.max(hr_series),
        'range': np.ptp(hr_series),
        'slope': np.polyfit(np.arange(len(hr_series)), hr_series, 1)[0],
        'skew': scipy.stats.skew(hr_series),
        'kurtosis': scipy.stats.kurtosis(hr_series),
        'median': np.median(hr_series),
        'rmssd': np.sqrt(np.mean(np.square(hr_diff))),
        'mean_abs_change': np.mean(np.abs(hr_diff)),
        'max_diff': np.max(np.abs(hr_diff)),
        'iqr': np.percentile(hr_series, 75) - np.percentile(hr_series, 25)
    }

# Process labeled segments (label 0 or 1) from each CSV
def extract_labeled_segments(file_path):
    df = pd.read_csv(file_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp').reset_index(drop=True)

    segments = []
    for label in [0, 1]:
        hr_segment = df[df['label'] == label]['heart_rate']
        if not hr_segment.empty:
            features = extract_features(hr_segment)
            features['label'] = label
            segments.append(features)
    return segments

In [12]:
pwd

'/Users/kshitijpawar/Desktop/Kshitij/spring_2025/csci534/project/Detecting-hunger-apple-watch/model'

In [13]:
# Process all uploaded files
all_segments = []
for path in file_paths:
    all_segments.extend(extract_labeled_segments(path))

# Prepare dataset
segment_df = pd.DataFrame(all_segments)
X = segment_df.drop(columns=['label'])
y = segment_df['label'].values

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
# Train and evaluate models
models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [15]:
results = {}
for name, model in models.items():
    if len(np.unique(y)) > 1:
        scores = cross_val_score(model, X_scaled, y, cv=min(3, len(y)))
        results[name] = {
            "accuracy_mean": scores.mean(),
            "accuracy_std": scores.std()
        }
    else:
        results[name] = {
            "accuracy_mean": None,
            "accuracy_std": None,
            "error": "Only one class present in labels"
        }

In [16]:
results

{'Random Forest': {'accuracy_mean': np.float64(0.5),
  'accuracy_std': np.float64(0.0)},
 'Logistic Regression': {'accuracy_mean': np.float64(0.5),
  'accuracy_std': np.float64(0.2041241452319315)},
 'XGBoost': {'accuracy_mean': np.float64(0.5833333333333334),
  'accuracy_std': np.float64(0.11785113019775792)}}