In [None]:
import pandas as pd
import numpy as np
from scipy import fft, stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

# Load and preprocess data (clean up the raw sensor readings)
def preprocess_data(file_path):
    """Load a sensor CSV, drop incomplete rows, and z-score the IMU channels.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts. The CSV must contain
            the columns ``accel_x``, ``accel_y``, ``accel_z``, ``gyro_x``,
            ``gyro_y`` and ``gyro_z``.

    Returns:
        A DataFrame with every row containing a NaN removed and each of the
        six sensor columns shifted to zero mean and scaled to unit (sample)
        standard deviation. Other columns (e.g. a manually added ``label``
        column) are returned unchanged.

    Raises:
        KeyError: If one of the six sensor columns is missing.
    """
    sensor_columns = ['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']

    # Clean: rows with a NaN in any column are discarded.
    df = pd.read_csv(file_path).dropna()

    for col in sensor_columns:
        centered = df[col] - df[col].mean()
        spread = df[col].std()
        # A constant channel has std == 0 and a single-row file has std == NaN.
        # Dividing by either would turn the whole column into NaN/inf, so such
        # channels are only mean-centred (i.e. become all zeros).
        if pd.notna(spread) and spread > 0:
            df[col] = centered / spread
        else:
            df[col] = centered
    return df

# Feature extraction (transform raw sensor windows into meaningful feature representations)
def extract_features(df, window_size=100, step=50):
    """Slide a window over the sensor data and compute per-window features.

    For each of the six channels (``accel_x/y/z``, ``gyro_x/y/z``) six values
    are emitted, in this order: mean, standard deviation, minimum, maximum,
    skewness, and the index of the dominant non-DC FFT bin. A window therefore
    yields 36 features.

    Args:
        df: DataFrame holding the six sensor columns and a ``label`` column.
        window_size: Number of consecutive rows per window.
        step: Number of rows between the starts of consecutive windows.

    Returns:
        ``(features, labels)`` where ``features`` is a float array of shape
        ``(n_windows, 36)`` and ``labels`` holds the most frequent ``label``
        value of each window. When ``df`` is shorter than ``window_size`` the
        feature array has shape ``(0, 36)`` and ``labels`` is empty.

    Raises:
        ValueError: If ``window_size`` or ``step`` is smaller than 1.
        KeyError: If a sensor column or the ``label`` column is missing.
    """
    sensor_columns = ['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']
    features_per_channel = 6

    if window_size < 1 or step < 1:
        raise ValueError("window_size and step must be positive integers")

    # Pull the data out of pandas once instead of slicing the frame per window.
    signals = df[sensor_columns].to_numpy(dtype=float)
    label_column = df['label']

    features = []
    labels = []
    # "+ 1" keeps the last full window; without it a recording whose length is
    # exactly window_size (or window_size + k * step) loses its final window.
    for start in range(0, len(df) - window_size + 1, step):
        stop = start + window_size
        row_features = []
        for signal in signals[start:stop].T:
            low = np.min(signal)
            high = np.max(signal)
            is_constant = low == high

            # Skewness is undefined (NaN) for a constant signal, and NaN
            # features would make the downstream classifiers fail, so use 0.
            skewness = 0.0 if is_constant else stats.skew(signal)
            row_features.extend([np.mean(signal), np.std(signal), low, high, skewness])

            # Frequency-domain: dominant frequency bin. The input is real, so
            # the one-sided spectrum is sufficient. Bin 0 (DC) only encodes
            # the window mean and would otherwise win for any offset signal,
            # so it is excluded from the search.
            spectrum = np.abs(fft.rfft(signal))
            if is_constant or spectrum.size < 2:
                dominant_freq = 0
            else:
                dominant_freq = int(np.argmax(spectrum[1:])) + 1
            row_features.append(dominant_freq)

        features.append(row_features)
        # Label: most frequent label in the window (first one on ties).
        labels.append(label_column.iloc[start:stop].mode().iloc[0])

    if not features:
        # Keep the feature array 2-D so callers can still vstack it.
        empty = np.empty((0, len(sensor_columns) * features_per_channel))
        return empty, np.array(labels)
    return np.array(features), np.array(labels)

# Example: Load, preprocess, extract
df_pushup = preprocess_data('pushup_data.csv')
X_pushup, y_pushup = extract_features(df_pushup)

# Combine datasets. Only the push-up recording is loaded at the moment; once
# the squat and plank recordings are processed the same way, append their
# (features, labels) pairs to this list. Previously X and y were never
# defined (the stacking lines were commented out), so the split below raised
# a NameError.
datasets = [(X_pushup, y_pushup)]
X = np.vstack([features for features, _ in datasets])
y = np.hstack([labels for _, labels in datasets])

# Train/test split (Week 8: k-fold CV for robustness)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models (KNN, RF per Report A)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Save models for Pi deployment (Week 6: Edge inference)
# Use RF as primary. The context manager guarantees the file is flushed and
# closed even if pickling fails part-way.
with open('har_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)