In [3]:
import numpy as np
import os
import pandas as pd
def load_smpl_data(person_id, npz_root, walk_df):
    features = []
    
    # Filter the walks for this person
    person_walks = walk_df[walk_df['ID'] == person_id]

    for _, row in person_walks.iterrows():
        file_path = os.path.join(npz_root, row['file_path'])
        if not os.path.exists(file_path):
            continue
        try:
            data = np.load(file_path)
            frame_feats = []

            if 'pose' in data:
                frame_feats.append(data['pose'].mean(axis=0) if data['pose'].ndim > 1 else data['pose'])  # (72,)
            if 'shape' in data:
                frame_feats.append(data['shape'].mean(axis=0) if data['shape'].ndim > 1 else data['shape'])  # (10,)
            if 'global_t' in data:
                frame_feats.append(data['global_t'].mean(axis=0))  # (3,)
            if 'focal_l' in data:
                frame_feats.append([np.mean(data['focal_l'])])  # scalar
            if 'pred_joints' in data:
                frame_feats.append(data['pred_joints'].mean(axis=(0, 1)))  # (3,)

            # Optionally add metadata like viewpoint, variation
            frame_feats.append([row['viewpoint']])
            frame_feats.append([hash(row['variation']) % 1000])  # or use OneHotEncoding if needed

            features.append(np.concatenate(frame_feats))

        except Exception as e:
            print(f"Failed to process {file_path}: {e}")
            continue

    if len(features) == 0:
        raise ValueError(f"No valid files found for ID {person_id}")

    return np.mean(features, axis=0)  # aggregate over all variations


In [4]:
# Load walk metadata from CSV
def load_walk_metadata(walk_csv):
    walk_df = pd.read_csv(walk_csv)
    walk_df['file_path'] = walk_df.apply(lambda row: f"{row['ID']}/{row['file_id'].replace(':', '_')}.npz", axis=1)
    return walk_df

# Load GHQ labels from CSV
def load_ghq_labels(csv_file):
    ghq_data = pd.read_csv(csv_file)
    return ghq_data.set_index('ID')['GHQ_Label'].to_dict()  # Create a mapping from person_id to ghq_label

In [5]:
def process_data(npz_root, ghq_csv, walk_csv):
    walk_df = load_walk_metadata(walk_csv)
    ghq_labels = load_ghq_labels(ghq_csv)

    all_features, all_labels = [], []

    for person_id in ghq_labels.keys():
        try:
            feats = load_smpl_data(person_id, npz_root, walk_df)
            all_features.append(feats)
            all_labels.append(ghq_labels[person_id])
        except Exception as e:
            print(f"Skipping ID {person_id}: {e}")

    return np.array(all_features), np.array(all_labels)

In [6]:
# Example usage
walk_df = 'walks.csv'
npz_folder = 'smpl'  # Replace with the path to your npz folder
ghq_csv = 'subset.csv'  # Replace with the path to your GHQ labels CSV

In [7]:
features, labels = process_data(npz_root="smpl", ghq_csv="subset.csv", walk_csv="walks.csv")

In [8]:
from sklearn.preprocessing import LabelEncoder

# Assuming labels are like ['Major Distress', 'Typical', ...]
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# Now use `labels_encoded` in model training instead of `labels`


RANDOM forest

In [30]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV



# Main function to train and evaluate the model
def train_and_evaluate(npz_folder, ghq_csv,walk_df):
    # Process the data
    features, labels = process_data(npz_folder, ghq_csv,walk_df)
    
    # Standardize the features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.3, random_state=280, stratify=labels)

    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


train_and_evaluate(npz_folder, ghq_csv, walk_df)


Classification Report:
                 precision    recall  f1-score   support

Major Distress       0.00      0.00      0.00        11
Minor Distress       0.83      0.25      0.38        20
       Typical       0.72      1.00      0.83        63

      accuracy                           0.72        94
     macro avg       0.52      0.42      0.41        94
  weighted avg       0.66      0.72      0.64        94



Random is
Strengths: The model does well at identifying "Typical" distress, making it effective for the more frequent class.
Weaknesses: The model fails to identify "Major Distress" and has significant trouble with "Minor Distress." These issues are likely due to class imbalance, insufficient training data for certain classes, and/or overfitting on the dominant class ("Typical").

basic neural network

In [9]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

# 1. Neural Network Model (Example)
# Build the neural network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))  # First layer with input shape
    model.add(Dense(32, activation='relu'))  # Hidden layer
    model.add(Dense(3, activation='softmax'))  # Output layer (assuming 3 classes in the target)

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model



In [45]:
def train_evaluate_model2(features, labels):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=280)
    
    # Normalize the features (important for neural networks)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Build and train the neural network
    model = build_nn_model(X_train.shape[1])  # Input dimension based on features
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    print(classification_report(y_test, np.argmax(y_pred, axis=1)))

# Train and evaluate using the processed data

train_evaluate_model2(features, labels_encoded)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.4854 - loss: 1.0386 - val_accuracy: 0.6984 - val_loss: 0.7821
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6554 - loss: 0.7931 - val_accuracy: 0.7619 - val_loss: 0.7357
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6652 - loss: 0.7705 - val_accuracy: 0.7302 - val_loss: 0.7323
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6649 - loss: 0.7425 - val_accuracy: 0.7302 - val_loss: 0.7208
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6966 - loss: 0.7047 - val_accuracy: 0.7143 - val_loss: 0.6982
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7005 - loss: 0.6546 - val_accuracy: 0.7302 - val_loss: 0.6922
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

 1. Gradient Boosting (XGBoost)

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def try_xgboost(features, labels):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=280)

    model = XGBClassifier( eval_metric='mlogloss')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("XGBoost Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

    return model, scaler


model, scaler = try_xgboost(features, labels_encoded)

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       1.00      0.18      0.31        11
           2       0.77      0.98      0.86        47

    accuracy                           0.76        63
   macro avg       0.59      0.39      0.39        63
weighted avg       0.75      0.76      0.70        63



In [10]:
import numpy as np
import os

def extract_smpl_features(npz_path, viewpoint=0, variation_hash=0):
    """
    Load a single .npz and turn it into the same feature‐vector you used for training.
    If you don’t have viewpoint/variation info, you can default them (e.g. 0).
    """
    if not os.path.exists(npz_path):
        raise FileNotFoundError(f"{npz_path} not found")

    data = np.load(npz_path)
    feats = []

    # pose: (72,)
    if 'pose' in data:
        p = data['pose']
        feats.append(p.mean(axis=0) if p.ndim > 1 else p)

    # shape: (10,)
    if 'shape' in data:
        s = data['shape']
        feats.append(s.mean(axis=0) if s.ndim > 1 else s)

    # global translation: (3,)
    if 'global_t' in data:
        feats.append(data['global_t'].mean(axis=0))

    # focal length (scalar)
    if 'focal_l' in data:
        feats.append([np.mean(data['focal_l'])])

    # predicted joints: flatten mean over frames & joints
    if 'pred_joints' in data:
        # data['pred_joints'] shape e.g. (num_frames, num_joints, 3)
        feats.append(data['pred_joints'].mean(axis=(0,1)))

    # add—or default—your “metadata” slots:
    feats.append([viewpoint])
    feats.append([variation_hash])

    # final 1D vector
    return np.concatenate(feats)


In [16]:
from joblib import dump

# after `model.fit(...)` and `le.fit(...)` in your training code:
dump(scaler, 'scaler.joblib')
dump(le,       'label_encoder.joblib')
dump(model,    'xgb_model.joblib')


['xgb_model.joblib']

ModuleNotFoundError: No module named 'your_feature_module'

2. Support Vector Machine (SVM)

In [43]:
from sklearn.svm import SVC

def try_svm(features, labels):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=280)

    model = SVC(kernel='rbf', probability=True)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("SVM Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

try_svm(features, labels_encoded)

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00        11
           2       0.75      1.00      0.85        47

    accuracy                           0.75        63
   macro avg       0.25      0.33      0.28        63
weighted avg       0.56      0.75      0.64        63



3. Logistic Regression (Multiclass Softmax)

In [34]:
from sklearn.linear_model import LogisticRegression

def try_logistic_regression(features, labels):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=280)

    model = LogisticRegression(max_iter=1000, multi_class='multinomial')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Logistic Regression Report:\n", classification_report(y_test, y_pred, zero_division=0))

try_logistic_regression(features, labels_encoded)

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.20      0.09      0.12        11
           2       0.74      0.91      0.82        47

    accuracy                           0.70        63
   macro avg       0.31      0.34      0.31        63
weighted avg       0.59      0.70      0.63        63





In [35]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

def train_ensemble_model(features, labels):
    # Encode string labels to integers
    le = LabelEncoder()
    y_encoded = le.fit_transform(labels)

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=280)

    # Define base models
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    svc = SVC(kernel='rbf', probability=True)

    # Combine them using a voting classifier (soft voting is better for probabilities)
    ensemble = VotingClassifier(estimators=[
        ('rf', rf),
        ('svc', svc)
    ], voting='soft')  # Use 'hard' for majority voting

    # Train ensemble
    ensemble.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = ensemble.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

train_ensemble_model(features, labels)

Classification Report:
                 precision    recall  f1-score   support

Major Distress       0.00      0.00      0.00         5
Minor Distress       0.00      0.00      0.00        11
       Typical       0.75      1.00      0.85        47

      accuracy                           0.75        63
     macro avg       0.25      0.33      0.28        63
  weighted avg       0.56      0.75      0.64        63



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
