In [9]:
import pandas as pd

# Load the walks.csv file
walk_data = pd.read_csv("walks.csv")

# Display first few rows
print(walk_data.head())


   ID  viewpoint variation  direction  run         file_id
0   0          0        bg          0    0   000:00:0:0:bg
1   0         45        nm          1    0   000:45:1:0:nm
2   0         45        nm          1    1   000:45:1:1:nm
3   0         45        ph          1    0   000:45:1:0:ph
4   0         45       txt          1    0  000:45:1:0:txt


In [10]:
import json
import os
import pandas as pd

def load_skeleton_data_with_labels(id, metadata_csv_path, base_path=r'C:\Users\ASUS\Downloads\semantic_data\skeletons'):
    # Load the metadata CSV to get the labels for each ID
    metadata = pd.read_csv(metadata_csv_path)
    
    # Find the label corresponding to the given ID
    label_mapping = metadata[metadata['ID'] == id]  # Assuming your CSV has an 'ID' column
    if label_mapping.empty:
        print(f"Label for ID {id} not found in metadata.")
        return [], []  # No data for this ID
    
    # Use the correct column name for labels, which is 'GHQ_Label'
    label = label_mapping.iloc[0]['GHQ_Label']  # Access the 'GHQ_Label' column
    
    # Path to the folder corresponding to the individual ID
    skeleton_folder = os.path.join(base_path, str(id))
    
    skeleton_data = []
    labels = []  # To store corresponding labels for the walks

    # Check if the folder exists
    if not os.path.exists(skeleton_folder):
        print(f"Folder for ID {id} does not exist.")
        return skeleton_data, labels

    # Loop through all files in the ID folder
    for file_name in os.listdir(skeleton_folder):
        # Only consider files ending in '.json' (your skeleton data files)
        if file_name.endswith('.json'):
            json_file_path = os.path.join(skeleton_folder, file_name)
            
            # Load the JSON data
            with open(json_file_path, 'r') as f:
                data = json.load(f)
                
                # Print the structure of the data to understand it
                print(f"Data structure for {json_file_path}:")
                print(data)  # Print the structure of the JSON data

                # Check if it's a list (based on the error message)
                if isinstance(data, list):
                    keypoints = data  # If it's already a list of keypoints, no need for .get
                else:
                    keypoints = data.get('keypoints')  # Extract keypoints if data is a dictionary
                
                if keypoints is not None:
                    skeleton_data.append(keypoints)
                    labels.append(label)  # Use the label from metadata

    return skeleton_data, labels

# Example: Load skeleton data for ID 1 with the CSV file path
id = 1
metadata_csv_path = "subset.csv"  # Provide the path to your metadata CSV
skeleton_data, labels = load_skeleton_data_with_labels(id, metadata_csv_path)

print(f"Loaded {len(skeleton_data)} skeletons for ID {id}.")
print("Labels:", set(labels))  # Show the unique labels


Folder for ID 1 does not exist.
Loaded 0 skeletons for ID 1.
Labels: set()


In [11]:
import pandas as pd

def prepare_labels(metadata_csv_path):
    """
    Loads the metadata CSV and prepares the labels mapping for the dataset.
    
    Parameters:
    - metadata_csv_path: Path to the CSV file containing the GHQ labels.
    
    Returns:
    - label_map: Dictionary mapping each ID to its respective label (Typical, Major Distress, Minor Distress).
    """
    # Load metadata CSV (assuming it has columns 'ID' and 'GHQ_Label')
    metadata_df = pd.read_csv(metadata_csv_path)
    
    # Map labels to numeric classes: 'Typical' -> 0, 'Minor Distress' -> 1, 'Major Distress' -> 2
    label_mapping = {'Typical': 0, 'Minor Distress': 1, 'Major Distress': 2}
    
    # Map each ID to its corresponding label
    label_map = {row['ID']: label_mapping[row['GHQ_Label']] for _, row in metadata_df.iterrows()}
    
    return label_map

# Example: Prepare labels from your metadata CSV
metadata_csv_path = "subset.csv"  # Provide the correct path to your CSV
label_map = prepare_labels(metadata_csv_path)

# Print the first 5 ID-label mappings to verify
print(list(label_map.items())[:5])


[(0, 0), (1, 2), (2, 0), (3, 1), (4, 0)]


In [12]:
import json
import os

def load_skeleton_data_with_labels_and_metadata(id, base_path=r'skeletons', label_map=None):
    """
    Load the skeleton data for a specific ID, and associate it with labels from the metadata.
    
    Parameters:
    - id: ID of the individual whose skeleton data is being loaded.
    - base_path: Path to the folder containing the skeleton data.
    - label_map: A dictionary mapping IDs to their corresponding GHQ labels.
    
    Returns:
    - skeleton_data: List of keypoints data for the skeletons.
    - labels: Corresponding list of labels for the skeletons.
    """
    # Path to the folder corresponding to the individual ID
    skeleton_folder = os.path.join(base_path, str(id))
    
    skeleton_data = []
    labels = []  # To store corresponding labels for the skeletons
    
    # Get the label for this ID from the label_map
    label = label_map.get(id, None)
    
    if label is not None:
        # Loop through all JSON files in the labeled folders
        for file_name in os.listdir(skeleton_folder):
            if file_name.endswith('.json'):
                json_file_path = os.path.join(skeleton_folder, file_name)
                
                # Load the JSON data
                with open(json_file_path, 'r') as f:
                    data = json.load(f)
                    
                    # If the data is a list, loop through and extract keypoints
                    if isinstance(data, list):
                        for item in data:
                            keypoints = item.get('keypoints')  # Extract keypoints
                            if keypoints is not None:
                                skeleton_data.append(keypoints)
                                labels.append(label)  # Store the label corresponding to this skeleton data
                    else:
                        # Assuming the data is a dictionary, extract keypoints directly
                        keypoints = data.get('keypoints')
                        if keypoints is not None:
                            skeleton_data.append(keypoints)
                            labels.append(label)  # Store the label corresponding to this skeleton data

    return skeleton_data, labels

# Example: Load skeleton data for ID 1 and associate it with its label
id = 1
skeleton_data, labels = load_skeleton_data_with_labels_and_metadata(id, label_map=label_map)

print(f"Loaded {len(skeleton_data)} skeletons for ID {id}.")
print("Labels:", set(labels))  # Show the unique labels (e.g., "Minor Distress", "Major Distress") 


Loaded 3193 skeletons for ID 1.
Labels: {2}


In [14]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

def load_skeleton_data(id, metadata_csv_path, base_path):
    metadata = pd.read_csv(metadata_csv_path)
    label_row = metadata[metadata['ID'] == id]

    if label_row.empty:
        return [], []

    label = label_row.iloc[0]['GHQ_Label']
    skeleton_folder = os.path.join(base_path, str(id))

    if not os.path.exists(skeleton_folder):
        return [], []

    skeleton_data, labels = [], []
    json_files = [f for f in os.listdir(skeleton_folder) if f.endswith('.json')]

    for file_name in tqdm(json_files, desc=f"Loading ID {id}", leave=False):
        json_file_path = os.path.join(skeleton_folder, file_name)
        try:
            with open(json_file_path, 'r') as f:
                data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        keypoints = item.get('keypoints')
                        if keypoints:
                            skeleton_data.append(keypoints)
                            labels.append(label)
                else:
                    keypoints = data.get('keypoints')
                    if keypoints:
                        skeleton_data.append(keypoints)
                        labels.append(label)
        except json.JSONDecodeError:
            continue

    return skeleton_data, labels

def prepare_data(metadata_csv_path, base_path):
    metadata = pd.read_csv(metadata_csv_path)
    all_data, all_labels = [], []

    print("\n🔄 Loading and preparing skeleton data...")
    for id in tqdm(metadata['ID'].unique(), desc="Processing IDs"):
        data, labels = load_skeleton_data(id, metadata_csv_path, base_path)
        all_data.extend(data)
        all_labels.extend(labels)

    # Flatten each keypoint set
    all_data = [np.array(keypoints).flatten() for keypoints in all_data]
    return np.array(all_data), np.array(all_labels)

def train_and_test_random_forest(metadata_csv_path, base_path):
    X, y = prepare_data(metadata_csv_path, base_path)

    if len(X) == 0:
        print("No data loaded. Exiting.")
        return

    print(f"\n📊 Total samples: {len(X)} | Feature dimension: {X.shape[1]}")
    print("📦 Splitting data into train/test...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("\n🌲 Training Random Forest...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    for _ in tqdm(range(1), desc="Training Progress"):
        model.fit(X_train, y_train)

    print("\n🧪 Evaluating model...")
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n✅ Accuracy: {acc:.4f}")
    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_pred))

# === Run Training ===
metadata_csv_path = "subset.csv"
base_path = "skeletons"

train_and_test_random_forest(metadata_csv_path, base_path)
import json
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

def load_skeleton_data(id, metadata_csv_path, base_path, max_files=50):
    metadata = pd.read_csv(metadata_csv_path)
    label_row = metadata[metadata['ID'] == id]

    if label_row.empty:
        return [], []

    label = label_row.iloc[0]['GHQ_Label']
    skeleton_folder = os.path.join(base_path, str(id))

    if not os.path.exists(skeleton_folder):
        return [], []

    skeleton_data, labels = [], []
    json_files = [f for f in os.listdir(skeleton_folder) if f.endswith('.json')]

    for file_name in json_files[:max_files]:  # limit number of files per ID
        json_file_path = os.path.join(skeleton_folder, file_name)
        try:
            with open(json_file_path, 'r') as f:
                data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        keypoints = item.get('keypoints')
                        if keypoints:
                            skeleton_data.append(keypoints)
                            labels.append(label)
                else:
                    keypoints = data.get('keypoints')
                    if keypoints:
                        skeleton_data.append(keypoints)
                        labels.append(label)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"⚠️ Skipping corrupted file: {json_file_path}")
            continue
        except Exception as e:
            print(f"❌ Error reading {json_file_path}: {e}")
            continue

    return skeleton_data, labels

def prepare_data(metadata_csv_path, base_path):
    metadata = pd.read_csv(metadata_csv_path)
    all_data, all_labels = [], []

    print("\n🔄 Loading skeleton data for all IDs...")
    for id in tqdm(metadata['ID'].unique(), desc="Processing IDs"):
        data, labels = load_skeleton_data(id, metadata_csv_path, base_path)
        all_data.extend(data)
        all_labels.extend(labels)
        print(f"✅ ID {id}: Loaded {len(data)} samples")

    if not all_data:
        print("🚫 No data loaded at all.")
        return np.array([]), np.array([])

    all_data = [np.array(kps).flatten() for kps in all_data]
    return np.array(all_data), np.array(all_labels)

def train_and_test_random_forest(metadata_csv_path, base_path):
    X, y = prepare_data(metadata_csv_path, base_path)

    if len(X) == 0:
        print("❌ No data to train on. Exiting.")
        return

    print(f"\n📊 Total samples: {len(X)} | Feature dimension: {X.shape[1]}")
    print("📦 Splitting data into train/test...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("\n🌲 Training Random Forest...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    print("\n🧪 Evaluating model...")
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n✅ Accuracy: {acc:.4f}")
    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_pred))

# === Run Training ===
metadata_csv_path = "subset.csv"
base_path = "skeletons"

train_and_test_random_forest(metadata_csv_path, base_path)



🔄 Loading and preparing skeleton data...


Processing IDs: 100%|██████████| 312/312 [00:48<00:00,  6.48it/s]



📊 Total samples: 1066612 | Feature dimension: 51
📦 Splitting data into train/test...

🌲 Training Random Forest...


Training Progress: 100%|██████████| 1/1 [24:12<00:00, 1452.80s/it]



🧪 Evaluating model...

✅ Accuracy: 0.7317

📋 Classification Report:
                precision    recall  f1-score   support

Major Distress       0.94      0.08      0.14     23782
Minor Distress       0.81      0.26      0.39     44996
       Typical       0.72      0.99      0.84    144545

      accuracy                           0.73    213323
     macro avg       0.82      0.44      0.46    213323
  weighted avg       0.77      0.73      0.66    213323



FileNotFoundError: [Errno 2] No such file or directory: 'D:/download/subset.csv'