In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

def load_and_explore_pkl(filename):
    # Load the pickle file with latin1 encoding
    with open(filename, 'rb') as f:
        data = pickle.load(f, encoding='latin1')  # Use 'latin1' to avoid encoding issues
    
    # Print a summary of the loaded data
    if isinstance(data, dict):
        # print("Loaded data keys:")
        # print(data.keys())  # Print all keys
        
        # Print summary of each key
        for key in data:
            print(f"\nKey: {key}")
            if key == 'signal':
                print("  - signal keys:", data[key].keys())
                print("    - chest keys:", data[key]['chest'].keys())
                print("    - wrist keys:", data[key]['wrist'].keys())
            elif key == 'label':
                print("  - label shape:", data[key].shape)
            else:
                print(f"  - {key}:", data[key])
    else:
        print("Loaded data is not a dictionary")
    
    return data

# Load data
filename = "../S2/S2.pkl"  # Replace with the relative path to your actual .pkl filename
data = load_and_explore_pkl(filename)
# labels = data['label']




# # Access chest ACC data and labels
# chest_acc_data = data['signal']['chest']['ACC']

# # Check the shape of the data
# print("\nChest ACC data shape:")
# print(chest_acc_data.shape)
# print("\nLabels shape:")
# print(labels.shape)



Key: signal
  - signal keys: dict_keys(['chest', 'wrist'])
    - chest keys: dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
    - wrist keys: dict_keys(['ACC', 'BVP', 'EDA', 'TEMP'])

Key: label
  - label shape: (4255300,)

Key: subject
  - subject: S2


In [3]:
labels = data['label']
# Inspect the distribution of the original labels

unique_labels, counts = np.unique(labels, return_counts=True)
label_distribution = dict(zip(unique_labels, counts))

print("Original label distribution:", label_distribution)


Original label distribution: {0: 2142701, 1: 800800, 2: 430500, 3: 253400, 4: 537599, 6: 45500, 7: 44800}


In [4]:
import numpy as np

# Retrieve labels
labels = data['label']

# Define the labels to remove
labels_to_remove = {0, 4, 5, 6, 7}

# Get indices of the labels to keep
indices_to_keep = np.array([i for i, label in enumerate(labels) if label not in labels_to_remove], dtype=int)

# Function to filter signal data
def filter_signals(signal_data, indices):
    return signal_data[indices]

# Filter chest data
filtered_chest = {modality: filter_signals(np.array(data['signal']['chest'][modality]), indices_to_keep) for modality in data['signal']['chest']}

# Filter labels
filtered_labels = np.array(labels)[indices_to_keep]

# Truncate the filtered chest data and labels to ensure consistency
max_length = min(len(filtered_labels), len(filtered_chest['ACC']))

truncated_filtered_labels = filtered_labels[:max_length]
truncated_filtered_chest = {modality: filtered_chest[modality][:max_length] for modality in filtered_chest}

# Update data dictionary with filtered chest data and labels
filtered_data = {
    'signal': {
        'chest': truncated_filtered_chest
    },
    'label': truncated_filtered_labels
}

# Verify the lengths after filtering
print(f"Filtered labels length: {len(filtered_data['label'])}")
for modality in filtered_data['signal']['chest']:
    print(f"Filtered chest {modality} length: {len(filtered_data['signal']['chest'][modality])}")


Filtered labels length: 1484700
Filtered chest ACC length: 1484700
Filtered chest ECG length: 1484700
Filtered chest EMG length: 1484700
Filtered chest EDA length: 1484700
Filtered chest Temp length: 1484700
Filtered chest Resp length: 1484700


In [5]:
# Extract variables for each chest modality
acc_data = filtered_data['signal']['chest']['ACC']
ecg_data = filtered_data['signal']['chest']['ECG']
emg_data = filtered_data['signal']['chest']['EMG']
eda_data = filtered_data['signal']['chest']['EDA']
temp_data = filtered_data['signal']['chest']['Temp']
resp_data = filtered_data['signal']['chest']['Resp']

# # Print lengths to verify
# print(f"ACC data length: {len(acc_data)}")
# print(f"ECG data length: {len(ecg_data)}")
# print(f"EMG data length: {len(emg_data)}")
# print(f"EDA data length: {len(eda_data)}")
# print(f"TEMP data length: {len(temp_data)}")
# print(f"RESP data length: {len(resp_data)}")


In [6]:
# Define the number of samples to display per label
samples_per_label = 2

# Create a dictionary to store samples for each label
samples_dict = {1: [], 2: [], 3: []}

# Collect samples for each label
for i in range(len(filtered_data['label'])):
    label = filtered_data['label'][i]
    if label in samples_dict and len(samples_dict[label]) < samples_per_label:
        samples_dict[label].append(i)
    # Stop if we've collected enough samples for all labels
    if all(len(samples) >= samples_per_label for samples in samples_dict.values()):
        break

# Print sample data and labels
print(f"Sample data and labels:")

# Ensure we have the required samples for each label
for label in [1, 2, 3]:
    print(f"Label {label} samples:")
    indices = samples_dict[label]
    for idx in indices:
        print(f"  Sample Index: {idx}")
        print(f"    Label: {filtered_data['label'][idx]}")
        print(f"    ACC data: {acc_data[idx]}")
        print(f"    ECG data: {ecg_data[idx]}")
        print(f"    EMG data: {emg_data[idx]}")
        print(f"    EDA data: {eda_data[idx]}")
        print(f"    TEMP data: {temp_data[idx]}")
        print(f"    RESP data: {resp_data[idx]}")
        print()


Sample data and labels:
Label 1 samples:
  Sample Index: 0
    Label: 1
    ACC data: [ 0.89139998 -0.11019999 -0.25760001]
    ECG data: [0.03094482]
    EMG data: [-0.00370789]
    EDA data: [5.71098328]
    TEMP data: [29.083618]
    RESP data: [1.19171143]

  Sample Index: 1
    Label: 1
    ACC data: [ 0.89260006 -0.10860002 -0.25440001]
    ECG data: [0.03364563]
    EMG data: [-0.0141449]
    EDA data: [5.71937561]
    TEMP data: [29.122437]
    RESP data: [1.13983154]

Label 2 samples:
  Sample Index: 800800
    Label: 2
    ACC data: [ 0.87759995 -0.10299999 -0.29680002]
    ECG data: [-0.01167297]
    EMG data: [0.0050354]
    EDA data: [1.27830505]
    TEMP data: [31.21051]
    RESP data: [-1.222229]

  Sample Index: 800801
    Label: 2
    ACC data: [ 0.87580001 -0.10180002 -0.29519999]
    ECG data: [-0.0015564]
    EMG data: [0.00059509]
    EDA data: [1.25274658]
    TEMP data: [31.22229]
    RESP data: [-1.20239258]

Label 3 samples:
  Sample Index: 1231300
    Label: 3

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Assuming 'acc_data' and 'filtered_data['label']' are already defined

# Extract features and labels
X = np.array(acc_data)  # Features (ACC data)
y = np.array(filtered_data['label'])  # Labels

# Check dimensions and adjust if necessary
if X.ndim == 1:
    X = X.reshape(-1, 1)  # Reshape if ACC data is one-dimensional

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define SMOTE and the model
smote = SMOTE(sampling_strategy='auto', random_state=42)
model = LogisticRegression(max_iter=1000)

# Create a pipeline with SMOTE and Logistic Regression
pipeline = Pipeline(steps=[('smote', smote), ('model', model)])

# Train the model
pipeline.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           1       0.99      0.96      0.98    239861
           2       0.95      0.96      0.95    129025
           3       0.91      0.99      0.95     76524

    accuracy                           0.96    445410
   macro avg       0.95      0.97      0.96    445410
weighted avg       0.97      0.96      0.96    445410

Confusion Matrix:
[[230531   6206   3124]
 [  1220 123410   4395]
 [    34    762  75728]]
