In [1]:
import zipfile
import re
import os

def _read_readme(zip_path):
    """Return the decoded text of the ``readMe`` entry inside ``zip_path``.

    Returns ``None`` when the archive does not exist or contains no entry
    named exactly ``readMe``.
    """
    if not os.path.exists(zip_path):
        return None
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        if 'readMe' not in zip_ref.namelist():
            return None
        with zip_ref.open('readMe') as readme:
            # Undecodable bytes are replaced rather than aborting the whole scan.
            return readme.read().decode(errors='replace')


def extract_frequency(base_dir, location, activities, n_subjects=15):
    """Collect the sampling frequencies documented in the sensor archives.

    For every subject ``proband1`` .. ``proband<n_subjects>`` and every
    activity, the ``readMe`` entries of ``acc_<activity>_csv.zip`` and
    ``gyr_<activity>_csv.zip`` under ``<base_dir>/proband<i>/data`` are
    searched for the first ``frequency: <number> Hz`` that follows
    ``<location>.csv``.

    Args:
        base_dir: Directory containing the ``proband<i>`` folders.
        location: Sensor location whose CSV entry is looked up (e.g. ``'waist'``).
        activities: Iterable of activity names used in the archive file names.
        n_subjects: Number of ``proband`` folders to scan, starting at 1.

    Returns:
        ``(acc_frequencies, gyr_frequencies)``: two lists of floats in scan
        order. Missing archives and archives without a ``readMe`` are skipped
        silently; a ``readMe`` without a matching frequency prints a notice.
    """
    acc_frequencies = []
    gyr_frequencies = []

    # `location` is escaped so it is matched literally, and the decimal part of
    # the frequency is optional so that e.g. "50 Hz" is accepted as well.
    frequency_pattern = re.compile(
        rf'{re.escape(location)}\.csv.*?frequency:\s*(\d+(?:\.\d+)?)\s*Hz',
        re.DOTALL,
    )

    # (file-name prefix, label used in messages, list receiving the values)
    sensors = (
        ('acc', 'ACC', acc_frequencies),
        ('gyr', 'GYR', gyr_frequencies),
    )

    for i in range(1, n_subjects + 1):
        for activity in activities:
            for prefix, label, frequencies in sensors:
                zip_path = os.path.join(
                    base_dir, f'proband{i}', 'data', f'{prefix}_{activity}_csv.zip'
                )
                content = _read_readme(zip_path)
                if content is None:
                    continue
                match = frequency_pattern.search(content)
                if match:
                    frequencies.append(float(match.group(1)))
                else:
                    print(f"Could not find {label} {location} frequency for proband{i}, {activity}.")

    return acc_frequencies, gyr_frequencies

# Root folder that holds one `proband<N>` directory per subject.
base_dir = 'RealWorld'

# Sensor placement and activities to inspect. `location` is also read later,
# when the accelerometer data itself is extracted.
location = 'waist'
activities = ['walking', 'running', 'standing', 'climbingdown', 'climbingup']

acc_frequencies, gyr_frequencies = extract_frequency(base_dir, location, activities)

# Report the range of documented sampling rates for each sensor type.
for sensor_label, sensor_frequencies in (
    ('Accelerometer', acc_frequencies),
    ('Gyroscope', gyr_frequencies),
):
    print()
    print(f"Max {sensor_label} Frequency: {max(sensor_frequencies)} Hz")
    print(f"Min {sensor_label} Frequency: {min(sensor_frequencies)} Hz")


Max Accelerometer Frequency: 50.1 Hz
Min Accelerometer Frequency: 49.94 Hz

Max Gyroscope Frequency: 50.1 Hz
Min Gyroscope Frequency: 49.94 Hz


In [2]:
import os
import zipfile
import numpy as np
import pandas as pd

# Short activity codes (the keys of the extracted data dictionaries) mapped to
# the activity names that appear in the archive and CSV file names.
activity_full_names = dict(
    WAL='walking',
    RUN='running',
    STN='standing',
    CLD='climbingdown',
    CLU='climbingup',
)

def extract_data(zip_path, activity_name, location):
    """Load one accelerometer CSV for ``activity_name``/``location`` from a zip.

    Lookup order inside the archive:
      1. ``acc_<activity>_<location>.csv``
      2. ``acc_<activity>_2_<location>.csv``
      3. ``acc_<activity>_3_<location>.csv``
      4. ``acc_<activity>_<location>.csv`` inside the nested archive
         ``acc_<activity>_1_csv.zip``

    Args:
        zip_path: Path of the outer zip archive.
        activity_name: Activity name as used in the file names.
        location: Sensor location as used in the file names.

    Returns:
        The first CSV found, parsed with ``pd.read_csv``, or ``None`` (after
        printing a notice) when the archive or the CSV is missing.
    """
    if not os.path.exists(zip_path):
        print(f"\t{zip_path} does not exist.")
        return None

    csv_file_name_1 = f'acc_{activity_name}_{location}.csv'
    candidate_names = (
        csv_file_name_1,
        f'acc_{activity_name}_2_{location}.csv',
        f'acc_{activity_name}_3_{location}.csv',
    )
    inner_zip_file = f'acc_{activity_name}_1_csv.zip'

    with zipfile.ZipFile(zip_path) as archive:
        members = archive.namelist()

        # Direct hit: the first candidate present in the outer archive wins.
        for candidate in candidate_names:
            if candidate in members:
                with archive.open(candidate) as handle:
                    return pd.read_csv(handle)

        # Fallback: the CSV may sit inside a nested archive, read without extracting it.
        if inner_zip_file in members:
            with archive.open(inner_zip_file) as nested_handle:
                with zipfile.ZipFile(nested_handle) as nested_archive:
                    if csv_file_name_1 in nested_archive.namelist():
                        with nested_archive.open(csv_file_name_1) as handle:
                            return pd.read_csv(handle)

    print(f"{csv_file_name_1} not found in {zip_path} or {inner_zip_file}.")
    return None

def extract_all_data(activity_full_names, location, base_dir='RealWorld', n_subjects=15):
    """Load the accelerometer x/y/z samples of every subject and activity.

    Args:
        activity_full_names: Mapping of activity code to the activity name used
            in the archive file names.
        location: Sensor location passed on to ``extract_data``.
        base_dir: Directory containing the ``proband<i>`` folders.
        n_subjects: Number of ``proband`` folders to read, starting at 1.

    Returns:
        ``{activity_code: {subject_index: [samples]}}`` where ``subject_index``
        is zero-based (``proband1`` -> 0) and ``samples`` is an array built
        from the ``attr_x``, ``attr_y`` and ``attr_z`` columns. Combinations
        whose data could not be loaded are absent from the result.
    """
    data_dict = {}
    n_missing = 0
    for i in range(1, n_subjects + 1):
        subjectID = f'proband{i}'
        for activity_code, activity_name in activity_full_names.items():
            zip_path = os.path.join(base_dir, subjectID, 'data', f'acc_{activity_name}_csv.zip')
            df = extract_data(zip_path, activity_name, location)
            if df is None:
                n_missing += 1
                print(f"\tData for {location}, {activity_code.upper()}, subject {subjectID} could not be fully extracted.")
                continue
            # np.array(...) makes an independent copy of the selected columns.
            data = np.array(df[['attr_x', 'attr_y', 'attr_z']].values)
            data_dict.setdefault(activity_code, {}).setdefault(i - 1, []).append(data)

    # Only claim success when nothing was skipped.
    if n_missing:
        print(f"Data extraction finished; {n_missing} recording(s) could not be extracted.")
    else:
        print("Data successfully extracted.")
    return data_dict

# Load the accelerometer recordings for every subject and activity. `location`
# is the module-level value assigned in the frequency-check cell above.
data_dict = extract_all_data(activity_full_names, location)

Data successfully extracted.


In [3]:
# Sanity check: how many subjects contributed data to each activity.
for activity, per_subject in data_dict.items():
    print(f"Activity: {activity}, Number of subjects: {len(per_subject)}")    

Activity: WAL, Number of subjects: 15
Activity: RUN, Number of subjects: 15
Activity: STN, Number of subjects: 15
Activity: CLD, Number of subjects: 15
Activity: CLU, Number of subjects: 15


In [4]:
# Show the shape of the first recording stored for each activity/subject pair.
for activity, per_subject in data_dict.items():
    for subject, recordings in per_subject.items():
        print(f"Activity: {activity}, Subject: {subject}, Shape: {recordings[0].shape}")

Activity: WAL, Subject: 0, Shape: (31948, 3)
Activity: WAL, Subject: 1, Shape: (30747, 3)
Activity: WAL, Subject: 2, Shape: (34118, 3)
Activity: WAL, Subject: 3, Shape: (31031, 3)
Activity: WAL, Subject: 4, Shape: (34837, 3)
Activity: WAL, Subject: 5, Shape: (31561, 3)
Activity: WAL, Subject: 6, Shape: (30906, 3)
Activity: WAL, Subject: 7, Shape: (32909, 3)
Activity: WAL, Subject: 8, Shape: (31396, 3)
Activity: WAL, Subject: 9, Shape: (31770, 3)
Activity: WAL, Subject: 10, Shape: (33299, 3)
Activity: WAL, Subject: 11, Shape: (31750, 3)
Activity: WAL, Subject: 12, Shape: (32794, 3)
Activity: WAL, Subject: 13, Shape: (33578, 3)
Activity: WAL, Subject: 14, Shape: (33218, 3)
Activity: RUN, Subject: 0, Shape: (30628, 3)
Activity: RUN, Subject: 1, Shape: (30654, 3)
Activity: RUN, Subject: 2, Shape: (37939, 3)
Activity: RUN, Subject: 3, Shape: (52178, 3)
Activity: RUN, Subject: 4, Shape: (55648, 3)
Activity: RUN, Subject: 5, Shape: (33268, 3)
Activity: RUN, Subject: 6, Shape: (36775, 3)
Activ

In [5]:
import matplotlib.pyplot as plt

def plot_time_series(data_dict, activity, subject):
    """Plot every recording stored for one activity/subject pair.

    Each recording gets its own wide, short figure titled with the activity,
    the subject and the position of the recording in the subject's list.
    """
    for observation_idx, series in enumerate(data_dict[activity][subject]):
        _, ax = plt.subplots(figsize=(30, 2))
        ax.plot(series, linewidth=0.5)
        ax.set_title(f'Activity: {activity}, Subject: {subject}, Observation: {observation_idx}')
        plt.show()

# for activity in data_dict.keys():
    # plot_time_series(data_dict, activity, 0)
    # plot_time_series(data_dict, activity, 2)

In [6]:
def flip_ts(data_dict, subject, activity):
    """Negate, in place, the first recording of ``subject`` for ``activity``.

    The entry at index 0 of ``data_dict[activity][subject]`` is replaced by
    its sign-inverted version; nothing is returned.
    """
    recordings = data_dict[activity][subject]
    recordings[0] = -recordings[0]

# (subject index, activity code) pairs whose first recording is sign-inverted.
# NOTE(review): presumably these recordings were captured with the sensor
# mounted the other way round — confirm against the plots.
# flip_ts mutates data_dict in place, so re-running this cell flips them back.
for subject_idx, activity_code in [
    (2, 'WAL'),
    (2, 'RUN'),
    (2, 'STN'),
    (2, 'CLD'),
    (2, 'CLU'),
    (7, 'CLU'),
]:
    flip_ts(data_dict, subject_idx, activity_code)

# for activity in data_dict.keys():
    # plot_time_series(data_dict, activity, 0)
    # plot_time_series(data_dict, activity, 2)

In [11]:
import numpy as np

# Seed NumPy's global random state so that the np.random.uniform draws made by
# random_rotation_matrix() are repeatable from a fresh run.
seed = 2710
np.random.seed(seed)

def random_rotation_matrix():
    """Build a 3x3 rotation matrix from three random axis angles.

    One angle per axis is drawn uniformly from [0, 2*pi) using NumPy's global
    random state, in the order x, y, z. The result is ``R_z . (R_y . R_x)``.
    """
    full_turn = 2 * np.pi

    # Draw order matters for reproducibility under a fixed seed.
    alpha = np.random.uniform(0, full_turn)
    beta = np.random.uniform(0, full_turn)
    gamma = np.random.uniform(0, full_turn)

    cos_a, sin_a = np.cos(alpha), np.sin(alpha)
    cos_b, sin_b = np.cos(beta), np.sin(beta)
    cos_g, sin_g = np.cos(gamma), np.sin(gamma)

    rot_x = np.array([[1, 0, 0],
                      [0, cos_a, -sin_a],
                      [0, sin_a, cos_a]])

    rot_y = np.array([[cos_b, 0, sin_b],
                      [0, 1, 0],
                      [-sin_b, 0, cos_b]])

    rot_z = np.array([[cos_g, -sin_g, 0],
                      [sin_g, cos_g, 0],
                      [0, 0, 1]])

    # Compose x first, then y, then z.
    rot_yx = np.dot(rot_y, rot_x)
    return np.dot(rot_z, rot_yx)


def augment_data(data_dict, n_augmentations=1):
    """Add randomly rotated copies of every recording.

    Args:
        data_dict: ``{activity: {subject: [recording, ...]}}`` where each
            recording is an array with three columns.
        n_augmentations: Number of rotated copies to create per recording;
            ``0`` produces no copies.

    Returns:
        A new dictionary with the same keys. Each subject's list holds all
        original recordings first, then ``n_augmentations`` rotated copies of
        each recording, in recording order. The input is not modified.
    """
    augmented_data_dict = {}
    for activity, subjects in data_dict.items():
        augmented_data_dict[activity] = {}
        for subject, originals in subjects.items():
            # Keep every original recording, not just the first one; this also
            # makes an empty recording list a no-op instead of an IndexError.
            augmented = list(originals)
            for ts in originals:
                for _ in range(n_augmentations):
                    R = random_rotation_matrix()
                    # Right-multiplying rotates each row (one x/y/z sample).
                    augmented.append(np.dot(ts, R))
            augmented_data_dict[activity][subject] = augmented
    return augmented_data_dict


# n_augmentations=0 means no rotated copies are generated at this point.
augmented_data_dict = augment_data(data_dict, n_augmentations=0)

# for activity in augmented_data_dict.keys():
#     plot_time_series(augmented_data_dict, activity, 0)
    # plot_time_series(augmented_data_dict, activity, 2)

In [12]:
def minmax_normalize(data_dict):
    """Scale every recording to [0, 1] using one global minimum and maximum.

    The extrema are taken over all values of all recordings (all axes
    together), so relative amplitudes between axes, subjects and activities
    are preserved. The global extrema are printed.

    Args:
        data_dict: ``{activity: {subject: [recording, ...]}}`` of numeric arrays.

    Returns:
        A new dictionary of the same structure holding the scaled recordings.
        If all values are identical the range is zero and the result is NaN.

    Raises:
        ValueError: If ``data_dict`` contains no values at all.
    """
    # Reduce each recording separately instead of concatenating the whole
    # dataset, which would temporarily duplicate it in memory. NumPy
    # reductions are used throughout so NaN handling matches ndarray.max/min.
    non_empty = [
        ts
        for subjects in data_dict.values()
        for recordings in subjects.values()
        for ts in recordings
        if np.size(ts)
    ]
    global_max = np.max([np.max(ts) for ts in non_empty])
    global_min = np.min([np.min(ts) for ts in non_empty])

    print(f"Global Max: {global_max:.2f}, Global Min: {global_min:.2f}")

    value_range = global_max - global_min

    return {
        activity: {
            subject: [(ts - global_min) / value_range for ts in recordings]
            for subject, recordings in subjects.items()
        }
        for activity, subjects in data_dict.items()
    }

# Scale all recordings with a single global min/max (printed by the function).
normalized_data_dict = minmax_normalize(augmented_data_dict)

# for activity in normalized_data_dict.keys():
#     plot_time_series(normalized_data_dict, activity, 0)
#     plot_time_series(normalized_data_dict, activity, 2)

Global Max: 19.61, Global Min: -19.61


In [None]:
# Per-activity edge trims consumed by trim_time_series: each value is
# (start, end), i.e. how many samples are dropped from the beginning and from
# the end of every recording of that activity.
trim_dict = {
    'WAL': (600, 1000),
    'RUN': (750, 1500),
    'STN': (0, 1),
    'CLD': (1200, 750),
    'CLU': (1400, 1400),
    # 'JMP': (350, 500),
    # 'LYI': (1500, 1700),
    # 'SIT': (1750, 1250),
}

def trim_time_series(data_dict, trim_dict):
    """Drop a fixed number of samples from both ends of every recording.

    Args:
        data_dict: ``{activity: {subject: [recording, ...]}}``.
        trim_dict: ``{activity: (start, end)}`` with the non-negative number of
            samples to remove from the beginning and from the end.

    Returns:
        A new dictionary of the same structure with the trimmed recordings.
        A recording shorter than ``start + end`` becomes empty.
    """
    new_data_dict = {}
    for activity, subjects in data_dict.items():
        new_data_dict[activity] = {}
        for subject, recordings in subjects.items():
            start, end = trim_dict[activity]
            trimmed = []
            for ts in recordings:
                # An explicit stop index is required: `ts[start:-end]` would
                # return an empty slice for end == 0 because -0 == 0.
                stop = max(len(ts) - end, 0)
                trimmed.append(ts[start:stop])
            new_data_dict[activity][subject] = trimmed

    return new_data_dict

# Remove the per-activity number of samples from both ends of each recording.
trim_data_dict = trim_time_series(normalized_data_dict, trim_dict)

# for activity in trim_data_dict.keys():
#     plot_time_series(trim_data_dict, activity, 0)
#     plot_time_series(trim_data_dict, activity, 2)

In [None]:
def trim_time_series_more(data_dict, trim_dict):
    """Cut index ranges out of each recording and return the remaining pieces.

    Args:
        data_dict: ``{activity: {subject: [recording, ...]}}``.
        trim_dict: ``{activity: ranges_per_subject}`` where
            ``ranges_per_subject[subject]`` is a list of ``(x1, x2)`` pairs;
            samples ``x1 .. x2 - 1`` are removed from every recording of that
            subject. Ranges may be given in any order and may overlap.

    Returns:
        ``{activity: {subject: [piece, ...]}}`` where the pieces are the
        stretches between the removed ranges, in order, for all recordings of
        the subject. ``trim_dict`` is left unmodified.
    """
    new_data_dict = {}

    for activity, subjects in data_dict.items():
        new_data_dict[activity] = {}
        for subject, recordings in subjects.items():
            # sorted() rather than list.sort() so the caller's lists keep their order.
            cut_ranges = sorted(trim_dict[activity][subject], key=lambda r: r[0])
            pieces = []

            for ts in recordings:
                cursor = 0  # first sample not yet emitted or removed
                for cut_start, cut_end in cut_ranges:
                    if cut_start > cursor:
                        pieces.append(ts[cursor:cut_start])
                    # Never move backwards: a range nested in or overlapping a
                    # previous one must not bring removed samples back.
                    cursor = max(cursor, cut_end)

                # Whatever follows the last cut is kept as the final piece.
                if cursor < len(ts):
                    pieces.append(ts[cursor:])

            new_data_dict[activity][subject] = pieces

    return new_data_dict


# Interior cuts consumed by trim_time_series_more: for each activity, a list
# with one entry per subject position (15 entries), each holding the (x1, x2)
# sample ranges to remove from that subject's recordings. The function is fed
# trim_data_dict, so the indices refer to the already edge-trimmed series.
# NOTE: this rebinds the name `trim_dict` that the edge-trimming cell above
# uses for its (start, end) tuples; re-running that cell afterwards would fail.
trim_dict = {
    'WAL': [[] for _ in range(15)],
    'STN': [[] for _ in range(15)],
    'RUN': [[],
            [],
            [(11000, 19000)],
            [(8000, 22500), (25900, 32500), (35000, 46000)],
            [(9000, 11500), (17500, 28000), (35000, 49000)],
            [(11500, 14000)],
            [(19000, 23000)],
            [],
            [(5000, 16000)],
            [],
            [],
            [],
            [],
            [],
            [(2000, 3000), (27500, 32000)]],
    'CLD': [[] for _ in range(3)] + [[(4400, 4900)]] + [[] for _ in range(11)],
    'CLU': [[] for _ in range(7)] + [[(23000, 43000)]] + [[] for _ in range(7)],
    # 'JMP': [[] for _ in range(15)],
    # 'LYI': [[] for _ in range(15)],
    # 'SIT': [[] for _ in range(15)],
}

# Cut the listed interior ranges out of the edge-trimmed recordings; each
# subject ends up with a list of the remaining pieces.
trimmore_data_dict = trim_time_series_more(trim_data_dict, trim_dict)

# for activity in trimmore_data_dict.keys():
#     plot_time_series(trimmore_data_dict, activity, 0)
#     plot_time_series(trimmore_data_dict, activity, 2)

In [None]:
# Report how many pieces each subject has per activity after the interior cuts.
for activity, per_subject in trimmore_data_dict.items():
    for subject, pieces in per_subject.items():
        print(f"Activity: {activity}, Subject: {subject}, Number of Observations: {len(pieces)}")

In [None]:
def split_time_series(data_dict, split_length=128):
    """Cut every piece into consecutive, non-overlapping fixed-length windows.

    Args:
        data_dict: ``{activity: {subject: [piece, ...]}}``.
        split_length: Number of samples per window. Samples left over at the
            end of a piece (fewer than ``split_length``) are discarded.

    Returns:
        ``{activity: [windows_of_subject, ...]}``: one list of windows per
        subject, in the iteration order of ``data_dict[activity]``. Subject
        keys are therefore replaced by list positions.

    Raises:
        ValueError: If ``split_length`` is not positive.
    """
    if split_length <= 0:
        raise ValueError("split_length must be a positive integer")

    split_data_dict = {}
    for activity, subjects in data_dict.items():
        per_subject_windows = []
        for pieces in subjects.values():
            windows = []
            for part in pieces:
                n_windows = len(part) // split_length
                windows.extend(
                    part[w * split_length:(w + 1) * split_length]
                    for w in range(n_windows)
                )
            per_subject_windows.append(windows)
        split_data_dict[activity] = per_subject_windows

    return split_data_dict

# Cut the remaining pieces into fixed-length windows (one list per subject).
split_data_dict = split_time_series(trimmore_data_dict)

In [None]:
# Report the number of windows per subject (subjects numbered from 1 here).
for activity, per_subject_windows in split_data_dict.items():
    for subject_number, subject in enumerate(per_subject_windows, start=1):
        print(f"Activity: {activity}, Subject: {subject_number}, Number of time series: {len(subject)}")

In [None]:
def plot_data(data, activity):
    """Show a 6 x 15 grid of sample windows for one activity.

    Column ``i`` belongs to subject position ``i`` of ``data[activity]``; the
    six rows show windows taken at evenly spaced positions (multiples of
    ``len(windows) // 6``) of that subject's window list. The y-axis is fixed
    to [0, 1] and the figure title uses the module-level ``activity_full_names``.
    """
    n_rows, n_subjects = 6, 15
    fig, axs = plt.subplots(n_rows, n_subjects, figsize=(30, 8))
    fig.suptitle(f'Activity: {activity_full_names[activity]}', fontsize=16)

    for i in range(n_subjects):
        windows = data[activity][i]
        step = len(windows) // n_rows
        for j in range(n_rows):
            window_idx = j * step
            ax = axs[j, i]
            ax.plot(windows[window_idx], linewidth=0.5)
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
            ax.set_ylim(0, 1)
            ax.set_title(f'Subject: {i}, Part: {window_idx}')

    plt.tight_layout()
    plt.show()

# One overview grid per activity, in the same order as before.
for activity_code in ('WAL', 'RUN', 'STN', 'CLD', 'CLU'):
    plot_data(split_data_dict, activity_code)
# plot_data(data_dict, 'JMP')
# plot_data(data_dict, 'LYI')
# plot_data(data_dict, 'SIT')

In [None]:
def create_dataset(data, activities=['WAL', 'RUN', 'STN', 'CLD', 'CLU']):
    """Flatten the windowed data into sample, label and subject arrays.

    Args:
        data: ``{activity: [windows_of_subject, ...]}`` where every window has
            the same shape.
        activities: Activity codes to include. Labels are assigned 0, 1, ...
            in the key order of ``data`` (not the order of this list),
            counting only the included activities.

    Returns:
        ``(x, y, k)``: ``x`` holds the stacked windows with their last two
        axes swapped, ``y`` the activity label of each window and ``k`` the
        position of its subject within ``data[activity]``.
    """
    selected = [name for name in data.keys() if name in activities]

    windows, labels, subject_ids = [], [], []
    for label, name in enumerate(selected):
        for subject_idx, subject_windows in enumerate(data[name]):
            n_windows = len(subject_windows)
            windows.extend(subject_windows)
            labels.extend([label] * n_windows)
            subject_ids.extend([subject_idx] * n_windows)

    stacked = np.array(windows)
    return stacked.transpose(0, 2, 1), np.array(labels), np.array(subject_ids)

# Build the arrays from the windowed data. STN is left out of `activities`, so
# the four remaining activities are labelled 0-3 in the key order of
# split_data_dict.
x, y, k = create_dataset(split_data_dict, activities=['WAL', 'RUN', 'CLD', 'CLU'])
print(f"X shape: {x.shape}, Y shape: {y.shape}, K shape: {k.shape}")

In [None]:
import numpy as np
import random

# Fix both RNGs so the few-shot selection is repeatable.
random.seed(0)
np.random.seed(0)

# Number of samples flagged per (subject, activity) group.
N_SHOTS = 5

# fs[i] == 1 marks sample i as part of the few-shot subset.
fs = np.zeros_like(k)

for subject in np.unique(k):
    subject_mask = k == subject
    for activity in np.unique(y):
        indices = np.nonzero(subject_mask & (y == activity))[0]

        # Groups with fewer than N_SHOTS samples contribute nothing.
        if len(indices) < N_SHOTS:
            continue

        selected_indices = np.random.choice(indices, N_SHOTS, replace=False)
        fs[selected_indices] = 1

# Verify the result
print("Number of 1s in fs:", np.sum(fs))


In [None]:
# import pickle

# # Save the data to a pickle file
# with open('realworld_znorm.pkl', 'wb') as f:
#     pickle.dump((x, y, k), f)

# # Save the few-shot vector to a pickle file
# with open('realworld_znorm_fs.pkl', 'wb') as f:
#     pickle.dump(fs, f)