In [1]:
import os
import json
import numpy as np
from tqdm import tqdm

In [2]:
def sensor_subsampled_string(data, n=20):
    """Return a string rendering of ``n`` evenly spaced elements of ``data``.

    Indices are spread uniformly between the first and last position with
    ``np.linspace`` and rounded to the nearest integer, so both endpoints are
    included whenever ``n >= 2``. If ``n`` is larger than ``len(data)`` some
    elements are repeated. A notice is printed when more than ten input
    elements are collapsed into each kept element.

    Args:
        data: An indexable sequence that supports ``len()``.
        n: Number of elements to keep; must be at least 1.

    Returns:
        ``str()`` of the list of selected elements, or ``"[]"`` when ``data``
        is empty.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Previously n == 0 surfaced as a ZeroDivisionError from the ratio below.
        raise ValueError(f"n must be a positive integer, got {n}")

    size = len(data)
    if size == 0:
        # linspace(0, -1, n) would yield indices into an empty sequence.
        return "[]"

    ratio = size / n
    if ratio > 10:
        print(f"High compression: {ratio}")

    indices = np.round(np.linspace(0, size - 1, n)).astype(int)
    return str([data[idx] for idx in indices])

In [3]:
from openai import OpenAI
import json


# Module-level API client; it is used by the chat-completion request in this cell.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()


def limubert_sample_to_sensorcaps(summary, accl_str, gyro_str):
    """Query the chat model about one IMU sample and return the dialogue as JSON.

    A fixed system prompt and a user message built from the arguments are sent
    through the module-level ``client``. The model's reply is appended as the
    assistant turn and the whole conversation is serialized.

    Args:
        summary: Short description of the activity, placed after 'Summary:'.
        accl_str: Accelerometer readings as text, placed after 'Accelerometer:'.
        gyro_str: Gyroscope readings as text, placed after 'Gyroscope:'.

    Returns:
        A JSON string of the form ``{"messages": [system, user, assistant]}``.
    """
    # NOTE(review): the fragments are joined verbatim, and there is no space
    # after "datasets." nor after "'Features:'.", so those sentences run
    # together in the prompt. Left as-is because this exact text is also
    # written into the returned record.
    system_prompt = (
        "Please consider yourself to be an expert on gyroscope and accelerometer sensor "
        "information given as a metadata of IMU datasets."
        "You are given the IMU sensor readings of a human activity. "
        "The user also provides a brief summary of the event followed by 'Summary:'. "
        "They also give you gyroscopic and accelerometer sensor data followed by "
        "'Gyroscope:' and 'Accelerometer:' respectively. "
        "They are written in a Python list of lists format and contain x, y, and z "
        "axis data respectively. "
        "You should provide a comprehensive details of what the characteristic IMU "
        "features for that event would be within 10 words, followed by 'Features:'."
        "Then, narrate the temporal event with details that are context-aware "
        "based on the sensor data, followed by 'Narration:', in a step-by-step "
        "fashion, analyzing it within 150 words or less."
    )
    user_prompt = (
        f"Summary: {summary}, Gyroscope: {gyro_str} Accelerometer: {accl_str}"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=300,
    )

    # Record the model's answer as the final turn of the conversation.
    reply = response.choices[0].message.content
    conversation.append({"role": "assistant", "content": reply})
    return json.dumps({"messages": conversation})

In [4]:
import random


# Number of samples drawn at random from each dataset.
SAMPLES_PER_DATASET = 25
# Fixed seed so that the same samples are selected on every run.
SAMPLING_SEED = 0

root_data_dir = "/hdd/LLM/limuBERT_data/extracted_data"
datasets = sorted(os.listdir(root_data_dir))

data_file_name = "data_20_120.npy"
label_file_name = "label_20_120.npy"


# Maps dataset name -> {label id (as string) -> activity description}.
with open('/hdd/LLM/limuBERT_data/dataset_activity_label.json') as json_file:
    dataset_activity_label_dict = json.load(json_file)


# Per-dataset index along the last axis of the label array that is used to
# look up the activity id.
last_axis_dict = {
    "hhar": 2, "motion": 0, "shoaib": 0, "uci": 0
}

# Local generator: reproducible draws without touching the global `random` state.
sampler = random.Random(SAMPLING_SEED)

with open("sensorcaps_untrained.jsonl", "w") as f:
    for dataset in datasets:
        data = np.load(os.path.join(root_data_dir, dataset, data_file_name))
        label = np.load(os.path.join(root_data_dir, dataset, label_file_name))
        label_dict = dataset_activity_label_dict[dataset]
        last_axis = last_axis_dict[dataset]
        print(dataset)

        # Draw from every sample (index 0 included) and never request more
        # samples than the dataset holds, which would make sample() raise.
        n_available = data.shape[0]
        n_draw = min(SAMPLES_PER_DATASET, n_available)
        indices = sampler.sample(range(n_available), n_draw)
        for sample_index in tqdm(indices):
            sample_data = data[sample_index]
            # The label stored at position 0 of the second axis stands for the
            # whole sample.
            key = str(int(label[sample_index, 0, last_axis]))
            sample_label = label_dict[key]
            # Slice the *selected* sample (not data[0]) so that the readings
            # match the label: columns 0-2 are treated as accelerometer and
            # columns 3-5 as gyroscope.
            accl, gyro = sample_data[:, 0:3], sample_data[:, 3:6]
            accl_str = str(accl.tolist())
            gyro_str = str(gyro.tolist())
            sensorcaps_sample = limubert_sample_to_sensorcaps(
                summary=sample_label,
                accl_str=accl_str,
                gyro_str=gyro_str
            )
            f.write(sensorcaps_sample + "\n")

hhar


  0%|                                                                                      | 0/25 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 25/25 [01:45<00:00,  4.23s/it]


motion


100%|█████████████████████████████████████████████████████████████████████████████| 25/25 [02:17<00:00,  5.50s/it]


shoaib


100%|█████████████████████████████████████████████████████████████████████████████| 25/25 [02:06<00:00,  5.06s/it]


uci


100%|█████████████████████████████████████████████████████████████████████████████| 25/25 [02:03<00:00,  4.96s/it]
