# SensorCaps
SensorCaps utlizes IMU sensor event or human activity characteristic feature and narration extraction. In summary, it's "sensor event captioning" or "human activity captioning." This is essential for generating context-aware question-answer pairs.

In [1]:
import os
import json
import numpy as np
from tqdm import tqdm

In [2]:
def sensor_subsampled_string(data, n=60):
    if len(data)/n>10:
        print(f"High compression: {len(data)/n}")
    indices = np.round(np.linspace(0, len(data) - 1, n)).astype(int)
    return str([list(np.round(data[idx],6)) for idx in indices])

In [7]:
from openai import OpenAI
import json


client = OpenAI()


def limubert_sample_to_sensorcaps(label, accl_str, gyro_str):
    messages = [
        {
            "role": "system",
            "content": (
                "Please consider yourself to be an expert on gyroscope and accelerometer sensor "
                "information given as a metadata of IMU datasets."
                "You are given the IMU sensor readings of a human activity. "
                "They also give you gyroscopic and accelerometer sensor data followed by "
                "'Gyroscope:' and 'Accelerometer:' respectively. "
                "They are written in a Python list of lists format and contain x, y, and z "
                "axis data respectively. Please pay attention to the values and the signs. "
                "You should be able to identify the class from gyroscope and accelerometer data. "
                "The accelerometer data is normalized by being divided by 9.8. "
                "The given IMU sensor data can be associated with one of the following classes: "
                f"{unique_classes}. "
                "Write only the name of the identified class. "
            )
        },
        {
            "role": "user",
            "content": (
                f"Gyroscope: {gyro_str}\n"
                f"Accelerometer: {accl_str}"
            )
        },
        {
            "role": "assistant",
            "content": label,
        }
    ]
    # params = {
    #     "model": "gpt-3.5-turbo",
    #     "messages": messages,
    #     "max_tokens": 30,
    # }

    # result = client.chat.completions.create(**params)
    # narration = result.choices[0].message.content
    # messages.append({"role": "assistant", "content": narration})
    sensorcaps_sample = json.dumps({"messages": messages})
    return sensorcaps_sample

In [8]:
import random
random.seed(42)
PER_CLASS_SAMPLES = 100
validation_sample = 20





with open("/hdd/LLM/limuBERT_data/dataset_activity_label.json","r") as f:
    dataset_activity_label_dict = json.load(f)

unique_classes = set()
for val in dataset_activity_label_dict.values():
    unique_classes.update(val.values())
print(unique_classes)


unique_classes_dict = {}

for idx, unique_class in enumerate(unique_classes):
    unique_classes_dict[unique_class] = idx

# UNCLEAR_LABEL = "Unclear"
# unique_classes_dict[UNCLEAR_LABEL] = idx+1

print(unique_classes_dict)


root_data_dir = "/hdd/LLM/limuBERT_data/extracted_data"
datasets = sorted(os.listdir(root_data_dir))

data_file_name = "data_20_120.npy"
label_file_name = "label_20_120.npy"

last_axis_dict = {
    "hhar": 2, "motion":0, "shoaib":0, "uci":0
}

label_count_dict = {}
for val in unique_classes_dict.keys():
    label_count_dict[val] = 0
print(label_count_dict)
all_train_data = []
all_train_label = []
all_val_data = []
all_val_label = []
for dataset in datasets: 
    data = np.load(os.path.join(root_data_dir, dataset, data_file_name))
    label = np.load(os.path.join(root_data_dir, dataset, label_file_name))
    
    last_axis = last_axis_dict[dataset]
    label_dict = dataset_activity_label_dict[dataset]
    for idx in tqdm(range(len(data))):
        # sample_index = random.randint(0, len(data))
        sample_index = idx
        sample_label = label_dict[str(int(label[sample_index, 0, last_axis]))]
        if label_count_dict[sample_label] < PER_CLASS_SAMPLES:
            label_count_dict[sample_label]+=1
            all_train_data.append(data[idx])
            all_train_label.append(sample_label)
        elif label_count_dict[sample_label] >=PER_CLASS_SAMPLES and label_count_dict[sample_label] < PER_CLASS_SAMPLES +validation_sample:
            label_count_dict[sample_label]+=1
            all_val_data.append(data[idx])
            all_val_label.append(sample_label)

        else:
            continue
# print(label_count_dict)
# print(len(all_train_data), len(all_train_label), len(all_val_data), len(all_val_label))

with open("sensor_data_for_tuning_training_10hz.jsonl","w") as f:
    for idx, data in enumerate(all_train_data):
        sample_index = idx
        sample_label = all_train_label[sample_index]
        accl, gyro = data[:, 0:3], data[:, 3:6]
        accl, gyro = accl.tolist(), gyro.tolist()
        accl_str = sensor_subsampled_string(accl)
        gyro_str = sensor_subsampled_string(gyro)
        label
        sensorcaps_sample = limubert_sample_to_sensorcaps(
            label=sample_label,
            accl_str=accl_str,
            gyro_str=gyro_str
        )
        f.write(sensorcaps_sample+"\n")

with open("sensor_data_for_tuning_validating_10hz.jsonl","w") as f:
    for idx, data in enumerate(all_val_data):
        sample_index = idx
        sample_label = all_val_label[sample_index]
        accl, gyro = data[:, 0:3], data[:, 3:6]
        accl, gyro = accl.tolist(), gyro.tolist()
        accl_str = sensor_subsampled_string(accl)
        gyro_str = sensor_subsampled_string(gyro)
        label
        sensorcaps_sample = limubert_sample_to_sensorcaps(
            label=sample_label,
            accl_str=accl_str,
            gyro_str=gyro_str
        )
        f.write(sensorcaps_sample+"\n")


{'biking', 'walking', 'decending stairs', 'jogging', 'sitting', 'lying', 'standing', 'climbing stairs'}
{'biking': 0, 'walking': 1, 'decending stairs': 2, 'jogging': 3, 'sitting': 4, 'lying': 5, 'standing': 6, 'climbing stairs': 7}
{'biking': 0, 'walking': 0, 'decending stairs': 0, 'jogging': 0, 'sitting': 0, 'lying': 0, 'standing': 0, 'climbing stairs': 0}


100%|██████████| 9166/9166 [00:00<00:00, 1165105.63it/s]


100%|██████████| 4534/4534 [00:00<00:00, 1229519.26it/s]
100%|██████████| 10500/10500 [00:00<00:00, 1160143.09it/s]
100%|██████████| 2088/2088 [00:00<00:00, 1176320.58it/s]


In [10]:
from openai import OpenAI
client = OpenAI()
training_file_name = '/hdd/LLM/SLU/fine-tuning/sensor_data_for_tuning_training_10hz.jsonl'
validation_file_name = '/hdd/LLM/SLU/fine-tuning/sensor_data_for_tuning_validating_10hz.jsonl'
training_file_id = client.files.create(
  file=open(training_file_name, "rb"),
  purpose="fine-tune"
)

validation_file_id = client.files.create(
  file=open(validation_file_name, "rb"),
  purpose="fine-tune"
)

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

Training File ID: FileObject(id='file-C1y8wrn3erRF5HY6bT0REJ5T', bytes=3962083, created_at=1717450502, filename='sensor_data_for_tuning_training_10hz.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
Validation File ID: FileObject(id='file-kAzDmDzK73zpiCA0FBYpr5No', bytes=791851, created_at=1717450503, filename='sensor_data_for_tuning_validating_10hz.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [11]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id.id, 
    validation_file=validation_file_id.id,
    model="gpt-3.5-turbo", #need to change also hyperparamter
    # hyperparame ters={
        # "n_epochs": 15,
    #     "batch_size": 16,
    #     "learning_rate_multiplier": 0.3
    # }
)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-hUNC3jdQZ5tSDytAL2mqmzAx.
Training Response: FineTuningJob(id='ftjob-hUNC3jdQZ5tSDytAL2mqmzAx', created_at=1717450566, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-2dkokpHrNWDe1oeSGFFmuDJ0', result_files=[], status='validating_files', trained_tokens=None, training_file='file-C1y8wrn3erRF5HY6bT0REJ5T', validation_file='file-kAzDmDzK73zpiCA0FBYpr5No', user_provided_suffix=None, seed=32364238, estimated_finish=None, integrations=[])
Training Status: validating_files


In [15]:
import signal
import datetime


def signal_handler(sig, frame):
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return


print(f"Streaming events for the fine-tuning job: {job_id}")

signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
try:
    for event in events:
        print(
            f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
        )
except Exception:
    print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ftjob-hUNC3jdQZ5tSDytAL2mqmzAx
2024-06-03 18:52:20 The job has successfully completed
2024-06-03 18:52:14 New fine-tuned model created: ft:gpt-3.5-turbo-0125:worcester-polytechnic-institute::9WApCtou
2024-06-03 18:52:14 Checkpoint created at step 1600 with Snapshot ID: ft:gpt-3.5-turbo-0125:worcester-polytechnic-institute::9WApCDCy:ckpt-step-1600
2024-06-03 18:52:14 Checkpoint created at step 800 with Snapshot ID: ft:gpt-3.5-turbo-0125:worcester-polytechnic-institute::9WApBfnL:ckpt-step-800
2024-06-03 18:52:08 Step 2400/2400: training loss=0.00, validation loss=0.00, full validation loss=0.27
2024-06-03 18:51:50 Step 2399/2400: training loss=0.00
2024-06-03 18:51:48 Step 2398/2400: training loss=0.00
2024-06-03 18:51:46 Step 2397/2400: training loss=0.00
2024-06-03 18:51:44 Step 2396/2400: training loss=0.00
2024-06-03 18:51:44 Step 2395/2400: training loss=0.00
2024-06-03 18:51:42 Step 2394/2400: training loss=0.00
2024-06-03 18:51:40 Step 239

In [17]:
import time

status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
    print(f"Job not in terminal status: {status}. Waiting.")
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")
else:
    print(f"Finetune job {job_id} finished with status: {status}")
print("Checking other finetune jobs in the subscription.")
result = client.fine_tuning.jobs.list()
print(f"Found {len(result.data)} finetune jobs.")

Finetune job ftjob-hUNC3jdQZ5tSDytAL2mqmzAx finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 3 finetune jobs.


In [18]:
# Retrieve the finetuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)



ft:gpt-3.5-turbo-0125:worcester-polytechnic-institute::9WApCtou
