## Pre-processing code
This code will convert the original data file into a csv for the Aruba dataset. Saved as pre_processed_data.csv

In [2]:
import re
import datetime
import time
import csv
import numpy as np
import pandas as pd

In [4]:
# Open the input file for reading
with open("Raw Data/Aruba_17/data", "r") as f:
    data = f.readlines()
    # only read the first 1000 lines

# Create an empty list to store the processed data
processed_data = []

# Possible activities
activities = ["Meal_Preparation", "Relax", "Eating", "Work", "Sleeping", "Wash_Dishes", "Bed_to_Toilet", "Enter_Home", "Leave_Home", "Housekeeping", "Respirate"]

# Loop through each line of the data
for line in data:
    # Split the line into its components
    components = re.split("\s+", line.strip())

    date = components[0]
    time = components[1]
    device_id = components[2]
    device_status = components[3]
    if len(components) > 4:
        activity = components[4]
        activity_status = components[5]

    try:
        timestamp = datetime.datetime.strptime(f'{date} {time}', '%Y-%m-%d %H:%M:%S.%f')
        timestamp = int(timestamp.timestamp())

    except ValueError:
        timestamp = datetime.datetime.strptime(f'{date} {time}', '%Y-%m-%d %H:%M:%S')
        timestamp = int(timestamp.timestamp())

    if device_id.startswith("M"):
        if device_status.startswith("ON"):
            device_status = "ON"
        elif device_status.startswith("OFF"):
            device_status = "OFF"

    # Append the processed data to the list
    if len(components) > 4:
        processed_data.append([timestamp, device_id, device_status, activity, activity_status])
    else:
        processed_data.append([timestamp, device_id, device_status, "", ""])

# Write the processed data to a new file
with open("Processed Data/Aruba_17/pre_processed_data.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Timestamp", "Device ID", "Status", "Activity", "Activity Status"])
    for data in processed_data:
        writer.writerow(data)

## Model Processing Code
The original data is saved in a way that the model cannot use. The model needs the data to be numerical. This code will convert the previous file into a csv that the model can be trained on. Saved as processed_data.csv

In [5]:
from sklearn.preprocessing import LabelEncoder
# Load the processed data file
data = pd.read_csv("Processed Data/Aruba_17/pre_processed_data.csv")

# Encode the following columns: Timestamp,Device ID,Status,Activity,Activity Status
timestamp_encoder = LabelEncoder()
device_id_encoder = LabelEncoder()
status_encoder = LabelEncoder()
activity_encoder = LabelEncoder()
activity_status_encoder = LabelEncoder()

timestamp_encoder.fit(data['Timestamp'])
device_id_encoder.fit(data['Device ID'])
status_encoder.fit(data['Status'])
activity_encoder.fit(data['Activity'])
activity_status_encoder.fit(data['Activity Status'])

data['Timestamp'] = timestamp_encoder.transform(data['Timestamp'])
data['Device ID'] = device_id_encoder.transform(data['Device ID'])
data['Status'] = status_encoder.transform(data['Status'])
data['Activity'] = activity_encoder.transform(data['Activity'])
data['Activity Status'] = activity_status_encoder.transform(data['Activity Status'])

data.to_csv('Processed Data/Aruba_17/processed_data.csv', index=False)

## Model Post-processing Code
This code will inport the prediction data from the model. It will then convert the Label Encoded data back into the original labels. Saved as COMPLETE_PREDICTION.csv

In [7]:
data = pd.read_csv("Predictions/Aruba_17_prediction.csv")
# data = pd.read_csv("Processed Data/Aruba_17/processed_data.csv")

# use inverse_transform to get the original values

data['Timestamp'] = timestamp_encoder.inverse_transform(data['Timestamp'])
data['Device ID'] = device_id_encoder.inverse_transform(data['Device ID'])
data['Status'] = status_encoder.inverse_transform(data['Status'])
data['Activity'] = activity_encoder.inverse_transform(data['Activity'])
data['Activity Status'] = activity_status_encoder.inverse_transform(data['Activity Status'])

# undo the timestamp code to get the original two columns, date and time, in the following form: 2010-11-04 00:03:50.209589
data['Timestamp'] = data['Timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S.%f'))

# split the timestamp column into two columns, date and time
data[['Date', 'Time']] = data['Timestamp'].str.split(' ', 1, expand=True)

# drop the timestamp column
data = data.drop(columns=['Timestamp'])

# reorder the columns
data = data[['Date', 'Time', 'Device ID', 'Status', 'Activity', 'Activity Status']]

# save the data to a new file
data.to_csv('Predictions/Aruba_17_completed_prediction.csv', index=False)


IndexError: arrays used as indices must be of integer (or boolean) type