# Dataset Processing:
## Takes two datasets (OpenBCI data and Keylogger), cleans them, merges them based on timestamp, and exports an individual .csv file for each class.

# Import Libraries

In [80]:
import pandas as pd
import numpy as np
import datetime

# Load Dataset

In [81]:
# Read the raw OpenBCI recording. The column names sit on zero-based row 4,
# so pandas skips the lines above it (presumably recorder metadata — confirm).
header_row_number = 4
data = pd.read_csv(
    '/content/OpenBCI-RAW-2024-03-08_10-40-02.txt',
    header=header_row_number,
    on_bad_lines='warn',
)

# Trim surrounding whitespace from every header name so columns can be
# addressed by their plain names.
data = data.rename(columns=str.strip)

# Read the keylogger export; its column names are on the first row.
header_row_number = 0
key_logs = pd.read_csv(
    '/content/KEYLOGGER_2024-03-08_10_39_21.csv',
    header=header_row_number,
    on_bad_lines='warn',
)

# Clean Data

In [82]:
# Columns retained for the rest of the pipeline: the sample counter, the
# eight EXG channels, and the formatted timestamp used later as the merge key.
columns_to_keep = [
    'Sample Index',
    'EXG Channel 0',
    'EXG Channel 1',
    'EXG Channel 2',
    'EXG Channel 3',
    'EXG Channel 4',
    'EXG Channel 5',
    'EXG Channel 6',
    'EXG Channel 7',
    'Timestamp (Formatted)',
]

# Remove unnecessary columns
data = data[columns_to_keep]

# Drop rows containing N/A values and report how many were removed.
rows_before = len(data)
dropped_data = data.dropna()
dropped_count = rows_before - len(dropped_data)

# Assign the cleaned frame back to `data` so that later steps (merge and
# export) never see incomplete rows.
data = dropped_data
print(f'Dropped {dropped_count} N/A rows')

Dropped 0 N/A rows


# Merge Datasets

In [83]:
# Both sources are parsed and re-rendered with the same second-resolution
# format, so the merge key is an identical string on each side.
timestamp_format = '%Y-%m-%d %H:%M:%S'

# Convert timestamps to datetime objects
data['Timestamp (Formatted)'] = pd.to_datetime(data['Timestamp (Formatted)'], format=timestamp_format)
key_logs['Timestamp'] = pd.to_datetime(key_logs['Timestamp'], format=timestamp_format)

# Extract date and time up to seconds
data['Timestamp (Formatted)'] = data['Timestamp (Formatted)'].dt.strftime(timestamp_format)
key_logs['Timestamp'] = key_logs['Timestamp'].dt.strftime(timestamp_format)

# Left-merge so every recording row is kept; rows whose second has no
# key-log entry get NaN in the key-log columns.
# NOTE(review): if the key log can contain several entries for the same
# second, this merge repeats the matching recording rows once per entry —
# confirm that is intended.
merged_data = data.merge(key_logs, left_on='Timestamp (Formatted)', right_on='Timestamp', how='left')

# Carry the last known class forward into the rows that had no key-log match.
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
merged_data['Class'] = merged_data['Class'].ffill()


# Obtain Final Dataset

In [84]:
# Build the final frame in one pass:
#   1. remove the sample counter and both timestamp columns, which are not
#      needed once the merge is done;
#   2. discard rows that still have no value in 'Class';
#   3. renumber the index from zero.
final_data = (
    merged_data
    .drop(columns=['Sample Index', 'Timestamp', 'Timestamp (Formatted)'])
    .dropna(subset=['Class'])
    .reset_index(drop=True)
)

# Export Datasets per Class

In [87]:
# A single suffix, taken once, is shared by every file written in this run.
timestamp = f'{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}'

# Distinct labels found in the 'Class' column.
class_values = final_data['Class'].unique()

# Write one CSV per class, named '<class>_<timestamp>.csv'. The 'Class'
# column is removed from each file and the row index is not written.
for class_value in class_values:
    belongs_to_class = final_data['Class'] == class_value
    class_rows = final_data.loc[belongs_to_class].drop(columns=['Class'])
    class_rows.to_csv(f'{class_value}_{timestamp}.csv', index=False)