### Import packages

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import avro
import avro.schema
from avro.datafile import DataFileReader
from avro.io import DatumReader
import pandas as pd
from datetime import datetime
import pytz
import neurokit2 as nk

### Define paths

In [2]:
file_ref = pd.read_excel('/Users/dhwanishah/Desktop/MS/VR-Dhwani/avros-to-analyze.xlsx')
input_dir = '/Users/dhwanishah/Desktop/MS/VR-Dhwani/subject-avros'
output_dir = '/Users/dhwanishah/Desktop/MS/VR-Dhwani/subject-csvs'

### Define functions

In [3]:
def extract_eda_from_avro(file_path):
    with open(file_path, 'rb') as avro_file:
        reader = DataFileReader(avro_file, DatumReader())
        record = next(reader)
        rawData = record.get('rawData', None)
        eda = rawData.get('eda', None)
    return eda

In [4]:
def convert_est_edt_to_utc(eastern_time):
    eastern = pytz.timezone('America/New_York')
    eastern_time = eastern.localize(eastern_time)
    utc_time = eastern_time.astimezone(pytz.utc)
    return utc_time

### Process data

In [5]:
for index, row in file_ref.iterrows():
    input_path = (os.path.join(input_dir, row['SubID'], row['FileName-1']))
    output_path = f"{output_dir}{row['SubID']}/session-{row['Session']}"
    
    start_time = convert_est_edt_to_utc(row['Start']).replace(tzinfo=None)
    start_time = pd.to_datetime(start_time, unit='s', utc=True)
    
    end_time = convert_est_edt_to_utc(row['End']).replace(tzinfo=None)
    end_time = pd.to_datetime(end_time, unit='s', utc=True)
    
    data = extract_eda_from_avro(input_path)
    values = data['values']
    timestamp_start = data['timestampStart']
    sampling_frequency = data['samplingFrequency']
    print(f"sampling frequency:{sampling_frequency}")
    
    # If eda data is split into 2 files, load in the values from file2
    if isinstance(row['FileName-2'], str):
        input_path2 = (os.path.join(input_dir, row['SubID'], row['FileName-2']))
        data2 = extract_eda_from_avro(input_path2)
        values2 = data2['values']
        values.extend(values2)
    
    # Smooth eda values
    values_smoothed = nk.signal_smooth(values, method = 'convolution', kernel = 'boxcar', size=3)
    
    # Convert timestamp_start from microseconds to seconds
    timestamp_start_seconds = timestamp_start / 1_000_000
    
    # Generate a list of timestamps
    time_seconds = list(range(len(values)))
    timestamps = [timestamp_start_seconds + t / sampling_frequency for t in time_seconds]
    
    # Convert timestamps to datetime objects
    datetime_objects = pd.to_datetime(timestamps, unit='s', utc=True)
    
    # Create a DataFrame with timestamps, raw, EDA values, and smoothed EDA values
    df = pd.DataFrame({'timestamp': datetime_objects,
                       'eda_raw': values, 
                       'eda_smoothed': values_smoothed})
    
    # Trim the DataFrame based on start and end time points of the experimental session
    df = df[(df['timestamp'] >= start_time) & (df['timestamp'] <= end_time)].reset_index(drop=True)
    
    # Process the smoothed eda signal; add processed signals to DataFrame 
    signals, info = nk.eda_process(df['eda_smoothed'], sampling_rate=4)
    df = pd.concat([df, signals], axis=1)
    
    # Save the DataFrame as a csv
    df.to_csv(f"{output_path}.csv", index=False)

sampling frequency:4.0


OSError: Cannot save file into a non-existent directory: '/Users/dhwanishah/Desktop/MS/VR-Dhwani/subject-csvssub-1'