In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


In [None]:
# Directory holding the per-drive signal CSV files; passed to process_data()
# as csv_files_path. NOTE(review): the /content/drive prefix looks like a
# Google Drive mount in Colab - confirm before running elsewhere.
CSV_PATH = "/content/drive/MyDrive/csv_files"
# CSV file with the marker table (one row per drive); passed to process_data()
# as marker_data_path.
MARKER_DATA_CSV = "/content/drive/MyDrive/marker_info.csv"

In [None]:
def calculate_starting_times(data):
    """
    Turn per-period durations into per-period starting times.

    Columns 1 to 7 (by position) of ``data`` are read as the duration of
    consecutive driving periods. In the returned copy each of those columns
    holds the time at which the period starts, i.e. the running sum of the
    durations of all earlier periods.
    For example, Rest1: 15.13, City1: 16.00 -> Rest1: 0, City1: 15.13,
    Highway1: 31.13.

    input: data (marker data). It is not modified.
    returns: copy of ``data`` with starting times in the period columns;
        every other column is left as it was.
    """
    formated_data = data.copy()
    # The first period always begins at time zero.
    formated_data['Rest1'] = 0
    # Positions 1..7 are the period columns; this skips the drive name
    # in column 0 and anything after the last period (e.g. the total).
    period_cols = data.columns[1:8]
    # Walk over neighbouring pairs: a period starts when the previous one
    # started plus however long the previous one lasted.
    for previous_col, following_col in zip(period_cols[:-1], period_cols[1:]):
        previous_start = formated_data[previous_col]
        previous_duration = data[previous_col]
        formated_data[following_col] = previous_start + previous_duration
    print("Data with Starting Times in Minutes\n", formated_data)
    return formated_data

In [None]:
def get_starting_indices(processed_markers, drive, sampling_rate_hz=15.5):
    """
    Converts the period starting times of one drive from minutes to row
    indices (sample numbers) that can be used to slice the signal dataframe.

    inputs:
        processed_markers: marker dataframe with starting times in minutes,
            as returned by calculate_starting_times. Must have a "Driver"
            column; columns 1 to 7 (by position) are the period columns.
        drive: drive name (str) - i.e: Drive05
        sampling_rate_hz: sampling frequency of the signal files in Hz.
            Defaults to 15.5, the rate this notebook was written for.
    returns:
        numpy array of ints, one starting row index per period. Fractional
        sample positions are truncated towards zero.
    raises:
        IndexError: if no row of processed_markers matches ``drive``.
    """
    drive_rows = processed_markers[processed_markers["Driver"] == drive]
    if drive_rows.empty:
        # Without this guard the lookup below fails with a bare
        # "index 0 is out of bounds" that does not name the drive.
        raise IndexError(f"No marker row found for drive {drive!r}.")
    start_minutes = drive_rows.iloc[:, 1:8]
    # minutes -> samples: minutes * (samples per second) * (seconds per minute).
    start_samples = start_minutes * sampling_rate_hz * 60
    # Only the first matching row is used.
    return start_samples.astype(int).values[0]

In [None]:
def label_data(starting_indices, row_index):
    """
    Gets the row number and converts it to a stress label.

    The seven starting indices delimit the periods in this order:
    Rest1, City1, Highway1, City2, Highway2, City3, Rest2. Each period covers
    the half-open range [start, next start); the last period (Rest2) covers
    everything from its start onwards.

    inputs:
        starting_indices (list) - list of 7 ints, the first row of each period,
        row_index (int)
    returns:
        1.0 for the rest periods (relaxed),
        3.0 for the highway periods (medium),
        5.0 for the city periods (stressed),
        None if the row lies before the first period.
    """
    s = starting_indices

    # The final rest period starts AT s[6]. A strict ">" here would leave
    # exactly that one row without any label.
    relaxed = (s[0] <= row_index < s[1]) or (row_index >= s[6])

    medium = (s[2] <= row_index < s[3]) or (s[4] <= row_index < s[5])

    stressed = (s[1] <= row_index < s[2]) \
        or (s[3] <= row_index < s[4]) \
        or (s[5] <= row_index < s[6])

    if relaxed:
        return 1.0
    elif medium:
        return 3.0
    elif stressed:
        return 5.0
    # Only reachable for rows before the first starting index.
    return None


In [None]:
def process_data(marker_data_path, csv_files_path):
    """
    Labels every drive's signal CSV with a stress level and saves the results.

    For each drive listed in the marker file that also has a signal CSV in
    ``csv_files_path`` (file stem equal to the drive name, compared
    case-insensitively), the 'marker-mV' column is dropped, a 'Stress' column
    is added from the period starting times, and the frame is written to
    ``<csv_files_path>/preprocessed_data/<drive>.csv``. All labelled drives,
    with an extra 'Drive' column, are also concatenated into
    ``<csv_files_path>/preprocessed_data/all_drives.csv``.

    inputs:
        marker_data_path: path of the marker CSV (needs a "Driver" column
            followed by the period duration columns).
        csv_files_path: directory containing the per-drive signal CSVs.
    raises:
        ValueError: if none of the drives in the marker file has a matching
            CSV file in ``csv_files_path``.
    """
    csv_dir = Path(csv_files_path)
    # Create a new folder called preprocessed_data if it does not exist.
    output_dir = csv_dir.joinpath("preprocessed_data")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Index the csv files by lower-cased stem so that "Drive05.csv" and
    # "drive05.csv" both match the marker entry "Drive05". The glob is not
    # recursive, so files already written to preprocessed_data are ignored.
    csv_by_stem = {}
    for csv_file in csv_dir.glob("*.csv"):
        # Keep the first file seen if two stems differ only by case.
        csv_by_stem.setdefault(csv_file.stem.lower(), csv_file)
    # Read Marker Data
    marker_data = pd.read_csv(marker_data_path)
    # Each column represents the time the drive was spent in that state.
    # For example, Drive05 spent the first 15.13 minutes resting, then
    # 16 minutes driving in the city, then 7.74 minutes driving in Highway etc.
    processed_markers = calculate_starting_times(data=marker_data)
    all_drives = []
    for drive in processed_markers["Driver"]:
        csv_file = csv_by_stem.get(drive.lower())
        if csv_file is None:
            # No signal file for this drive; skip it.
            continue
        print("\n", f"Processing {drive}.")
        data = pd.read_csv(csv_file)
        # Marker column is not needed.
        data = data.drop(['marker-mV'], axis=1)
        # Get the starting row for each period (rest, highway and city)
        starting_indices = get_starting_indices(processed_markers, drive)
        # Label the data according to the driving times; row.name is the
        # row's index label, i.e. the sample number.
        data['Stress'] = data.apply(
            lambda row: label_data(starting_indices, row.name), axis=1
        )
        # Save data for each drive (without the 'Drive' column).
        data.to_csv(output_dir / f"{drive}.csv", index=False)
        data['Drive'] = drive
        all_drives.append(data)
    if not all_drives:
        # pd.concat would fail here with "No objects to concatenate";
        # say what actually went wrong instead.
        raise ValueError(
            f"No csv file in {csv_files_path} matches any drive listed in "
            f"{marker_data_path}."
        )
    # Join all the driver data into one big dataframe.
    all_drives_data = pd.concat(all_drives, ignore_index=True)
    # Save the dataframe.
    all_drives_data.to_csv(output_dir / "all_drives.csv", index=False)
    print(f"Data saved in the {csv_files_path}/preprocessed_data directory.")

In [None]:
# Run the whole labelling pipeline on the configured marker file and csv folder.
process_data(marker_data_path=MARKER_DATA_CSV, csv_files_path=CSV_PATH)

Data with Starting Times in Minutes
     Driver  Rest1  City1  Highway1  City2  Highway2  City3  Rest2  \
0  Drive05      0  15.13     31.13  38.87     44.93  52.49  67.45   
1  Drive06      0  15.05     29.54  36.86     43.39  51.03  63.32   
2  Drive07      0  15.04     31.27  42.23     52.06  59.70  69.85   
3  Drive08      0  15.00     27.31  34.54     44.05  51.69  65.12   
4  Drive09      0  15.66     34.87  43.34     48.54  55.60  68.81   
5  Drive10      0  15.04     30.34  39.00     44.27  51.31  63.37   
6  Drive11      0  15.02     30.83  38.26     45.41  52.37  64.09   
7  Drive12      0  15.01     28.42  35.98     42.48  50.54  62.22   
8  Drive15      0  15.00     27.54  34.78     40.77  47.59  59.71   
9  Drive16      0  15.01     31.13  38.27     43.39  50.20  64.11   

   TotalTime(min)  
0           83.23  
1           78.38  
2           84.87  
3           80.19  
4           68.82  
5           78.15  
6           79.08  
7           77.23  
8           74.70  
9  

In [None]:
import pandas as pd
import numpy as np
# Read the CSV with Rpeak row indices
rpeak_df = pd.read_csv('/content/drive/MyDrive/csv_files/HRandpeak/rpeak_Drive05.csv')

# Read the CSV with HR values
hr_df = pd.read_csv('/content/drive/MyDrive/csv_files/HRandpeak/HR_Drive05.csv')

# Read the CSV with ECG data
ecg_df = pd.read_csv('/content/drive/MyDrive/csv_files/HRandpeak/ECG_Drive05.csv')

# Work on a copy so that ecg_df keeps the untouched signal.
result_df = ecg_df.copy()

# Start from a real float NaN column. The string 'NaN' would make the column
# object-typed and pandas would not treat it as missing.
result_df['HR'] = np.nan

# Pair the i-th R-peak with the i-th HR value, leaving out the last R-peak.
# NOTE(review): presumably the HR file has one value per R-R interval, hence
# one row fewer than the Rpeak file - confirm against how HR was computed.
n_pairs = max(len(rpeak_df) - 1, 0)
if len(hr_df) < n_pairs:
    raise ValueError(
        f"Expected at least {n_pairs} HR values for {len(rpeak_df)} R-peaks, "
        f"found {len(hr_df)}."
    )
non_nan_indices = rpeak_df['Rpeak'].iloc[:n_pairs].tolist()
non_nan_hr_values = hr_df['HR'].iloc[:n_pairs].tolist()

if non_nan_hr_values:
    # Linear interpolation between consecutive R-peaks. At the peak rows the
    # result is exactly the measured HR, so no separate per-row assignment is
    # needed. Rows before the first / after the last used peak stay NaN.
    # np.interp expects the peak indices to be in increasing order.
    interpolated_hr = np.interp(
        result_df.index,
        non_nan_indices,
        non_nan_hr_values,
        left=np.nan,
        right=np.nan,
    )
    result_df['HR'] = interpolated_hr

In [None]:
#result_df['HR'] = result_df['HR'].replace('NaN', pd.NA).interpolate()

In [None]:
# Write result_df (ECG data plus the HR column) to a CSV file. The path is
# relative, so the file lands in the notebook's current working directory.
# index=False leaves the row index out of the file.
result_df.to_csv('result_ecg_data05_interpo.csv', index=False)