In [1]:
import pandas as pd
import numpy as np

# --- Phase 1 Script ---

# 1. Configuration and Constants
# Relative path to the input CSV (resolved against the current working directory).
file_path = "./output_data_csv/data01_direction4priors.csv"
# This is our empirically derived total fixed/systematic time delay in seconds.
# It is subtracted from each trial's total duration to obtain the reaction time.
REFINED_FIXED_TIME_S = 1.5214

# 2. Load and Sort Data
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}. Please check the path.")
    # Abort with a non-zero status. The interactive `exit()` helper is not
    # meant for programs and, under IPython/Jupyter, may only request shutdown
    # while the rest of the cell keeps running with `df` undefined.
    raise SystemExit(1)
else:
    print("Successfully loaded the dataset.")
    print(f"Original shape: {df.shape}")

# Order trials chronologically within each subject/run so that the per-group
# shift operations further down line up with the previous/next trial.
df = df.sort_values(by=['subject_id', 'run_id', 'trial_index']).reset_index(drop=True)

# 3. Feature Engineering and Calculations

# Helper function for circular distance
def circular_distance(angle1, angle2):
    """Return the signed angular difference ``angle1 - angle2`` in degrees.

    The raw difference is wrapped onto the half-open interval [-180, 180),
    so the magnitude of the result is the shortest separation between the
    two directions and the sign tells which side ``angle1`` lies on.
    Works element-wise on anything supporting ``-``, ``+`` and ``%``.
    """
    half_turn, full_turn = 180, 360
    # Shift by half a turn so the modulo maps onto [0, 360), then shift back.
    shifted = (angle1 - angle2) + half_turn
    return shifted % full_turn - half_turn

# Turn each (x, y) estimate vector into a direction angle, in degrees.
estimate_radians = np.arctan2(df['estimate_y'], df['estimate_x'])
df['estimate_angle'] = np.degrees(estimate_radians)

# Signed, wrapped difference between the reported and the shown direction.
df['error'] = circular_distance(df['estimate_angle'], df['motion_direction'])

# Lagged features are computed per subject/run so that values never leak
# across run boundaries; the first trial of every run therefore gets NaN.
run_keys = ['subject_id', 'run_id']
df['previous_estimate_angle'] = df.groupby(run_keys)['estimate_angle'].shift(1)

# Main regression feature: previous estimate relative to the current stimulus.
df['feature_dist_prev_est_curr_stim'] = circular_distance(
    df['previous_estimate_angle'],
    df['motion_direction'],
)

# Reaction time derived from trial onsets: time until the next trial starts,
# minus the fixed delay. The last trial of every run has no successor -> NaN.
df['next_trial_time'] = df.groupby(run_keys)['trial_time'].shift(-1)
df['total_trial_duration'] = df['next_trial_time'] - df['trial_time']
df['reaction_time_calc'] = df['total_trial_duration'] - REFINED_FIXED_TIME_S

# 4. Column and NaN Management

# Remove the original timing columns and the intermediates used only to
# derive 'reaction_time_calc'.
columns_to_drop = ['reaction_time', 'raw_response_time', 'next_trial_time', 'total_trial_duration']
df.drop(labels=columns_to_drop, axis="columns", inplace=True)

# Keep only rows usable for modeling: both the target ('error') and the main
# feature must be present. NaNs in any other column do not disqualify a row.
essential_columns = ['error', 'feature_dist_prev_est_curr_stim']
df_final_clean = df[df[essential_columns].notna().all(axis=1)].reset_index(drop=True)

# 5. Final Output
# Report the size and schema of the cleaned frame, then preview it.
print("\n--- Final Clean DataFrame ---")
print(f"Shape after precise cleaning: {df_final_clean.shape}")

print("\nDataFrame Info:")
df_final_clean.info()  # writes its summary directly to stdout

print("\nFirst 5 rows of the final clean DataFrame:")
print(df_final_clean.head(5))

Successfully loaded the dataset.
Original shape: (83213, 16)

--- Final Clean DataFrame ---
Shape after precise cleaning: (82825, 19)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82825 entries, 0 to 82824
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   experiment_name                  82825 non-null  object 
 1   subject_id                       82825 non-null  int64  
 2   session_id                       82825 non-null  int64  
 3   run_id                           82825 non-null  int64  
 4   experiment_id                    82825 non-null  int64  
 5   prior_std                        82825 non-null  int64  
 6   prior_mean                       82825 non-null  int64  
 7   trial_index                      82825 non-null  int64  
 8   trial_time                       82825 non-null  float64
 9   motion_direction                 82825 non-null  int6