In [1]:
from EZ.word import word_DVs, word_means
from EZ.sentence import Sentence
from EZ.fixation import Fixation

import os
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import copy
import pickle

In [2]:
def split_df_on_zero(df, column_name='fix_id'):
    """Cut a DataFrame into consecutive pieces, each starting at a row where `column_name` is 0.

    Args:
        df: DataFrame to split. Its index is reset first, so the previous
            index is kept as a regular column in every returned piece.
        column_name: Column whose zero values mark the first row of a piece.

    Returns:
        A list of DataFrames, each re-indexed from 0. Rows located before the
        first zero belong to no piece; the list is empty when the column
        contains no zero.
    """
    indexed = df.reset_index()

    # Positions of the rows that open a new piece
    starts = indexed.index[indexed[column_name] == 0].tolist()
    # The end of the frame closes the last piece
    bounds = starts + [len(indexed)]

    pieces = []
    for begin, end in zip(bounds[:-1], bounds[1:]):
        pieces.append(indexed.iloc[begin:end].reset_index(drop=True))
    return pieces

In [3]:
# Load the word-level (interest-area) report for the paragraphs.
word_info_df = pd.read_csv('/data/home/shared/onestop/OneStop_v1_20250126/lacclab_processed/ia_Paragraph.csv', engine="pyarrow")

# Keep only the rows used downstream; the three conditions are applied together.
keep_rows = (
    (word_info_df["article_id"] != 0)                 # filter out practice article
    & (word_info_df["repeated_reading_trial"] == 0)   # use only non repeated reading
    & (word_info_df["question_preview"] == 0)         # use only non question preview cases
)
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a slice of the raw data (avoids SettingWithCopyWarning).
word_info_df = word_info_df.loc[keep_rows].copy()

# Drop everything from the first tab character onwards in each label.
word_info_df['IA_LABEL'] = word_info_df['IA_LABEL'].replace('\t(.*)', '', regex=True)

In [4]:
# Build one Sentence per (paragraph, spacing version) combination, using the
# word lengths recorded for a single participant who read that combination.
combo_cols = ['unique_paragraph_id', 'text_spacing_version']
unique_combos = word_info_df[combo_cols].drop_duplicates().values.tolist()

paragraph_frames = []
for unique_paragraph_id, text_spacing_version in tqdm(unique_combos):
    same_paragraph = word_info_df.unique_paragraph_id == unique_paragraph_id
    same_spacing = word_info_df.text_spacing_version == text_spacing_version
    full_sn_df = word_info_df[same_paragraph & same_spacing]

    # Participant of the first row found for this combination
    first_sub = full_sn_df.drop_duplicates(combo_cols).participant_id.values[0]
    sn_sub = full_sn_df[full_sn_df.participant_id == first_sub]

    # Collapse that participant's rows into one row holding the list of word lengths
    word_lengths = sn_sub.groupby(combo_cols)['word_length'].apply(list)
    paragraph_frames.append(word_lengths.reset_index())

paragraphs = pd.concat(paragraph_frames, axis=0)

paragraphs_dict = {}
for i, row in paragraphs.iterrows():
    # NOTE(review): `i` is the index label kept by pd.concat; every piece was
    # re-indexed on its own, so labels repeat across rows. Confirm this is the
    # value Sentence expects as its first argument.
    paragraph_key = (row["unique_paragraph_id"], row["text_spacing_version"])
    paragraphs_dict[paragraph_key] = Sentence(i, row["word_length"])

100%|██████████| 339/339 [00:16<00:00, 20.80it/s]


In [5]:
# Folder that holds the Eyettention output CSVs
folder_path = "results/Eyettention"

# Names of all CSV files located directly in that folder
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

# Load every CSV into a DataFrame, keyed by its filename
eyettention_outputs = {
    name: pd.read_csv(os.path.join(folder_path, name))
    for name in csv_files
}

In [6]:
def process_combo(combo_df):
    """Run one (paragraph, spacing version) combination through the word-measure pipeline.

    Intended to be submitted to a worker pool, one call per combination.

    Args:
        combo_df: A ``(combo, df_list)`` pair. ``combo`` is a key of the
            module-level ``paragraphs_dict``; ``df_list`` is a list of
            DataFrames providing the columns ``sp_fix_dur``, ``fix_id`` and
            ``sp_fix_pos``, one Fixation being built per row.

    Returns:
        ``(combo, adjusted_paragraph)`` where ``adjusted_paragraph`` is a copy
        of ``paragraphs_dict[combo]`` after every fixation list has been
        passed through ``word_DVs`` and ``word_means`` has been called on it.
    """
    combo, df_list = combo_df

    # Turn every DataFrame into a list of Fixation objects (all lists are built up front)
    processed_fix_lists = [
        [Fixation(row["sp_fix_dur"], row["fix_id"], row["sp_fix_pos"]) for _, row in df.iterrows()]
        for df in df_list
    ]

    # Work on a private deep copy so the entry stored in paragraphs_dict is left untouched
    adjusted_paragraph = copy.deepcopy(paragraphs_dict[combo])

    # Each fixation list counts as one more subject for this paragraph
    for fix_list in processed_fix_lists:
        adjusted_paragraph.subj_number += 1
        adjusted_paragraph = word_DVs(adjusted_paragraph, fix_list)

    # Return value is ignored: word_means is relied on to update the paragraph in place
    word_means(adjusted_paragraph)

    return combo, adjusted_paragraph

# For every Eyettention output file (fold), process each
# (paragraph, spacing version) combination in a worker process and collect
# the resulting paragraphs under the fold's filename.
fold_dict = {}

for fold_file in sorted(eyettention_outputs.keys()):
    print(f"Started {fold_file}")
    output_df = eyettention_outputs[fold_file]

    # Group rows by combination and split each group wherever `fix_id` is 0,
    # in a single pass over the groups
    grouped_dfs = {
        key: split_df_on_zero(group)
        for key, group in output_df.groupby(['unique_paragraph_id', 'text_spacing_version'])
    }

    processed_fold = {}

    # Parallelize at the combo level with progress tracking. Leaving the
    # with-block already waits for the workers and shuts the pool down, so no
    # explicit shutdown() call is needed.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_combo, combo_df) for combo_df in grouped_dfs.items()]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing Combos"):
            # result() re-raises any exception that occurred in the worker
            combo, adjusted_paragraph = future.result()
            processed_fold[combo] = adjusted_paragraph

    fold_dict[fold_file] = processed_fold

Started full_eyettention_output_fold_0.csv


Processing Combos: 100%|██████████| 339/339 [01:57<00:00,  2.90it/s]


Started full_eyettention_output_fold_1.csv


Processing Combos: 100%|██████████| 339/339 [01:34<00:00,  3.61it/s]


In [9]:
# Identifier columns come first in the report, followed by one column per measure.
id_cols = ["unique_paragraph_id", "text_spacing_version", "IA_ID"]

# Report column name -> attribute read from `word.dv` (only GP differs from its attribute name)
dv_attr_by_col = {
    "SFD": "SFD",
    "FFD": "FFD",
    "GD": "GD",
    "TT": "TT",
    "GP": "GoPast",
    "PrF": "PrF",
    "Pr1": "Pr1",
    "Pr2": "Pr2",
    "PrS": "PrS",
    "NRegIn": "NRegIn",
    "NRegOut": "NRegOut",
    "NRegOutFull": "NRegOutFull",
    "FirstPassGD": "FirstPassGD",
    "FirstPassFFD": "FirstPassFFD",
    "FirstFixProg": "FirstFixProg",
    "NFixations": "NFixations",
}
cols = id_cols + list(dv_attr_by_col)

# Write one word-level report per fold, named after the fold's input file.
for fold_file, combos_dict in fold_dict.items():
    output_dict = {col: [] for col in cols}

    for combo, sentence in combos_dict.items():
        paragraph_id, spacing_version = combo[0], combo[1]
        for i, word in enumerate(sentence.word):
            output_dict["unique_paragraph_id"].append(paragraph_id)
            output_dict["text_spacing_version"].append(spacing_version)
            # IA_ID is the word's position in the sentence, counted from 0
            output_dict["IA_ID"].append(i)
            for col, attr in dv_attr_by_col.items():
                output_dict[col].append(getattr(word.dv, attr))

    fold_ia_report = pd.DataFrame(output_dict)

    # Round only the float64 columns, to 3 decimals
    float_cols = fold_ia_report.select_dtypes(include=['float64']).columns
    fold_ia_report = fold_ia_report.round(dict.fromkeys(float_cols, 3))

    fold_ia_report = fold_ia_report.sort_values(["unique_paragraph_id", "text_spacing_version", "IA_ID"])
    fold_ia_report.to_csv(f"/data/home/shared/Eyettention/iaReports/{fold_file}", index=False)


In [10]:
# Show the report left over from the last iteration of the export loop
# (i.e. the last fold in fold_dict); pandas truncates the rendered table.
fold_ia_report

Unnamed: 0,unique_paragraph_id,text_spacing_version,IA_ID,SFD,FFD,GD,TT,GP,PrF,Pr1,Pr2,PrS,NRegIn,NRegOut,NRegOutFull,FirstPassGD,FirstPassFFD,FirstFixProg,NFixations
2073,1_10_Adv_1,0,0,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000
2074,1_10_Adv_1,0,1,106.866,194.738,253.262,555.295,253.262,0.999,0.780,0.218,0.002,0.913,0.000,0.000,253.516,194.756,0.998,2.861
2075,1_10_Adv_1,0,2,109.217,195.523,242.708,446.648,316.558,0.962,0.737,0.199,0.064,0.149,0.115,0.291,257.809,195.781,0.936,2.146
2076,1_10_Adv_1,0,3,149.992,201.671,147.626,294.631,173.570,0.547,0.341,0.024,0.635,0.176,0.030,0.088,221.237,205.407,0.365,0.808
2077,1_10_Adv_1,0,4,129.831,202.662,170.182,319.071,213.826,0.713,0.562,0.015,0.423,0.219,0.063,0.172,210.294,204.854,0.577,1.139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33062,3_9_Ele_4,0,73,95.682,184.829,124.369,360.035,252.486,0.473,0.260,0.028,0.712,0.196,0.051,0.150,206.195,185.499,0.288,0.922
33063,3_9_Ele_4,0,74,91.689,184.585,121.599,370.364,244.046,0.474,0.291,0.010,0.699,0.196,0.077,0.255,191.488,185.307,0.301,0.953
33064,3_9_Ele_4,0,75,91.171,184.837,131.861,384.148,381.631,0.513,0.331,0.019,0.650,0.210,0.095,0.293,195.961,185.713,0.350,1.071
33065,3_9_Ele_4,0,76,103.562,184.922,146.699,360.558,873.936,0.502,0.338,0.031,0.631,0.142,0.168,0.422,204.108,186.470,0.369,0.985


# Backups