In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Output files written further down in this notebook. Removing any leftovers
# first means a fresh run cannot silently pick up results from an older run.
files_to_delete = [
    "verbatim_questions_only.csv",
    "verbatim_A.csv",
    "verbatim_B.csv",
    "verbatim_C.csv",
    "verbatim_D.csv",
    "value_A.csv",
    "value_B.csv",
    "value_C.csv",
    "value_D.csv",
    "rrnl_all_tidy_cleaned_noUndefined.csv",
]

for file in files_to_delete:
    # Report and move on when there is nothing to remove.
    if not os.path.exists(file):
        print(f"Not found: {file}")
        continue
    os.remove(file)
    print(f"Deleted: {file}")

In [None]:
import pandas as pd

# Display settings: show every column of a frame, and at most 50 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50


In [None]:
# Load the tidy export and record how many rows it has before any filtering.
df = pd.read_csv("rrnl_all_tidy(6).csv")
original_rows = df.shape[0]

print("Number of rows:", original_rows)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# List the column names before any columns are dropped.
df.columns

In [None]:
# Columns removed from the frame before the analysis below.
columns_to_drop = [
    "Reject reason",
    "reject time",
    "instruction",
    "description",
    "trialNumber",
]

# errors="ignore" skips any listed name that is not present in the frame
# instead of raising.
df = df.drop(columns_to_drop, axis="columns", errors="ignore")

In [None]:
# List the column names again to confirm what remains after the drop.
df.columns

In [None]:
# Remove trials whose correctAnswer is the text "undefined" (compared after
# trimming whitespace and lower-casing).
original_rows = len(df)

# Cast to str before using the .str accessor so the filter also works when the
# column is not string-typed, matching the normalisation used when answers are
# scored later. Missing values become "nan", never match "undefined", and are
# therefore kept, as before.
is_undefined = (
    df["correctAnswer"].astype(str).str.strip().str.lower().eq("undefined")
)
df = df[~is_undefined]

new_rows = len(df)

print("Rows removed:", original_rows - new_rows)
print("Remaining rows:", new_rows)


In [None]:
# Save the frame with the "undefined" rows removed; index=False keeps the row
# index out of the file.
df.to_csv("rrnl_all_tidy_cleaned_noUndefined.csv", index=False)

### Verbatim Questions Code

In [None]:
# Exact prompt text of the verbatim questions. Rows are selected by matching
# the responsePrompt column against these strings, so they must match the data
# character for character.
verbatim_questions = [
    "Is fasting blood glucose normal?",
    "Is two hours postprandial blood glucose normal?",
    "Is systolic normal?",
    "Is diastolic normal?"
]


In [None]:
# Keep only the rows whose prompt is one of the verbatim questions. The copy
# lets later cells add columns without touching df.
is_verbatim = df["responsePrompt"].isin(verbatim_questions)
verbatim_df = df.loc[is_verbatim].copy()

verbatim_df.head()


In [None]:
# Save the verbatim-question rows; index=False keeps the row index out of the file.
verbatim_df.to_csv("verbatim_questions_only.csv", index=False)


In [None]:
# Score each response: 1 when the answer equals the correct answer after
# casting to text, trimming whitespace and lower-casing, otherwise 0.
verbatim_given = verbatim_df["answer"].astype(str).str.strip().str.lower()
verbatim_expected = verbatim_df["correctAnswer"].astype(str).str.strip().str.lower()

verbatim_df["correct"] = verbatim_given.eq(verbatim_expected).astype(int)

In [None]:
# The graph group is the first character of trialId.
verbatim_df["graphGroup"] = verbatim_df["trialId"].str.get(0)

In [None]:
# Per-group statistics of the 0/1 "correct" column for the verbatim questions.
verbatim_correct_by_group = verbatim_df.groupby("graphGroup")["correct"]

# Mean of the 0/1 column, expressed as a percentage rounded to 2 decimals.
accuracy_by_graph = (verbatim_correct_by_group.mean() * 100).round(2)

print(accuracy_by_graph)

# Standard deviation (on the 0-1 scale) and number of responses per group.
accuracy_std = verbatim_correct_by_group.std()
group_counts = verbatim_correct_by_group.count()

# Standard error of the mean, scaled to percent like accuracy_by_graph.
accuracy_se = accuracy_std / np.sqrt(group_counts) * 100

In [None]:
# Plot per-group verbatim accuracy (%) with standard-error bars.
#
# The error bars use accuracy_se, which is already scaled to percent and so
# shares the y-axis units. accuracy_std is the standard deviation of the 0/1
# "correct" column on a 0-1 scale, so using it as yerr mixed units with the
# percent axis.
#
# Tick labels are built from the grouped index so each label always lines up
# with its own value, whichever groups are present in the data.
verbatim_group_labels = [f"V-{name}" for name in accuracy_by_graph.index]

plt.figure(figsize=(10, 6))
plt.errorbar(
    verbatim_group_labels,
    accuracy_by_graph.values,
    yerr=accuracy_se.values,
    fmt='o',
    linestyle='--',
    linewidth=2,
    markersize=8,
    capsize=5,
    label='Participants ages 18-40'
)
plt.xlabel('Graph Group')
plt.ylabel('Accuracy rate (%)')
plt.title('Verbatim comprehension task')
plt.grid(True, alpha=0.3)
# NOTE(review): the y-range is fixed, so any group below 95% is not visible.
plt.ylim([95, 100])
plt.legend()
plt.show()


In [None]:
# One independent copy of the verbatim rows per graph group.
A_df, B_df, C_df, D_df = (
    verbatim_df.loc[verbatim_df["graphGroup"].eq(letter)].copy()
    for letter in ("A", "B", "C", "D")
)


In [None]:
# Write each per-group verbatim frame to its own CSV, without the row index.
verbatim_exports = {
    "verbatim_A.csv": A_df,
    "verbatim_B.csv": B_df,
    "verbatim_C.csv": C_df,
    "verbatim_D.csv": D_df,
}

for export_path, export_frame in verbatim_exports.items():
    export_frame.to_csv(export_path, index=False)

### Value Questions Code

In [None]:
# Exact prompt text of the value questions. Rows are selected by matching the
# responsePrompt column against these strings, so they must match the data
# character for character.
value_questions = [
    "What is the fasting blood glucose value?",
    "What is the two hours postprandial blood glucose value?",
    "What is the systolic value?",
    "What is the diastolic value?"
]

In [None]:
# Keep only the rows whose prompt is one of the value questions. The copy lets
# later cells add columns without touching df.
is_value_question = df["responsePrompt"].isin(value_questions)
value_df = df.loc[is_value_question].copy()

value_df.head()

In [None]:
# Score each response: 1 when the answer equals the correct answer after
# casting to text, trimming whitespace and lower-casing, otherwise 0.
# NOTE(review): the comparison is textual, so numerically equal values written
# differently (e.g. "120" vs "120.0") are scored as incorrect -- confirm this
# is intended for the value questions.
value_given = value_df["answer"].astype(str).str.strip().str.lower()
value_expected = value_df["correctAnswer"].astype(str).str.strip().str.lower()

value_df["correct"] = value_given.eq(value_expected).astype(int)

In [None]:
# The graph group is the first character of trialId.
value_df["graphGroup"] = value_df["trialId"].str.get(0)

In [None]:
# Write the value-question rows of each graph group to value_<group>.csv.
# A file is written for every listed group, even if it has no rows.
for group in ("A", "B", "C", "D"):
    in_group = value_df["graphGroup"].eq(group)
    temp_df = value_df.loc[in_group]
    temp_df.to_csv(f"value_{group}.csv", index=False)

In [None]:
# Per-group statistics of the 0/1 "correct" column for the value questions.
value_correct_by_group = value_df.groupby("graphGroup")["correct"]

# Mean of the 0/1 column, expressed as a percentage rounded to 2 decimals.
value_accuracy_by_graph = (
    value_correct_by_group
    .mean()
    .mul(100)
    .round(2)
)

print(value_accuracy_by_graph)

# Standard deviation (on the 0-1 scale) and number of responses per group.
value_accuracy_std = value_correct_by_group.std()

value_group_counts = value_correct_by_group.count()

# Standard error of the mean, scaled to percent. It must be divided by the
# value-question counts: the previous code used group_counts, which holds the
# verbatim-question counts from an earlier cell, giving a wrong SE whenever
# the two tasks have different numbers of responses per group.
value_accuracy_se = (value_accuracy_std / np.sqrt(value_group_counts)) * 100

In [None]:
# Plot per-group value-question accuracy (%) with standard-error bars.
#
# The error bars use value_accuracy_se, which is already scaled to percent and
# so shares the y-axis units. value_accuracy_std is the standard deviation of
# the 0/1 "correct" column on a 0-1 scale, so using it as yerr mixed units
# with the percent axis.
#
# Tick labels are built from the grouped index so each label always lines up
# with its own value, whichever groups are present in the data.
value_group_labels = [f"V-{name}" for name in value_accuracy_by_graph.index]

plt.figure(figsize=(10, 6))
plt.errorbar(
    value_group_labels,
    value_accuracy_by_graph.values,
    yerr=value_accuracy_se.values,
    fmt='o',
    linestyle='--',
    linewidth=2,
    markersize=8,
    capsize=5,
    label='Participants ages 18-40'
)
plt.xlabel('Graph Group')
plt.ylabel('Accuracy rate (%)')
plt.title('Value interpretation task')
plt.grid(True, alpha=0.3)
# NOTE(review): the y-range is fixed, so any group below 95% is not visible.
plt.ylim([95, 100])
plt.legend()
plt.show()
