### Create Probability Matrix for Bradley Terry

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Directory containing the CSV files (it is scanned for every file ending in '.csv')
DATA_DIR = "data"

# Split a ';'-separated string into a list of converted items
def parse_list(x, convert_type=str):
    """Turn a semicolon-separated string into a list.

    Parameters:
    x: Raw cell value; anything that is not a string yields an empty list
    convert_type: Callable applied to every non-blank, whitespace-stripped item

    Returns:
    The list of converted items, or an empty list if any item makes
    ``convert_type`` raise ValueError (a message is printed in that case).
    """
    if isinstance(x, str):
        stripped_items = (piece.strip() for piece in x.split(';'))
        try:
            # Blank items (e.g. produced by a trailing ';') are dropped
            return [convert_type(item) for item in stripped_items if item]
        except ValueError:
            print(f"Skipping invalid value in parse_list: {x}")
    return []

# Function to read and preprocess a CSV file
def read_and_preprocess(filename):
    """Load one CSV file and parse its list-valued columns.

    The following columns are replaced by parsed Python lists:

    - ``response_times``: ';'-separated numbers, each divided by 1000
      (the conversion to seconds) -> list of floats
    - ``comparison_sequence``: ';'-separated pairs whose members are separated
      by ',' -> list of tuples of strings
    - ``comparison_responses``: parsed with ``parse_list`` as ints
    - ``selected_images``: parsed with ``parse_list`` as strings

    Parameters:
    filename: Path of the CSV file to read

    Returns:
    The DataFrame with the four columns above parsed.
    """
    df = pd.read_csv(filename)

    # Parse response times into lists of floats and convert to seconds
    def parse_response_times(x):
        if isinstance(x, str):  # Only split if the value is a string
            try:
                return [float(t) / 1000 for t in x.split(';') if t.strip()]
            except ValueError:
                print(f"Skipping invalid response times: {x}")
                return []
        return []  # Return an empty list for invalid or missing values

    # Parse "a,b;c,d" into [('a', 'b'), ('c', 'd')]
    def parse_comparison_sequence(x):
        # The type check must happen BEFORE x.split is called: a missing cell is
        # read as a float NaN, which has no .split and would raise AttributeError.
        if not isinstance(x, str):
            return []
        return [tuple(pair.split(',')) for pair in x.split(';')]

    df['response_times'] = df['response_times'].apply(parse_response_times)
    # Parse other columns
    df['comparison_sequence'] = df['comparison_sequence'].apply(parse_comparison_sequence)
    df['comparison_responses'] = df['comparison_responses'].apply(
        lambda x: parse_list(x, convert_type=int)
    )
    df['selected_images'] = df['selected_images'].apply(
        lambda x: parse_list(x)
    )

    return df

# Read and preprocess every CSV file found in DATA_DIR
csv_files = [
    os.path.join(DATA_DIR, name)
    for name in os.listdir(DATA_DIR)
    if name.endswith('.csv')
]
dfs = list(map(read_and_preprocess, csv_files))

# 2. Create comparison matrix
def get_all_image_ids(dfs):
    """Collect the unique, strictly positive image IDs seen in any comparison.

    Parameters:
    dfs: Iterable of tables whose 'comparison_sequence' entry yields, per row,
         a list of (img1, img2) pairs

    Returns:
    Sorted list of integer image IDs. IDs <= 0 (the attention-check images,
    -1 and 0) are excluded; a pair with a non-integer member is skipped as a
    whole after printing a message.
    """
    collected = set()
    for frame in dfs:
        for sequence in frame['comparison_sequence']:
            for img1, img2 in sequence:
                try:
                    # Convert both members first so a bad pair contributes nothing
                    pair_ids = (int(img1), int(img2))
                except ValueError:
                    print(f"Skipping invalid image pair: {img1}, {img2}")
                    continue
                # Only keep images that are not attention checks (-1 and 0)
                collected.update(image_id for image_id in pair_ids if image_id > 0)
    ordered = list(collected)
    ordered.sort()
    return ordered

# Collect the sorted unique image IDs and give each one a matrix row/column index
image_ids = get_all_image_ids(dfs)
num_images = len(image_ids)
id_to_index = dict(zip(image_ids, range(num_images)))

print(f"Total number of unique images (excluding attention checks): {num_images}")

def update_comparison_matrix(matrix, total_matrix, comparisons, responses, index_map=None):
    """
    Update both the wins matrix and total comparisons matrix in place.

    Parameters:
    matrix: Matrix counting wins; matrix[i][j] is incremented when image i
            beats image j
    total_matrix: Matrix counting total comparisons between pairs (updated
                  symmetrically)
    comparisons: List of image pairs compared
    responses: List of responses (0 or 1); 0 means the first image won, any
               other value is counted as a win for the second image
    index_map: Optional mapping from integer image ID to matrix index.
               Defaults to the module-level ``id_to_index``.

    Pairs involving an attention-check image (ID <= 0), a non-integer ID, or an
    ID missing from the mapping are skipped; the latter two print a message.
    Pairs and responses are matched positionally with ``zip``, so surplus items
    in the longer list are ignored.
    """
    if index_map is None:
        # Backward-compatible default: the mapping built at module level
        index_map = id_to_index

    for (img1, img2), response in zip(comparisons, responses):
        # Only the conversions and lookups can raise the handled errors, so the
        # try body is limited to them.
        try:
            img1_id, img2_id = int(img1), int(img2)
            # Skip comparisons involving attention check images
            if img1_id <= 0 or img2_id <= 0:
                continue
            idx1, idx2 = index_map[img1_id], index_map[img2_id]
        except (ValueError, KeyError):
            print(f"Skipping invalid comparison: {img1}, {img2}, {response}")
            continue

        if response == 0:  # First image won
            matrix[idx1][idx2] += 1
        else:  # Second image won
            matrix[idx2][idx1] += 1

        # Update total comparisons for this pair
        total_matrix[idx1][idx2] += 1
        total_matrix[idx2][idx1] += 1

# Initialize matrices: comparison_matrix[i][j] counts wins of image i over image j,
# total_comparisons_matrix[i][j] counts how often the pair (i, j) was compared
comparison_matrix = np.zeros((num_images, num_images), dtype=int)
total_comparisons_matrix = np.zeros((num_images, num_images), dtype=int)

# Update matrices with data from all files
for df in dfs:
    for comparisons, responses in zip(df['comparison_sequence'], df['comparison_responses']):
        update_comparison_matrix(comparison_matrix, total_comparisons_matrix, comparisons, responses)

# Create probability matrix (scaled to [0,1]): entry (i, j) is the fraction of
# i-vs-j comparisons won by image i. The division is only evaluated where a pair
# was actually compared; all other entries keep the 0.0 of the `out` array, so no
# division by zero happens and no NaN has to be replaced afterwards.
probability_matrix = np.divide(
    comparison_matrix,
    total_comparisons_matrix,
    out=np.zeros((num_images, num_images)),
    where=total_comparisons_matrix > 0,
)

# Save the matrices
os.makedirs("npy", exist_ok=True)
np.save("npy/probability_matrix.npy", probability_matrix)
# id_to_index is a dict, so np.save pickles it inside a 0-d object array; read it
# back with np.load("npy/id_to_index.npy", allow_pickle=True).item()
np.save("npy/id_to_index.npy", id_to_index)


### Survey Responses

In [9]:
import pandas as pd
import os

# Survey question texts; they serve as dictionary keys and as the printed
# heading above each group of answers in analyze_attention_checks_and_answers
q1 = "What strategy did you use to rate surprise?"
q2 = "Which types of images were the (a) most and (b) least visually complex to you?"
q3 = "Which types of image pairs took you longest to decide between? What made these comparisons particularly challenging?"
q4 = "Is there anything else you would like to tell us?"

# Directory containing the CSV files
DATA_DIR = "data"

# Function to parse semicolon-separated strings into lists
def parse_list(x, convert_type=str):
    """Parse a semicolon-separated string into a list of converted items.

    Parameters:
    x: Raw cell value; anything that is not a string yields an empty list
    convert_type: Callable applied to every non-blank, whitespace-stripped item

    Returns:
    The list of converted items, or an empty list if any item makes
    ``convert_type`` raise ValueError. The offending value is reported on
    stdout instead of being dropped silently, matching the other
    ``parse_list`` definition in this notebook.
    """
    if not isinstance(x, str):
        return []  # Return an empty list if the input is not a string
    try:
        return [convert_type(i.strip()) for i in x.split(';') if i.strip()]
    except ValueError:
        # Report the bad cell so that lost data is visible to the analyst
        print(f"Skipping invalid value in parse_list: {x}")
        return []

# Load a CSV file and parse its comparison columns
def read_and_preprocess(filename):
    """Read ``filename`` and turn the comparison columns into Python lists.

    ``comparison_sequence`` becomes a list of tuples of strings (pairs are
    separated by ';', their members by ','); ``comparison_responses`` becomes a
    list of ints via ``parse_list``. Non-string cells in ``comparison_sequence``
    become empty lists.

    Returns the modified DataFrame.
    """
    def split_pairs(raw):
        # Missing cells are not strings and yield no pairs
        if not isinstance(raw, str):
            return []
        return [tuple(chunk.split(',')) for chunk in raw.split(';')]

    def to_int_list(raw):
        return parse_list(raw, convert_type=int)

    df = pd.read_csv(filename)
    df['comparison_sequence'] = df['comparison_sequence'].apply(split_pairs)
    df['comparison_responses'] = df['comparison_responses'].apply(to_int_list)
    return df

# Function to analyze attention checks and print answers in a structured format
def analyze_attention_checks_and_answers(data_dir):
    """Print the free-text survey answers of all CSV files, grouped by question.

    Every '.csv' file in ``data_dir`` is loaded with ``read_and_preprocess``.
    For each survey question, all non-missing answers are printed under the
    question text as ``<subject_id>: "<answer>"``. Files that lack a
    question's column are skipped for that question.

    Despite its name, this function does not evaluate attention checks; it
    only collects and prints the answers.

    Parameters:
    data_dir: Directory containing the CSV files

    Returns:
    None; the result is written to stdout.
    """
    csv_files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.csv')]

    # Dictionary mapping questions to their respective column names
    question_mapping = {
        q1: 'strategy_response',
        q2: 'most_complex_response',
        q3: 'challenging_pairs_response',
        q4: 'additional_comments'
    }

    answers_by_question = {q: [] for q in question_mapping}

    for file in csv_files:
        df = read_and_preprocess(file)

        for question, column in question_mapping.items():
            if column not in df.columns:
                continue
            # Drop unanswered rows from the ID column and the answer column
            # TOGETHER. Dropping NaN from the answers alone and zipping with the
            # full ID column would attach answers to the wrong participants as
            # soon as one answer is missing.
            answered = df[['subject_id', column]].dropna(subset=[column])
            for user_id, answer in zip(answered['subject_id'], answered[column]):
                answers_by_question[question].append(f"{user_id}: \"{answer}\"")

    for question, answers in answers_by_question.items():
        print(f"\n{question}")
        for answer in answers:
            print(answer)

# Run the analysis: prints the collected survey answers for the CSV files in DATA_DIR
analyze_attention_checks_and_answers(DATA_DIR)


What strategy did you use to rate surprise?
g7rm2q6d: "busyness of photo, also if it was clear what was happening and actual clarity of photo"
03ha4ctv: "How long it took me to identify key elements of the picture and the complexity"
1hjuf7z8: "Which image was the busiest or took the most time to focus on."
f7uhv603: "how full the screen was of a particular picture"
kwvbtz50: "more objects "
gh7844q8: "quality of the image and the content"
6a4cxr8z: "I chose the items that appeared to have more detail in them"
9roqhlqw: "I considered how things would have been taken - whether things were standalone or real action"
l8nyjbct: "How common the situation was, the angle the image was taken at, the number of colours, the number of subjects and how in focus they were."
8vdqc68e: "The busier the picture."
54ad1k1z: "if they look confusing or unreal"
y1v6s626: "At first I was thinking  about the colours,  people and background.  I then thought about how complex the act was in the photo."
7rk3pb

### Attention Checks

In [10]:
# Classify participants by whether they passed or failed the attention checks
def evaluate_attention_checks(data_dir):
    """Print which participants passed and which failed the attention checks.

    Every '.csv' file in ``data_dir`` is loaded with ``read_and_preprocess``.
    A participant is marked as failed when a comparison containing the image
    ID '-1' has a response that is neither 0 nor 1; otherwise the participant
    is marked as passed. A participant who failed in any row is removed from
    the passed set.

    NOTE(review): if responses can only ever be 0 or 1, this condition cannot
    trigger and nobody can fail - confirm the intended correct answer for the
    attention-check pairs.

    Parameters:
    data_dir: Directory containing the CSV files

    Returns:
    None; counts and participant IDs are written to stdout.
    """
    attention_ids = ('0', '-1')
    valid_responses = (0, 1)
    passed_checks, failed_checks = set(), set()

    for name in os.listdir(data_dir):
        if not name.endswith('.csv'):
            continue
        df = read_and_preprocess(os.path.join(data_dir, name))

        rows = zip(df['subject_id'], df['comparison_sequence'], df['comparison_responses'])
        for subject_id, seq, responses in rows:
            # any() stops at the first offending comparison, like an early break
            failed = any(
                (img1 in attention_ids or img2 in attention_ids)
                and (img1 == '-1' or img2 == '-1')
                and resp not in valid_responses
                for (img1, img2), resp in zip(seq, responses)
            )
            target = failed_checks if failed else passed_checks
            target.add(subject_id)

    # A participant with at least one failure must not also count as passed
    passed_checks -= failed_checks

    print(f"Total participants who passed all attention checks: {len(passed_checks)}")
    print(f"Total participants who failed any attention check: {len(failed_checks)}")
    print("\nParticipants who failed attention checks:")
    print(failed_checks)
    print("\nParticipants who passed all attention checks:")
    print(passed_checks)

# Run the function: prints pass/fail counts and participant IDs for the CSV files in DATA_DIR
evaluate_attention_checks(DATA_DIR)


Total participants who passed all attention checks: 86
Total participants who failed any attention check: 0

Participants who failed attention checks:
set()

Participants who passed all attention checks:
{'9tns36ym', 'p830hfem', 'cek5a5jy', '3ndmkkpk', 'b497jvt8', '19lorugu', 'hf9qlgbk', '9zbt0azr', 'q2j4p80u', 'r3unt43e', 'q7amfmu2', '7gchfs9p', 'jfb87cmc', 'qa483pta', 'ntbbpltx', 'e3285e2u', 'jwv9wqco', 'r9hrodrb', 'd59qf4yf', 'jdltxykn', 'd5zq23fd', 'dsn31hj2', 'q739z0vq', '8pv08mtv', 'pu4xpvds', '6sz43s17', 'gfn3x9ox', '6467fby0', 'spjw6ttg', '3stqzdmp', 'gh7844q8', '7vv8pbk6', '6a4cxr8z', '4pdvvbuz', 'kwvbtz50', 'ztuoqmoh', 'gekub9k4', 'w1kmru7h', '59htwv8r', 'uxnya8ps', '6u7re3rw', 'g7rm2q6d', 'rv34nvug', '8vdqc68e', 'snlf8y6g', 'pucl833s', '09emd5pv', '69k94ea9', '7p91sh9k', 'vj26nz66', '31r7asyp', 'kj0mo3ze', 'o1rrllbh', 'l9jn6pjh', 'sy5qo572', 'xuy6r6c7', '54ad1k1z', '1hjuf7z8', 'f7uhv603', 'l8nyjbct', 'sp60fyxq', 'k0uk21j0', 'asvmd5c9', 'efnn9530', 'y1v6s626', '0nvj9y1b', 'rw