In [2]:
import json
import pandas as pd
import numpy as np
from pprint import pprint
import random
from collections import Counter
import os

In [3]:
def get_arguments(fname):
    """Load the JSON document stored at ``fname`` and return the parsed object.

    The file is always decoded as UTF-8 (the encoding JSON interchange uses)
    instead of the platform default, so non-ASCII argument text such as "°C"
    is read identically on every operating system.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The parsed JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(fname, "r", encoding="utf-8") as f:
        return json.load(f)

In [4]:
def extract_id_from_imgname(s):
    """Return the last run of digits in an image file name as an integer.

    Everything after the final "." is treated as the extension and ignored;
    the digits are then searched for from the end of the remaining stem.
    For example ``"image-control-3-txt.svg"`` yields ``3``.

    Raises:
        AssertionError: If ``s`` is empty or the stem contains no digit.
    """
    assert s, "Got empty imgname!"
    stem = s.rsplit(".", maxsplit=1)[0]

    # Move `end` left until it sits just past the last digit of the stem.
    end = len(stem)
    while end > 0 and not stem[end - 1].isdigit():
        end -= 1
    assert end > 0, f"The imgname {s} had no numbers!"

    # Extend `start` left over the contiguous digits that end at `end`.
    start = end
    while start > 0 and stem[start - 1].isdigit():
        start -= 1

    return int(stem[start:end])

def get_id_url_mapping(fname):
    """Read a file with one image URL per line and return a dict of image ID -> URL.

    Blank lines are skipped. The ID is taken from the part of the URL after
    the last "/" via ``extract_id_from_imgname``. If two URLs produce the
    same ID, the later line wins.
    """
    with open(fname, "r") as infile:
        stripped_lines = (line.strip() for line in infile)
        return {
            extract_id_from_imgname(url.rsplit("/", maxsplit=1)[-1]): url
            for url in stripped_lines
            if url
        }

In [5]:
def arguments_to_df(arguments, id_to_url):
    """Flatten the argument-pair dict into a frame with one row per pair.

    Args:
        arguments: Dict whose keys are ``"dim1,dim2"`` strings and whose values
            are lists of ``{"arg1": {...}, "arg2": {...}}`` dicts; each inner
            dict must carry an ``"id"`` entry.
        id_to_url: Dict mapping an argument ID to its image URL.

    Returns:
        DataFrame with the columns ``dim1``, ``dim2``, ``url1``, ``url2``.
        The index comes from the exploded frame, so rows from the same key
        share an index label.
    """
    def _url_for(arg):
        # Look up the image URL for one argument dict.
        return id_to_url[arg["id"]]

    exploded = pd.DataFrame(arguments.items(), columns=["dims", "args"]).explode("args")

    dim_parts = exploded["dims"].str.split(",", expand=True)
    exploded[["dim1", "dim2"]] = dim_parts

    arg_parts = exploded["args"].apply(pd.Series)
    exploded[["arg1", "arg2"]] = arg_parts

    exploded["url1"] = exploded["arg1"].map(_url_for)
    exploded["url2"] = exploded["arg2"].map(_url_for)

    return exploded[["dim1", "dim2", "url1", "url2"]]

In [6]:
def add_control_rows(df, controls_dict, repeats):
    """
    Insert control rows to avoid unreliable annotators.
    Will always be paired with an argument from the baseline.

    Each control in ``controls_dict`` is added ``repeats`` times. Every control
    row reuses ``dim1``/``url1`` of a randomly drawn baseline row and overwrites
    ``dim2``/``url2`` with ``"control<key>"`` and the control URL. Every control
    appears once before any repetition, and baseline rows are only reused once
    all unique ones have been drawn.

    Args:
        df: Frame with the columns ``dim1``, ``dim2``, ``url1``, ``url2`` that
            has not been column-shuffled yet (baseline only ever in ``dim1``).
        controls_dict: Dict mapping a control key to the control image URL.
        repeats: How many times each control is inserted.

    Returns:
        A new DataFrame: a copy of ``df`` followed by the control rows.
        ``df`` itself is not modified.

    Raises:
        AssertionError: If any ``dim2`` value equals ``"baseline"``.
        ValueError: If controls are requested but ``df`` has no baseline rows.
    """
    # `"x" in series` tests the index labels, not the values, so compare the values explicitly.
    assert not (df["dim2"] == "baseline").any(), "Currently, this function only takes the dataframe that is not column-shuffled!"
    df = df.copy()

    n_samples = len(controls_dict) * repeats
    if n_samples <= 0:
        # Nothing to insert (no controls or no repeats).
        return df

    baseline_rows = df[df["dim1"] == "baseline"].drop_duplicates(subset=["dim1", "url1"])
    if baseline_rows.empty:
        raise ValueError("No rows with dim1 == 'baseline' to pair the controls with!")

    if n_samples <= len(baseline_rows):
        # Fewer (or as many) control rows than unique baselines: draw without repetition.
        samples = baseline_rows.sample(n_samples, replace=False)
    else:
        # Make sure all unique baselines are seen before duplicates.
        first_pass = baseline_rows.sample(len(baseline_rows), replace=False)
        extra = baseline_rows.sample(n_samples - len(baseline_rows), replace=True)
        samples = pd.concat([first_pass, extra])

    # Overwrite dim2 with controls (columns are shuffled later, in another function).
    labelled = [(f"control{key}", url) for key, url in controls_dict.items()]
    first_round = list(labelled)  # every control once before any repetition
    later_rounds = labelled * (repeats - 1)
    random.shuffle(first_round)  # in place
    random.shuffle(later_rounds)  # in place
    dim2, url2 = zip(*(first_round + later_rounds))

    samples = samples.copy()
    samples["dim2"] = list(dim2)
    samples["url2"] = list(url2)

    return pd.concat([df, samples])

In [7]:
def shuffle_columns(df):
    """Remove bias by shuffling order of arguments. E.g., avoid one dimension always being displayed first.

    Each row independently has a 50% chance of having its ``dim1``/``dim2`` and
    ``url1``/``url2`` values exchanged. One ``random.random()`` draw is made per
    row, in row order.

    Args:
        df: Frame with the columns ``dim1``, ``dim2``, ``url1``, ``url2``.

    Returns:
        A new DataFrame with the same rows and index plus a boolean ``swapped``
        column that records which rows were exchanged. ``df`` is not modified.
    """
    df = df.copy()
    swap_flags = [random.random() < 0.5 for _ in range(len(df))]  # 50% chance per row

    # Assign whole columns instead of chained `df[col].iloc[i] = ...` writes:
    # chained assignment is unreliable and does nothing under pandas Copy-on-Write.
    for first, second in (("dim1", "dim2"), ("url1", "url2")):
        first_vals = df[first].tolist()
        second_vals = df[second].tolist()
        df[first] = [b if flag else a for a, b, flag in zip(first_vals, second_vals, swap_flags)]
        df[second] = [a if flag else b for a, b, flag in zip(first_vals, second_vals, swap_flags)]

    df["swapped"] = swap_flags
    return df

In [8]:
def create_control_url_file(control_folder = "./control_samples/"):
    """Write one control-image URL per entry of ``control_folder`` to ``control_image_urls.txt``.

    The URLs are numbered 1..N, where N is the number of directory entries
    returned by ``os.listdir`` (every entry counts, whatever its name).
    The output file is created in the current working directory, lines are
    joined with newlines, and no trailing newline is written.
    """
    n_entries = len(os.listdir(control_folder))

    urls = []
    for i in range(1, n_entries + 1):
        urls.append(f"https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image-control-{i}-txt.svg")

    with open("control_image_urls.txt", "w") as out_file:
        out_file.write("\n".join(urls))
# create_control_url_file()

In [9]:
# Build the annotation batch step by step:
#   1. load the argument pairs and the image-ID -> URL table,
#   2. turn the pairs into a (dim1, dim2, url1, url2) frame,
#   3. append control rows (each control inserted 2 times),
#   4. randomly swap the left/right columns per row.
# NOTE(review): steps 3 and 4 draw random numbers and no seed is set in the
# visible cells, so the result presumably differs between runs — confirm.
arguments = get_arguments("total_5_arguments_for_mturk.json")
id_to_url = get_id_url_mapping("argument_image_urls.txt")
pairs = arguments_to_df(arguments, id_to_url)
id_to_url_control = get_id_url_mapping("control_image_urls.txt")
pairs_with_control = add_control_rows(pairs, id_to_url_control, 2)
col_shuffled_pairs_with_control = shuffle_columns(pairs_with_control)

# Re-run sampling below, ensuring we had at least 10% controls in the first 30 samples, 
# which was used for our final test batch
# num_control_in_top_30 = 0
# while num_control_in_top_30 < 3:
#     final_pairs = col_shuffled_pairs_with_control.sample(frac=1)
#     control_in_top_30_1 = np.array([("control" in dim) for dim in final_pairs["dim1"][:30]])
#     control_in_top_30_2 = np.array([("control" in dim) for dim in final_pairs["dim2"][:30]])
#     num_control_in_top_30 = sum(control_in_top_30_1)+sum(control_in_top_30_2)
# control_locations_in_top_30 = np.where(control_in_top_30_1 | control_in_top_30_2)[0]+1
# print(num_control_in_top_30, f"control samples in top 30 (samples {control_locations_in_top_30}):")
# final_pairs.head(30)[control_in_top_30_1 | control_in_top_30_2]

In [10]:
# print(Counter(final_pairs["dim1"]))
# print()
# print(Counter(final_pairs["dim2"]))

In [11]:
# save = True

# if save:
#     final_pairs.to_csv(f"mturk_batches/full_batch_with_swapped_column.csv", index = False)
#     final_pairs.drop(columns=["swapped"]).to_csv(f"mturk_batches/full_batch.csv", index = False)

# final_pairs.drop(columns=["swapped"]).head(30)

In [12]:
# Load the argument file again under the name `matches` (same file that
# get_arguments reads in the batch-building cell) and print how many
# top-level entries it has.
import json
with open('total_5_arguments_for_mturk.json') as matchfile:
    matches = json.load(matchfile)
print(len(matches))

def get_id_to_text(matches):
    """Build both lookup directions between argument IDs and argument texts.

    Args:
        matches: Dict mapping a key to a list of pairs; each pair is a dict
            with ``"arg1"`` and ``"arg2"`` entries, each holding ``"id"`` and
            ``"text"``.

    Returns:
        Tuple ``(id_to_text, text_to_id)``. When an ID or a text occurs more
        than once, the last occurrence wins.
    """
    text_by_id, id_by_text = {}, {}
    for pair_list in matches.values():
        for pair in pair_list:
            for side in ("arg1", "arg2"):
                argument = pair[side]
                arg_id, arg_text = argument['id'], argument['text']
                text_by_id[arg_id] = arg_text
                id_by_text[arg_text] = arg_id
    return text_by_id, id_by_text
# Module-level lookup tables (argument ID <-> argument text) read by later cells.
id_to_text,text_to_id = get_id_to_text(matches)

36


In [25]:
# Load the collected responses, print a preview of the full table, then keep
# only the columns listed in `cols` and display the reduced frame.
df = pd.read_csv("responses/final_data.csv")
print(df.head())
cols = ['Title', 'Description', 'Keywords', 'AssignmentId', 'WorkerId', 'SubmitTime', 'WorkTimeInSeconds', 'Input.url1', 'Input.url2', 'Answer.equal.label']
df = df[cols]
df.head()

                            HITId                       HITTypeId  \
0  36818Z1KV3D2ZPTIDDLD36JFDXOA39  3WB9XW3MQVGGET76CPEKB3IGGMIKCL   
1  36818Z1KV3D2ZPTIDDLD36JFDXOA39  3WB9XW3MQVGGET76CPEKB3IGGMIKCL   
2  36818Z1KV3D2ZPTIDDLD36JFDXOA39  3WB9XW3MQVGGET76CPEKB3IGGMIKCL   
3  36818Z1KV3D2ZPTIDDLD36JFDXOA39  3WB9XW3MQVGGET76CPEKB3IGGMIKCL   
4  36818Z1KV3D2ZPTIDDLD36JFDXOA39  3WB9XW3MQVGGET76CPEKB3IGGMIKCL   

                                 Title  \
0  Select the most convincing argument   
1  Select the most convincing argument   
2  Select the most convincing argument   
3  Select the most convincing argument   
4  Select the most convincing argument   

                                         Description              Keywords  \
0  Read two texts and select which one provided t...  text, classification   
1  Read two texts and select which one provided t...  text, classification   
2  Read two texts and select which one provided t...  text, classification   
3  Read two texts an

Unnamed: 0,Title,Description,Keywords,AssignmentId,WorkerId,SubmitTime,WorkTimeInSeconds,Input.url1,Input.url2,Answer.equal.label
0,Select the most convincing argument,Read two texts and select which one provided t...,"text, classification",308XBLVESI42OLH6KJR75Z1LU0LRB2,A2Q1YS118AO2BP,Fri Dec 01 03:54:52 PST 2023,424,https://tools.danskspeedcubingforening.dk/dont...,https://tools.danskspeedcubingforening.dk/dont...,Argument 1
1,Select the most convincing argument,Read two texts and select which one provided t...,"text, classification",30OG32W0SUBFGQ1Y13CGXN2WW0ANEB,A1F1BIPJR11LSR,Fri Dec 01 03:57:32 PST 2023,234,https://tools.danskspeedcubingforening.dk/dont...,https://tools.danskspeedcubingforening.dk/dont...,Argument 1
2,Select the most convincing argument,Read two texts and select which one provided t...,"text, classification",384PI804XS1D5DE91SUNBSAYPYK0SQ,A2HM35CWB7IIFM,Fri Dec 01 06:35:00 PST 2023,23,https://tools.danskspeedcubingforening.dk/dont...,https://tools.danskspeedcubingforening.dk/dont...,Argument 1
3,Select the most convincing argument,Read two texts and select which one provided t...,"text, classification",3EICBYG644W402QZQEETESLC24UJCC,A3HAPMZV7CYDY8,Fri Dec 01 04:28:35 PST 2023,490,https://tools.danskspeedcubingforening.dk/dont...,https://tools.danskspeedcubingforening.dk/dont...,Argument 2
4,Select the most convincing argument,Read two texts and select which one provided t...,"text, classification",3FTOP5WARFOK6AVJEYL9JRJBTFBJ0X,A3OZ8KF0HWSVWK,Fri Dec 01 08:59:46 PST 2023,320,https://tools.danskspeedcubingforening.dk/dont...,https://tools.danskspeedcubingforening.dk/dont...,Argument 1


In [26]:
# Re-run the batch-building steps and additionally create the reverse
# URL -> ID dictionaries that the mapping cells below rely on.
arguments = get_arguments("total_5_arguments_for_mturk.json")
id_to_url = get_id_url_mapping("argument_image_urls.txt")
# Inverted lookup; assumes every URL is unique, otherwise entries collapse — TODO confirm
url_to_id = {url: id for id, url in id_to_url.items()}
pairs = arguments_to_df(arguments, id_to_url)
id_to_url_control = get_id_url_mapping("control_image_urls.txt")
# Same inversion for the control images
url_to_id_control = {url: id for id, url in id_to_url_control.items()}
pairs_with_control = add_control_rows(pairs, id_to_url_control, 2)
col_shuffled_pairs_with_control = shuffle_columns(pairs_with_control)

# Dump the lookup tables for visual inspection; id_to_text comes from the
# cell that defines get_id_to_text.
print(id_to_text)
print(id_to_url)
print(id_to_url_control)
# Last expression of the cell: show the first two baseline/support pairs
arguments['baseline,support'][:2]

{0: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image0.svg', 1: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image1.svg', 10: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image10.svg', 100: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image100.svg', 101: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image101.svg', 102: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image102.svg', 103: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image103.svg', 104: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image104.svg', 105: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image105.svg', 106: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image106.svg', 107: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image107.svg', 108: 'https://tools.danskspeedcubingforening.dk/dontlook/svg_img/image108.svg', 109: 'https://tools.danskspeedcubingforening.dk/d

[{'arg1': {'text': "I understand that you may be skeptical about climate change, but I implore you to consider the overwhelming scientific evidence that supports its reality. Climate change is not a political agenda or a conspiracy, but a natural phenomenon that is backed by decades of research and data from multiple lines of evidence.\n\nFirstly, let's look at the basics. The Earth's climate has always changed, but the current warming trend is happening at an unprecedented rate. The average global temperature has risen by about 1°C since the late 1800s, and the past four decades have been the warmest on record. This rapid warming is largely driven by human activities that release greenhouse gases, such as carbon dioxide and methane, into the atmosphere. These gases trap heat and cause the Earth's temperature to rise.\n\nSecondly, the effects of climate change are already being felt. Glaciers are melting at an alarming rate, sea levels are rising, and extreme weather events like heatwa

In [27]:
# Build the argument lookup table (argumentId, argumentText, argumentType) for
# every distinct image URL found in the responses, and save it as CSV and JSON.
# Relies on url_to_id, url_to_id_control and id_to_text from earlier cells.
# NOTE(review): full_batch is loaded but not used again in this cell, and the
# bare `.head()` calls below are not the cell's last expression, so they
# presumably display nothing — confirm before removing.
full_batch = pd.read_csv("mturk_batches/full_batch.csv")
full_batch.head()

final_df = pd.read_csv("responses/final_data.csv")

final_df.head()

final_df_subset = final_df[["Input.dim1","Input.dim2","Input.url1","Input.url2"]]
final_df_subset.head()

# Flatten the response rows into two parallel lists: for every row the two
# dims go into all_dims and the two URLs into all_urls, so that all_dims[k]
# is the dimension label belonging to all_urls[k].
all_urls = []
all_dims = []
for i in range(len(final_df_subset)):
    all_dims.append(final_df_subset.iloc[i,0])
    all_dims.append(final_df_subset.iloc[i,1])
    all_urls.append(final_df_subset.iloc[i,2])
    all_urls.append(final_df_subset.iloc[i,3])


# URLs that already produced a mapping row (each URL is recorded once)
has_been_seen = []

# Column-oriented accumulator; the three lists always grow together
mappings = {
    'argumentId': [],
    'argumentText': [],
    'argumentType': []

}

# NOTE(review): `controls` is never filled or read in this cell
controls = []

for i in range(len(all_urls)):
    if all_urls[i] in has_been_seen:
        continue
    else:
        has_been_seen.append(all_urls[i])

        if "control" in all_dims[i]:
            # Control IDs are offset by the number of regular arguments so
            # they do not collide with the regular argument IDs
            mappings['argumentId'].append(len(url_to_id) + url_to_id_control[all_urls[i]])
            # The control number is whatever follows "control" in the dim label
            control_number = all_dims[i].split("control")[1]
            with open(f"control_samples/control_{control_number}.txt", "r") as f:
                control_text = f.read()
            mappings['argumentText'].append(control_text)
            mappings['argumentType'].append("control")

        else:
            # Regular argument: ID from the URL table, text from id_to_text,
            # type is the dimension label seen with the first occurrence
            mappings['argumentId'].append(url_to_id[all_urls[i]])
            mappings['argumentText'].append(id_to_text[url_to_id[all_urls[i]]])
            mappings['argumentType'].append(all_dims[i])
            

mappings_df = pd.DataFrame(mappings)
mappings_df.info()

# Sort the dataframe by argumentId
mappings_df = mappings_df.sort_values(by=['argumentId'])

mappings_df.head()

# Remove the underscore separator from the text. This strips every occurrence
# of the pattern, not only one at the end of the text.
mappings_df['argumentText'] = mappings_df['argumentText'].str.replace("____________________", "")

mappings_df.to_csv("id_mappings.csv", index=False)

# Save to json (one record per row).
# NOTE: to_json returns None when it is given a path, so mappings_json is None.
mappings_json = mappings_df.to_json("id_mappings.json", orient="records")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   argumentId    167 non-null    int64 
 1   argumentText  167 non-null    object
 2   argumentType  167 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.0+ KB


In [29]:
# Replace the image URLs in the response table with numeric argument IDs and
# save the trimmed table. Relies on url_to_id and url_to_id_control from the
# cell that rebuilds the lookup tables.
df = pd.read_csv("responses/final_data.csv")
cols = ['Title', 'Description', 'Keywords', 'CreationTime', 'AssignmentId', 'WorkerId', 'AssignmentStatus', 'SubmitTime', 'WorkTimeInSeconds', 'LifetimeApprovalRate', 'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.url1', 'Input.url2', 'Answer.equal.label']
# .copy() so the assignments below act on an independent frame rather than on
# a column slice of the raw table (avoids SettingWithCopyWarning).
df = df[cols].copy()
# df.head()

# Unused in the visible cells; kept in case a later cell reads it.
controls_ = 0


def _url_to_argument_id(url):
    """Return the argument ID for an image URL.

    Control images get their control ID offset by the number of regular
    arguments, so control IDs never collide with regular argument IDs.
    """
    if "control" in url:
        return len(url_to_id) + url_to_id_control[url]
    return url_to_id[url]


# Address the URL columns by name instead of by position, so the mapping keeps
# working if the order of `cols` changes.
for url_col in ("Input.url1", "Input.url2"):
    df[url_col] = df[url_col].map(_url_to_argument_id)

# Rename the columns
df = df.rename(columns={"Input.url1": "argumentId1", "Input.url2": "argumentId2"})

# Remove these columns: CreationTime, AssignmentStatus, LifetimeApprovalRate, Last30DaysApprovalRate, Last7DaysApprovalRate
df = df.drop(columns=["CreationTime", "AssignmentStatus", "LifetimeApprovalRate", "Last30DaysApprovalRate", "Last7DaysApprovalRate"])

# Save the dataframe to a csv file
df.to_csv("final_data_mapped.csv", index=False)


In [3]:
import pandas as pd

# Sanity check: reload the saved ID mapping file and show its last rows.
ddf = pd.read_csv("id_mappings.csv")
ddf.tail()

Unnamed: 0,argumentId,argumentText,argumentType
162,163,I understand that you are skeptical towards cl...,control
163,164,The scientific consensus supports the reality ...,control
164,165,Climate change is a real and serious issue tha...,control
165,166,I understand that you may be skeptical towards...,control
166,167,The Earth has gone through natural changes bef...,control
