### Setup

In [35]:
import pandas as pd
import json

import numpy as np

### Load Data

---

In [7]:
# Load the category definitions (categories.json) from data/crowdsourced.
# JSON text is UTF-8 by specification, so the encoding is set explicitly
# instead of relying on the platform's locale default.
with open('../data/crowdsourced/categories.json', encoding='utf-8') as f:
    categories = json.load(f)

# Load the crowdsourced annotations (labeled.csv) from data/crowdsourced.
labeled = pd.read_csv('../data/crowdsourced/labeled.csv')

### Clean Data

---

Let's start with some basic EDA:

In [99]:
# Basic sanity checks on the crowdsourced annotations in `labeled`.
# Each check either asserts an invariant (✅), reports a statistic (🔎)
# or flags a potential problem (❗️).

# Show the title, description and reward of the task (read from the first record)
print(f"Task title: {labeled['Title'][0]}")
print(f"Task description: {labeled['Description'][0]}")
print(f"Task reward: {labeled['Reward'][0]}")
print("-"*50)

# Make sure that for all records, AssignmentStatus is Approved
statuses = labeled['AssignmentStatus'].unique()
assert len(statuses) == 1 and statuses[0] == 'Approved', 'AssignmentStatus is not Approved'
print('✅ All records have AssignmentStatus Approved')

# Confirm that all pages are assigned with at most 3 assignments
max_assignments = np.unique(labeled['MaxAssignments'])
assert len(max_assignments) == 1 and max_assignments[0] == 3, 'MaxAssignments is not 3'
print("✅ This checks with the max assignments allowed.")

# Number of records per unique page: count rows per page uid, then count
# how many pages share each row count
labels_per_page = labeled['Input.uid'].value_counts()
number_of_labels, count = np.unique(labels_per_page, return_counts=True)
for numlabels, c in zip(number_of_labels, count):
    print(f'🔎 There are {c} websites each annotated by {numlabels} labelers')

# Decode every Answer.taskAnswers entry once, collecting both the unique
# string responses and the number of entries whose list holds more than one item
answers = set()
total = 0
for answer in labeled['Answer.taskAnswers']:
    parsed_answer = json.loads(answer)
    # Only string values are responses; any non-string value is skipped
    answers.update(v for v in parsed_answer[0].values() if isinstance(v, str))
    if len(parsed_answer) > 1:
        total += 1
print(f'🔎 There are {len(answers)} unique responses: {answers}')

# Average number of labels per user
avg_user_labels = labeled['WorkerId'].value_counts().mean()
print(f'🔎 On average each labeler annotated {avg_user_labels} pages')

# Check missing values for Input.url, Input.screenshot, Input.title, Input.description, report in percentage
for col in ['Input.url', 'Input.screenshot', 'Input.title', 'Input.description']:
    miss_vals = labeled[col].isna().sum() / len(labeled) * 100
    if miss_vals > 0:
        print(f'❗️ {col} has {miss_vals:.2f}% missing values')

# Report whether any taskAnswers list holds more than one element
# (the code above only ever reads element 0)
if total > 0:
    print(f'❗️ There are {total} records with taskAnswers list length > 1')
else:
    print('✅ All records has taskAnswers list length = 1')

Task title: Select all categories that are relevant for the website (English websites)
Task description: Given a screenshot, title, and description of a website, select all the relevant categories.
Task reward: $0.10
--------------------------------------------------
✅ All records have AssignmentStatus Approved
✅ This checks with the max assignments allowed.
🔎 There are 840 websites each annotated by 3 labelers
🔎 There are 3 unique responses: {'NO', 'UNSURE', 'YES'}
🔎 On average each labeler annotated 60.0 pages
❗️ Input.title has 2.86% missing values
❗️ Input.description has 46.19% missing values
✅ All records has taskAnswers list length = 1


Next, let's one-hot encode the column `Answer.taskAnswers` based on the dictionary that each row contains:

In [None]:
# Create idx2cat and cat2idx mappings (category index <-> category name)
idx2cat = {idx: info['name'] for idx, info in categories.items()}
cat2idx = {name: idx for idx, name in idx2cat.items()}


def eval_answer(x):
    """Return True when a response is affirmative.

    The responses observed in the data are upper-case ('YES', 'NO', 'UNSURE'),
    so the comparison is case-insensitive; the previous exact match against
    'Yes' never succeeded. Non-string values are treated as not affirmative.
    """
    return isinstance(x, str) and x.upper() == 'YES'


def _selected_categories(raw_answer):
    """Decode one Answer.taskAnswers JSON string into a list of category names.

    Only the first element of the decoded list is read. Keys whose response
    is affirmative are translated through idx2cat.
    """
    responses = json.loads(raw_answer)[0]
    # NOTE(review): assumes the response keys are the category indices used in
    # categories.json — TODO confirm. Unknown keys are kept as-is rather than
    # raising KeyError.
    return [idx2cat.get(key, key) for key, value in responses.items() if eval_answer(value)]


# Create a new column holding the names of the categories each labeler selected
labeled['categories'] = labeled['Answer.taskAnswers'].apply(_selected_categories)