# Process MTurk Results

Performs a basic analysis of MTurk results for any results file.

In [None]:
import pandas as pd
import statistics
from collections import defaultdict
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

# Components of the results file name. All three start out as empty
# strings — presumably they are filled in by hand before each run.
myth_name = ''
sample_size = ''
date = ''

# Path of the MTurk results CSV, built from the components above.
file_name = f'../data/samples/{myth_name}/myth_{myth_name}_sample_{sample_size}_{date}-results.csv'

# Load the results and show the available column names.
df = pd.read_csv(file_name)
df.columns.tolist()

In [None]:
# Find the first column whose name starts with 'Input.full_text'. Fail with a
# clear message when there is none, instead of hitting an undefined (or stale,
# from a previous run) `full_text_col_name` below.
full_text_col_name = next(
    (col for col in df.columns if col.startswith('Input.full_text')),
    None,
)
if full_text_col_name is None:
    raise ValueError("No column starting with 'Input.full_text' was found in the results file.")

# Plot the length of each text against the time the worker spent on it.
df['text_len'] = df[full_text_col_name].apply(len)
df.plot(kind='scatter',x='text_len',y='WorkTimeInSeconds',color='blue')

In [None]:
# Per worker: mean completion time and number of assignments, sorted by the
# mean so that workers who may have completed tasks too fast come first.
(
    df.groupby('WorkerId')['WorkTimeInSeconds']
    .agg(['mean', 'count'])
    .sort_values(by='mean')
    .plot(subplots=True, kind='bar', title='MTurk Worker Statistics')
)

In [None]:
# The radio buttons from the MTurk form store their boolean result in a
# single column; we will combine these into their respective questions.

# HITIds of tasks to exclude; the "Drop tasks with null" cell filters on this
# set. No active code adds to it, so it stays empty.
null_task_ids = set()

def merge_radios(row, topic):
    """
    Collapse the radio-button columns of one question into a single label.

    To be used in an `apply` method (with ``axis=1``) to combine the boolean
    ``Answer.<topic>_<choice>.on`` columns into a single column.

    Args:
        row:
            One row of the results table: a pandas Series, or any mapping
            keyed by column name.
        topic:
            Question name as it appears in the column names, e.g. 'myth'.

    Return:
        'yes', 'no', 'unsure' or 'broken_links', whichever button is on
        (checked in that order). For the conditional 'myth_supports' question
        'no' is returned when no button is on.

    Raises:
        ValueError: no button is on and `topic` is not 'myth_supports'.
        KeyError: a yes/no/unsure column needed for `topic` is not in `row`.
    """
    def is_on(choice, optional=False):
        column = 'Answer.{}_{}.on'.format(topic, choice)
        # Look the column up in the row itself rather than in a global table,
        # so the function works on whatever row it is given.
        if optional and column not in row:
            return False
        value = row[column]
        # A blank CSV cell is read as NaN, which is truthy in Python; treat a
        # missing value as "button not selected".
        return (not pd.isna(value)) and bool(value)

    for choice in ('yes', 'no', 'unsure'):
        if is_on(choice):
            return choice

    # Not every form has a broken_links button, so its column is optional.
    if is_on('broken_links', optional=True):
        return 'broken_links'

    if topic == 'myth_supports':
        # Because this question is conditional, some tasks might not have
        # this field. Assume they are NO.
        return 'no'

    # The worker didn't choose any of the choices.
    raise ValueError("The chosen choice is not defined.")
    
# One merged label column per question; `topic` is forwarded to merge_radios.
df['is_myth'] = df.apply(merge_radios, axis=1, topic='myth')
df['is_myth_supports'] = df.apply(merge_radios, axis=1, topic='myth_supports')

print(f'There are {len(null_task_ids)} tasks ids with null values')

In [None]:
# Keep only the rows whose HITId was not flagged in null_task_ids
df = df.loc[~df['HITId'].isin(null_task_ids)]

In [None]:
# Workers whose assignments are excluded from the analysis
rejected_workers = ["AAXYYH9MI3PJM"]

# Keep only rows submitted by workers that are not on the rejected list
df = df.loc[~df['WorkerId'].isin(rejected_workers)]

# Remaining (rows, columns)
df.shape

In [None]:
# is_myth Answer Statistics

def check_agree_on(col, data=None, min_agree=2):
    """
    Print how many tasks have workers agreeing on each answer.

    A task is counted for an answer when at least `min_agree` of its workers
    gave that answer. When a task has enough raters, it can therefore be
    counted for more than one answer.

    Args:
        col:
            Answer column to analyse, e.g. 'is_myth'.
        data:
            DataFrame with the columns 'HITId' and `col`. Defaults to the
            notebook-level `df`.
        min_agree:
            Minimum number of identical answers within a task for it to count
            as agreement.

    Return:
        None; the three counts are printed.
    """
    if data is None:
        data = df

    # Number of answers per choice, per task.
    answer_counts = data.groupby(['HITId', col]).size().reset_index(name='count')

    # Keep only the (task, choice) pairs that reached the agreement threshold.
    agreed_choices = answer_counts.loc[answer_counts['count'] >= min_agree, col]

    num_agreed_yes = int((agreed_choices == 'yes').sum())
    num_agreed_no = int((agreed_choices == 'no').sum())
    num_agreed_unsure = int((agreed_choices == 'unsure').sum())

    print('Agreed on "yes": {}\nAgreed on "No": {}\nAgreed on "Unsure": {}'.format(num_agreed_yes, num_agreed_no, num_agreed_unsure))

In [None]:
# Print the agreement counts for the merged is_myth answers
check_agree_on(col="is_myth")

In [None]:
# Print the agreement counts for the merged is_myth_supports answers
check_agree_on(col="is_myth_supports")

## Check data validity

In [None]:
# These columns must not contain missing values.
# The MTurk answer columns are prefixed 'Answer.', and blank CSV cells are
# loaded as NaN rather than None, so missing values are detected with isna().
# NOTE(review): if the form has optional fields (e.g. free text), their
# columns will legitimately be blank and need to be excluded here — confirm.
answer_cols = [col for col in df.columns if col.startswith('Answer.')]
for col in answer_cols:
    if df[col].isna().any():
        raise ValueError('None exists in {}'.format(col))

In [None]:
# Check that every task was rated by exactly `rater_num` workers
rater_num = 0 # Declare rater number
for hit_size in df.groupby('HITId').size():
    if hit_size == rater_num:
        continue
    # Stop at the first task whose number of rows differs from rater_num
    raise ValueError("There is a task with raters not equal to {}, found {}.".format(rater_num, hit_size))
print('Every task has {} workers for each'.format(rater_num))

# Agreement Computation

## 1. Task-based agreement

In [None]:
def get_task_based_rating(df, answer_col, rater_num=5):
    """
    Task-based rating score computation

    The rows are read in consecutive slices of `rater_num` rows, one slice per
    task. The score of a task is the share of its raters that gave the most
    frequent answer, rounded to two decimals.

    Args:
        df:
            A dataFrame with columns ['WorkerId', 'HITId', answer_col] ordered by ['HITId', 'WorkerId']
        answer_col:
            Column name to compute rating scores
        rater_num:
            Number of raters for each task

    Return:
        A list of task-based rating scores, one per task, in row order

    Raises:
        ValueError: a slice does not hold exactly `rater_num` rows of a
            single HITId.
    """
    rating_scores = []
    for start in range(0, df.shape[0], rater_num):
        task_rows = df.iloc[start:start + rater_num]

        # A short trailing slice would otherwise be scored against rater_num
        # even though fewer workers rated that task.
        if len(task_rows) != rater_num or len(set(task_rows['HITId'])) != 1:
            raise ValueError('Each task must contains {} rates, wrong at {}'.format(rater_num, start))

        # Size of the largest group of identical answers
        majority_num = max(task_rows.groupby(answer_col).size())
        rating_scores.append(round(float(majority_num / rater_num), 2))

    return rating_scores

In [None]:
# Task-based agreement report for one answer column
def check_task_based_score(col):
    """
    Plot and print task-based agreement statistics for one answer column.

    Reads the notebook-level `df` and `rater_num`, scores every task with
    `get_task_based_rating`, draws a bar chart of how many tasks received
    each score, and prints the ten highest-scoring tasks, the number of tasks
    scoring at least 0.8 and the mean score.

    Args:
        col:
            Answer column to analyse, e.g. 'is_myth'.

    Return:
        None
    """
    df_task_based = df[['WorkerId', 'HITId', col]].sort_values(by=['HITId', 'WorkerId'])

    # One row per task, in the same HITId order as the scores computed below.
    # Built by selection instead of an in-place drop on a derived frame.
    df_task_based_rates = df_task_based[['HITId']].drop_duplicates()

    # Compute scores
    rating_scores = get_task_based_rating(df_task_based, answer_col=col, rater_num=rater_num)
    df_task_based_rates = df_task_based_rates.assign(rating_scores=rating_scores)
    df_task_based_rates = df_task_based_rates.sort_values(by=['rating_scores'])

    # Number of tasks per distinct score
    score_counts = df_task_based_rates.groupby('rating_scores').size().reset_index(name='count')
    ax = score_counts.plot(kind='bar', x='rating_scores', y='count', color='blue',
                           title='{}: stats of task-based agreement'.format(col))

    # Write each count just above its bar
    for i, v in enumerate(score_counts['count']):
        ax.text(i-0.05, v+5, str(v), va='center', fontsize=10, fontweight='bold')

    df_rates_desc = df_task_based_rates.sort_values(by=['rating_scores'], ascending=False)
    print(df_rates_desc.head(10))

    high_score_task_ids = df_rates_desc[df_rates_desc['rating_scores'] >= 0.8]['HITId'].values
    print("There are {} tweets with high task-based scores".format(high_score_task_ids.shape[0]))

    avg_score = statistics.mean(df_rates_desc['rating_scores'].values)
    print("Avg task-based score for {} = {}".format(col, avg_score))

In [None]:
# Task-based agreement report for is_myth
check_task_based_score(col='is_myth')

In [None]:
# Task-based agreement report for is_myth_supports
check_task_based_score(col='is_myth_supports')

## 2. Worker-based agreement

Compute a score for each worker by comparing each of their answers with the answers other workers gave for the same task. If a worker's answer matches the majority answer of the corresponding task, the worker's number of correct answers increases by one. The avg_score of each worker is the number of correct answers divided by the total number of tasks the worker has done.

In [None]:
def get_worker_based_rating(df, answer_col):
    """
    Worker-based rating score computation

    An answer counts as correct when it is the most frequent answer of its
    task (ties included) and was given by more than one worker. A missing
    answer never counts as correct.

    Args:
        df:
            A dataFrame with columns ['WorkerId', 'HITId', answer_col] ordered by ['WorkerId', 'HITId']
        answer_col:
            Column name to compute rating scores

    Return:
        A tuple ``(rating_scores, total_task_nums)``. Both lists hold one
        entry per consecutive run of equal WorkerId values, in row order:
        the share of correct answers and the number of rows in the run.
    """
    if df.empty:
        return [], []

    # Count every (task, answer) pair once instead of re-grouping the whole
    # table for each row.
    pair_counts = df.groupby(['HITId', answer_col]).size()
    majority_counts = pair_counts.groupby(level=0).max().to_dict()
    answer_counts = pair_counts.to_dict()

    rating_scores = []
    total_task_nums = []

    current_worker_id = None
    total_task_num = 0
    correct_answer_num = 0

    for worker_id, task_id, answer in zip(df['WorkerId'], df['HITId'], df[answer_col]):
        # A change of worker closes the previous worker's run.
        if total_task_num and worker_id != current_worker_id:
            rating_scores.append(correct_answer_num / total_task_num)
            total_task_nums.append(total_task_num)
            total_task_num = 0
            correct_answer_num = 0
        current_worker_id = worker_id

        # Pairs absent from the counts (e.g. a missing answer) get count 0 and
        # so can never equal a majority count above 1.
        majority_count = majority_counts.get(task_id, 0)
        answer_count = answer_counts.get((task_id, answer), 0)
        if majority_count > 1 and answer_count == majority_count:
            correct_answer_num += 1
        total_task_num += 1

    # Close the last worker's run.
    rating_scores.append(correct_answer_num / total_task_num)
    total_task_nums.append(total_task_num)

    return rating_scores, total_task_nums

In [None]:
# Worker-based agreement report for one answer column
def check_worker_based_score(col):
    """
    Plot and print worker-based agreement statistics for one answer column.

    Reads the notebook-level `df`, scores every worker with
    `get_worker_based_rating`, leaves out workers with a single task, draws a
    bar chart of the score distribution in bins of width 0.1, and prints the
    mean score, the number of workers scoring at least 0.5 ("good") and the
    number and ids of workers scoring below 0.5 ("bad").

    Args:
        col:
            Answer column to analyse, e.g. 'is_myth'.

    Return:
        None
    """
    df_worker_based = df[['WorkerId', 'HITId', col]].sort_values(by=['WorkerId', 'HITId'])

    # One row per worker, in the same WorkerId order as the scores computed
    # below. Built by selection instead of an in-place drop on a derived frame.
    df_worker_based_rates = df_worker_based[['WorkerId']].drop_duplicates()

    # Compute scores
    rating_scores, total_task_nums = get_worker_based_rating(df_worker_based, answer_col=col)
    df_worker_based_rates = df_worker_based_rates.assign(rating_scores=rating_scores)
    df_worker_based_rates = df_worker_based_rates.assign(total_task_nums=total_task_nums)

    # Filter workers that have done only one task
    df_worker_based_rates = df_worker_based_rates[df_worker_based_rates['total_task_nums'] > 1]

    # Number of workers per score bin
    score_bin_counts = pd.cut(df_worker_based_rates['rating_scores'].values,
                              bins=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                              include_lowest=True).value_counts()
    score_bin_counts.plot(kind='bar', x='rating_scores', y='count', color='blue',
                          title='{}: stats of worker-based agreement\n({} workers)'.format(col, df_worker_based_rates.shape[0]))

    df_rates_desc = df_worker_based_rates.sort_values(by=['rating_scores'], ascending=False)

    avg_score = statistics.mean(df_rates_desc['rating_scores'].values)

    print("Avg worker-based score for {} = {}".format(col, avg_score))

    good_worker_ids = df_rates_desc[df_rates_desc['rating_scores'] >= 0.5]['WorkerId'].values
    print("There are {} good workers".format(len(good_worker_ids)))

    bad_worker_ids = df_rates_desc[df_rates_desc['rating_scores'] < 0.5]['WorkerId'].values
    print("There are {} bad workers".format(len(bad_worker_ids)))
    print(bad_worker_ids)

In [None]:
# Worker-based agreement report for is_myth
check_worker_based_score("is_myth")

In [None]:
# Worker-based agreement report for is_myth_supports
check_worker_based_score("is_myth_supports")

## 3. Alpha agreement score

In [None]:
from nltk.metrics import agreement

# Alpha agreement for is_myth: one (WorkerId, HITId, answer) record per
# assignment, leaving out the records answered 'unsure'.
data = [
    record
    for record in df[['WorkerId', 'HITId', 'is_myth']].values
    if record[2] != 'unsure'
]

rating = agreement.AnnotationTask(data=data)

#print("kappa " + str(rating.kappa()))
#print("fleiss " + str(rating.multi_kappa()))
print(f"alpha {rating.alpha()}")
#print("scotts " + str(rating.pi()))

In [None]:
# Alpha agreement for is_myth_supports: one (WorkerId, HITId, answer) record
# per assignment, leaving out the records answered 'unsure'.
data = [
    record
    for record in df[['WorkerId', 'HITId', 'is_myth_supports']].values
    if record[2] != 'unsure'
]

rating = agreement.AnnotationTask(data=data)

#print("kappa " + str(rating.kappa()))
#print("fleiss " + str(rating.multi_kappa()))
print(f"alpha {rating.alpha()}")
#print("scotts " + str(rating.pi()))

In [None]:
# Distinct workers that appear in the results
worker_ids = [*set(df['WorkerId'])]
print(f'There are {len(worker_ids)} raters')

# Distinct tasks (HITs) that appear in the results
task_ids = [*set(df['HITId'])]
print(f'There are {len(task_ids)} tweets')

In [None]:
# Number of answers per is_myth label
df.groupby('is_myth').size()

In [None]:
# Number of answers per is_myth_supports label
df.groupby('is_myth_supports').size()