In [None]:
import html
import json
import os
import pprint
import xml.etree.ElementTree as ET
from datetime import datetime
from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit
from xml.etree.ElementTree import Element, SubElement, tostring

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import pytz
import scipy.stats
import seaborn as sns
import xmltodict
from bson.objectid import ObjectId
from matplotlib.backends.backend_pdf import PdfPages
from pymongo import MongoClient
from pytz import timezone
from requests.models import PreparedRequest

pp = pprint.PrettyPrinter(indent=4)

# Endpoints and credentials are read from the environment so that no secret
# is stored in the notebook itself. A missing variable raises KeyError here.
URL_MTURK_SANDBOX    = os.environ["URL_MTURK_SANDBOX"]
URL_MTURK_PRODUCTION = os.environ["URL_MTURK_PRODUCTION"]
IAM_USER_ACCESS_KEY  = os.environ["IAM_USER_ACCESS_KEY"]
IAM_USER_SECRET_KEY  = os.environ["IAM_USER_SECRET_KEY"]
MTURK_REGION_NAME    = os.environ["MTURK_REGION_NAME"]
URL_MONGO            = os.environ["URL_MONGO"]

# Handle on the database named in the URL_MONGO connection string.
db = pymongo.MongoClient(URL_MONGO).get_default_database()

# URL_HIT_REMOTE = 'https://hansinky.herokuapp.com/rec_serv'
# URL_HIT_LOCAL  = 'http://0.0.0.0:5000/rec_serv'

# URL_HIT             = URL_HIT_REMOTE

# MTurk endpoint used when the client is created. It has to be defined here:
# the client cell reads ENVIRONMENT and fails with NameError on a fresh kernel
# otherwise. The sandbox is the default so that simply running the notebook
# never touches the live marketplace; set it to URL_MTURK_PRODUCTION for a
# real run.
ENVIRONMENT = URL_MTURK_SANDBOX

In [None]:
# Getting the mTurk Client

client = boto3.client(
    'mturk',
    aws_access_key_id=IAM_USER_ACCESS_KEY,
    aws_secret_access_key=IAM_USER_SECRET_KEY,
    region_name=MTURK_REGION_NAME,
    endpoint_url=ENVIRONMENT,
)

# Name the account after the endpoint actually in use, so the balance is not
# reported as "Sandbox" while the client is pointed at production.
account_label = "Sandbox" if ENVIRONMENT == URL_MTURK_SANDBOX else "Production"
print("I have $" + client.get_account_balance()['AvailableBalance'] + " in my " + account_label + " account")

In [None]:
# This block shows the list of the HITs present in MTurk and their status.

# Walk every result page instead of issuing a single call: one request returns
# at most 100 HITs, so anything beyond the first page would be silently left
# out of the report. `hits` keeps the {'HITs': [...]} shape of a raw response.
hit_pages = client.get_paginator('list_hits').paginate(PaginationConfig={'PageSize': 100})
hits = {'HITs': [hit for page in hit_pages for hit in page['HITs']]}

print("{} HITs found".format(len(hits['HITs'])))

for i, hit in enumerate(hits['HITs']):
    print('\nHIT {}: {} (created: {})\n'.format(i, hit['HITId'], hit['CreationTime']))
    print('  > Status: {}'.format(hit['HITStatus']))
    print('  > Available: {}'.format(hit['NumberOfAssignmentsAvailable']))
    print('  > Completed: {}'.format(hit['NumberOfAssignmentsCompleted']))
    print('  > Pending: {}'.format(hit['NumberOfAssignmentsPending']))

In [None]:
# This block iterates over a list of HIT ids and downloads the results of all of them. The results
# are converted into a pandas DataFrame (namely df), with one row per (assignment, company), to
# facilitate further analysis. The data are then stored in the 'rec_serv_pilot_results.csv' file.

# LIST_OF_HIT_ID is the list of HITs whose results are downloaded.
# E.g., LIST_OF_HIT_ID = ['3BPP3MA3TCKLIJ9JWTZGPV9UR8PELG', '3CO05SML7V5XURP2T6XXQIGP3ZIR0B']
LIST_OF_HIT_ID = []

RESULT_COLUMNS = ['HITId', 'AssignmentId', 'Company_id', 'WorkerId', 'Num. Company', 'Answer', 'Confidence', 'No enough info', 'SubmitTime']

# Rows are collected in a plain list and turned into a DataFrame once at the end;
# appending with df.loc[len(df)] re-allocates the frame on every single row.
records = []

assignment_pages = client.get_paginator('list_assignments_for_hit')

for HITId in LIST_OF_HIT_ID:

    # Follow the pagination tokens so that a HIT with more assignments than fit
    # in one response page is downloaded completely.
    hit_assignments = [
        assignment
        for page in assignment_pages.paginate(HITId=HITId)
        for assignment in page['Assignments']
    ]

    print('\nProcessing HIT {}, that has {} assignments:\n'.format(HITId, len(hit_assignments)))

    for assignment in hit_assignments:

        WorkerId     = assignment['WorkerId']
        SubmitTime   = assignment['SubmitTime']
        AssignmentId = assignment['AssignmentId']

        print('  - {}'.format(AssignmentId))

        xml_doc = xmltodict.parse(assignment['Answer'])

        # xmltodict yields a list only when the element is repeated; a submission with a
        # single <Answer> comes back as one mapping, which must be wrapped before iterating.
        answer_fields = xml_doc['QuestionFormAnswers']['Answer']
        if isinstance(answer_fields, dict):
            answer_fields = [answer_fields]

        # Form field name -> submitted text
        answers = {field['QuestionIdentifier']: field['FreeText'] for field in answer_fields}

        # The form fields are numbered 1..5, one set per company.
        for i in range(1, 6):

            # 'company_<i>' is required (KeyError if absent); the other fields are optional.
            id_company = answers['company_{}'.format(i)]
            answer     = answers.get('comp_number_{}'.format(i))
            confidence = answers.get('confidence_{}'.format(i))
            # The 'cb_<i>' field is only looked up by presence, not by value.
            cb         = 'cb_{}'.format(i) in answers

            records.append([HITId, AssignmentId, id_company, WorkerId, i, answer, confidence, cb, str(SubmitTime)])

df = pd.DataFrame(records, columns=RESULT_COLUMNS)

      
# Index every row by (HIT, assignment, worker, company).
df.set_index(['HITId', 'AssignmentId', 'WorkerId', 'Company_id'], inplace=True)

# Distinct company ids appearing in the downloaded results.
companies_found = df.index.unique(level='Company_id')

# Fetch the matching documents from the Mongo `companies` collection.
company_object_ids = [ObjectId(c) for c in companies_found]
res = list(db.companies.find({'_id': {'$in': company_object_ids}}))

# Company id (as a string) -> 'opencorporates_id' of the stored document.
dict_companies = {
    str(ObjectId(company['_id'])): company['opencorporates_id']
    for company in res
}

def apply_company_id(v):
    """Return the `dict_companies` entry for the company of row `v`.

    `v.name` is the row's index label; its element at position 3 is used as
    the lookup key (the 'Company_id' level of the index set on `df`).
    Raises KeyError when that key is not present in `dict_companies`.
    """
    index_label = v.name
    company_key = index_label[3]
    return dict_companies[company_key]

# Attach to every row the value stored in `dict_companies` for its company. The lookup walks the
# 'Company_id' index level directly: a row-wise DataFrame.apply on a frame with no rows returns a
# DataFrame rather than a Series, which cannot be assigned to a single column, and LIST_OF_HIT_ID
# is empty by default. A company missing from `dict_companies` still raises KeyError.
df['OriginalMatch'] = [
    dict_companies[company_id]
    for company_id in df.index.get_level_values('Company_id')
]

# Encode the confidence labels as ordinal scores. Only the 'Confidence' column is rewritten so
# that no other column (e.g. the free-text 'Answer') can be altered by the substitution.
# NOTE(review): the keys, including the 'complitely' spelling, are kept exactly as they were -
# presumably they mirror the values submitted by the HIT form; verify before "fixing" them.
df['Confidence'] = df['Confidence'].replace(
    {'complitely_confident': 2, 'moderately_confident': 1, 'slightly_confident': 0}
)

df.to_csv('rec_serv_pilot_results.csv', header=True)
    

In [None]:
# Show the downloaded results, stored as a pandas DataFrame

display(df)

In [None]:
# The raw results stored in 'df' are aggregated through the Majority Voting technique, which means that among
# the n=10 'company_number' values obtained from different workers, we keep the most frequent one. The results
# of the aggregation process go into another pandas DataFrame. The aggregated results
# are then stored in the 'rec_serv_pilot_results_aggregated_mv.csv' file.
#
# 'SubmitTime' is deliberately left in `df`: the aggregation selects the columns it needs explicitly,
# so dropping it is unnecessary (a bare `df.drop(...)` whose result is discarded does nothing anyway).

def func_majority_voting(x):
    """Aggregate one group of result rows by majority voting.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; the 'Answer', 'Confidence' and
        'OriginalMatch' columns are read.

    Returns
    -------
    pandas.Series
        Four positional values: the most frequent non-missing 'Answer', the
        number of rows that gave it, the mean of the numeric 'Confidence'
        values rounded to 2 decimals (NaN when there is none), and the
        'OriginalMatch' of the first row. When no row carries an answer the
        first two values are None and 0.
    """
    original_match = x['OriginalMatch'].values[0]

    # Missing confidences can be None as well as NaN, and np.isnan(None) raises TypeError.
    # Coercing to numeric turns every non-numeric entry into NaN, which mean() skips.
    confidence = np.round(pd.to_numeric(x['Confidence'], errors='coerce').mean(), 2)

    # Keep only the rows where an answer was actually given (neither None nor NaN).
    values = [v for v in x['Answer'].values if pd.notna(v)]

    if not values:
        # Nobody answered for this group: there is no winner to report.
        return pd.Series([None, 0, confidence, original_match])

    u, count = np.unique(values, return_counts=True)

    # argmax returns the first maximum, so ties resolve deterministically to the
    # smallest value in np.unique's sorted order.
    winner = np.argmax(count)

    return pd.Series([u[winner], count[winner], confidence, original_match])


# Majority-vote every (HITId, Company_id) group, i.e. index levels 0 and 3 of `df`.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is rejected by pandas >= 2.0.
votes_by_company = df.groupby(level=[0, 3])[['Answer', 'Confidence', 'No enough info', 'OriginalMatch']]

# func_majority_voting returns four positional values per group; name them.
df_aggregated = votes_by_company.apply(func_majority_voting)
df_aggregated.columns = ['Answer', 'Workers', 'Avg. Workers Conf.', 'OriginalMatch']

df_aggregated.to_csv('rec_serv_pilot_results_aggregated_mv.csv', header=True)

display(df_aggregated)




In [None]:
# This block generates the chart of the number of workers who opted for the most frequent company number.
# The chart is then saved in the 'Worker_Agreement.pdf' file.

sns.set(font_scale=1.2)
sns.set_style('whitegrid')

g = sns.catplot(x="Answer", y="Workers", data=df_aggregated, height=4, kind="bar", palette="muted", aspect=2)
g.despine(left=True)
# NOTE(review): the upper bound of 10 presumably matches the number of workers per company - confirm.
g.set(ylim=(0, 10))
g.set_ylabels("No. workers who agreed\non the Company Number")
g.set_xlabels("Company Number")
g.set_xticklabels(rotation=30)

# The PDF is opened only once the figure exists, and the context manager closes it
# even if saving fails. savefig() without a figure saves the current one, i.e. the
# figure catplot has just created.
with PdfPages('Worker_Agreement.pdf') as pages:
    pages.savefig(bbox_inches='tight')

In [None]:
# This block generates a chart that shows the distribution of the confidence given by the workers
# to the 10 companies, in a range that goes from 0 (Slightly confident) to 2 (Completely confident).
# The chart is stored in the 'Worker_Confidence_Distribution.pdf' file.

sns.set(font_scale=1.2)
sns.set_style('whitegrid')

# Explicit figure/axes, created after the theme is applied, so that nothing depends on
# whichever figure happens to be "current".
fig, ax = plt.subplots()

# 20 equal-width bins over the 0..2 confidence scale.
bins = np.linspace(0, 2, 21)

# The aggregated frame is named `df_aggregated`; no variable called `aggregated` exists.
# NOTE(review): distplot is deprecated in recent seaborn releases; it is kept so that the
# chart stays unchanged - consider migrating to histplot/displot.
sns.distplot(df_aggregated["Avg. Workers Conf."], bins=bins, ax=ax)

ax.set_ylabel('No. of companies')
ax.set_xlabel('\n\n\n Mean workers confidence per company ')

ax.set_xlim(-0.1, 2.1)
ax.set_ylim(-0.15, 3.15)

# Captions placed below the x axis (y = -1 lies outside the visible y range) under the
# three confidence levels.
ax.text(0, -1, 'Slightly\nconfident', ha='center', fontsize=13)
ax.text(1, -1, 'Moderately\nconfident', ha='center', fontsize=13)
ax.text(2, -1, 'Completely\nconfident', ha='center', fontsize=13)

ax.set_yticks(range(0, 4, 1))

# The context manager closes the PDF even if saving fails.
with PdfPages('Worker_Confidence_Distribution.pdf') as pages:
    pages.savefig(fig, bbox_inches='tight')