In [1]:
import pandas as pd
import numpy as np

from collections import Counter

Import dataset

In [2]:
# Load the raw CSV into a DataFrame and preview its first row.
# NOTE(review): the previewed row shows a worker_id that contains an e-mail
# address; consider clearing or redacting this output before sharing.
df = pd.read_csv('tribe_dynamics_data.csv')
df.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,0,0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\nケイト スペード ニューヨークの\n2017Fall Collectionに招待してい...,False,,ja


Drop the first two columns (`Unnamed: 0` and `Unnamed: 0.1`), which are not needed for the analysis

In [3]:
# Rebind instead of mutating in place so the cell can be re-run safely:
# with errors='ignore' a second run is a no-op instead of a KeyError
# (the columns are already gone after the first run).
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, errors='ignore')
df.head(1)

Unnamed: 0,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\nケイト スペード ニューヨークの\n2017Fall Collectionに招待してい...,False,,ja


Count exact duplicate rows (`True` means the row is a duplicate of an earlier row)

In [4]:
# Tally of df.duplicated(): True marks a row that exactly repeats an earlier one.
# NOTE(review): duplicates are only counted here; no visible cell removes them,
# so the accuracy figures further down still include the duplicated rows.
Counter(df.duplicated())

Counter({False: 16142, True: 607})

Keep only relevant columns for the accuracy analysis

In [5]:
# Keep just the four columns the accuracy analysis reads, then preview them.
df_reduced = df.loc[:, ['answer', 'model_decision', 'lang', 'mturker']]
df_reduced.head()

Unnamed: 0,answer,model_decision,lang,mturker
0,False,False,ja,False
1,False,False,ja,False
2,False,False,ja,False
3,False,False,ja,False
4,False,False,ja,False


Check for null values -> of the columns checked below, only the `lang` column contains nulls

In [6]:
# True if at least one `answer` value is missing.
df_reduced['answer'].isnull().any()

False

In [7]:
# True if at least one `model_decision` value is missing.
df_reduced['model_decision'].isnull().any()

False

In [8]:
# True if at least one `lang` value is missing.
df_reduced['lang'].isnull().any()

True

In [9]:
# Fill only the language column. A blanket fillna would also replace a missing
# answer/model_decision with the string 'NULL_lang', which the later
# astype(bool) casts would silently read as True.
df_filled = df_reduced.fillna({'lang': 'NULL_lang'})

In [10]:
# Object-dtype matrix of df_filled; columns are in the order
# answer, model_decision, lang, mturker.
arr = df_filled.values
arr

array([[False, False, 'ja', False],
       [False, False, 'ja', False],
       [False, False, 'ja', False],
       ..., 
       [True, False, 'en', True],
       [False, False, 'en', True],
       [False, False, 'ja', False]], dtype=object)

In [11]:
# Column 0 of arr is `answer`. Cast with the builtin bool: the np.bool alias
# is deprecated and no longer exists in newer NumPy releases.
y_true = arr[:, 0].astype(bool)
y_true

array([False, False, False, ...,  True, False, False], dtype=bool)

In [12]:
# Column 1 of arr is `model_decision`. Cast with the builtin bool: the np.bool
# alias is deprecated and no longer exists in newer NumPy releases.
y_pred = arr[:, 1].astype(bool)
y_pred

array([False, False, False, ..., False, False, False], dtype=bool)

In [13]:
# Overall agreement between model_decision (y_pred) and answer (y_true) over all rows.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.93539912830616756

In [14]:
# Raw language codes, one entry per row.
df_filled.lang.values

array(['ja', 'ja', 'ja', ..., 'en', 'en', 'ja'], dtype=object)

In [15]:
# Number of rows per language code (including the 'NULL_lang' placeholder).
Counter(df_filled.lang.values)

Counter({'NULL_lang': 24,
         'ar': 151,
         'ca': 3,
         'de': 521,
         'en': 9426,
         'es': 1137,
         'fr': 615,
         'id': 3,
         'it': 469,
         'ja': 2480,
         'ko': 246,
         'nl': 18,
         'pl': 4,
         'pt': 579,
         'ro': 10,
         'ru': 251,
         'sv': 4,
         'th': 202,
         'tr': 152,
         'vi': 3,
         'zh': 451})

In [16]:
# Sorted array of the distinct language codes; drives the per-language loops.
languages = np.unique(df_filled.lang.values)
languages

array(['NULL_lang', 'ar', 'ca', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja',
       'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'th', 'tr', 'vi', 'zh'], dtype=object)

In [17]:
def language_accuracy(df, language):
    """Print the row count and model accuracy for a single language.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``answer``, ``model_decision`` and ``lang`` columns.
    language : str
        Value of the ``lang`` column to select.

    Returns
    -------
    None
        One formatted line is printed. Accuracy is reported as ``nan`` when
        no row matches ``language``.
    """
    language_df = df[df.lang == language]
    # Select by column name rather than by position, so the result does not
    # depend on the column order of the frame that is passed in.
    # Builtin bool replaces the np.bool alias, which newer NumPy no longer provides.
    y_true = language_df['answer'].values.astype(bool)
    y_pred = language_df['model_decision'].values.astype(bool)
    # Do not call the metric on an empty selection; report nan directly.
    accuracy = accuracy_score(y_true, y_pred) if len(y_true) else float('nan')
    print("Language: %s\t\tNumber of instances: %d\t\tAccuracy=%.3f" % (language, len(y_true), accuracy))

In [18]:
# Print one accuracy line per language code, over all rows of df_filled.
for lang in languages:
    language_accuracy(df_filled, lang)

Language: NULL_lang		Number of instances: 24		Accuracy=0.917
Language: ar		Number of instances: 151		Accuracy=0.987
Language: ca		Number of instances: 3		Accuracy=1.000
Language: de		Number of instances: 521		Accuracy=0.944
Language: en		Number of instances: 9426		Accuracy=0.959
Language: es		Number of instances: 1137		Accuracy=0.966
Language: fr		Number of instances: 615		Accuracy=0.977
Language: id		Number of instances: 3		Accuracy=1.000
Language: it		Number of instances: 469		Accuracy=0.983
Language: ja		Number of instances: 2480		Accuracy=0.834
Language: ko		Number of instances: 246		Accuracy=0.801
Language: nl		Number of instances: 18		Accuracy=1.000
Language: pl		Number of instances: 4		Accuracy=1.000
Language: pt		Number of instances: 579		Accuracy=0.981
Language: ro		Number of instances: 10		Accuracy=0.700
Language: ru		Number of instances: 251		Accuracy=0.884
Language: sv		Number of instances: 4		Accuracy=1.000
Language: th		Number of instances: 202		Accuracy=0.837
Language: t

In [19]:
def mturk_accuracy(df, language, isAmazon):
    """Print the row count and model accuracy for one language and one mturker value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``answer``, ``model_decision``, ``lang`` and ``mturker`` columns.
    language : str
        Value of the ``lang`` column to select.
    isAmazon : bool
        Value the ``mturker`` column must equal for a row to be included.

    Returns
    -------
    None
        One formatted line is printed. Accuracy is reported as ``nan`` when
        the language/mturker combination has no rows.
    """
    language_df = df[(df.lang == language) & (df.mturker == isAmazon)]
    # Select by column name rather than by position, so the result does not
    # depend on the column order of the frame that is passed in.
    # Builtin bool replaces the np.bool alias, which newer NumPy no longer provides.
    y_true = language_df['answer'].values.astype(bool)
    y_pred = language_df['model_decision'].values.astype(bool)
    # Many language/mturker combinations are empty; report nan directly
    # instead of passing empty arrays to the metric.
    accuracy = accuracy_score(y_true, y_pred) if len(y_true) else float('nan')
    print("Language: %s\t\tNumber of instances: %d\t\tAccuracy=%.3f" % (language, len(y_true), accuracy))

In [24]:
# Number of rows whose mturker flag is True. The count must be the last
# expression in the cell to be displayed; the stray bare `print` that
# followed it has been removed.
len(df[df.mturker == True])

16208

In [25]:
# Number of rows whose mturker flag is False.
len(df[df.mturker == False])

541

In [20]:
# Per-language accuracy restricted to rows with mturker == True.
print("Ground truth label AMAZON MTURK")
for lang in languages:
    mturk_accuracy(df_filled, lang, isAmazon=True)

Ground truth label AMAZON MTURK
Language: NULL_lang		Number of instances: 24		Accuracy=0.917
Language: ar		Number of instances: 151		Accuracy=0.987
Language: ca		Number of instances: 3		Accuracy=1.000
Language: de		Number of instances: 521		Accuracy=0.944
Language: en		Number of instances: 9288		Accuracy=0.959
Language: es		Number of instances: 1133		Accuracy=0.966
Language: fr		Number of instances: 614		Accuracy=0.977
Language: id		Number of instances: 3		Accuracy=1.000
Language: it		Number of instances: 469		Accuracy=0.983
Language: ja		Number of instances: 2191		Accuracy=0.812
Language: ko		Number of instances: 226		Accuracy=0.783
Language: nl		Number of instances: 18		Accuracy=1.000
Language: pl		Number of instances: 4		Accuracy=1.000
Language: pt		Number of instances: 579		Accuracy=0.981
Language: ro		Number of instances: 10		Accuracy=0.700
Language: ru		Number of instances: 251		Accuracy=0.884
Language: sv		Number of instances: 4		Accuracy=1.000
Language: th		Number of instances:

In [21]:
# Per-language accuracy restricted to rows with mturker == False.
# Languages with no such rows are printed with 0 instances and Accuracy=nan.
print("Ground truth label TRIBE EMPLOYEE")
for lang in languages:
    mturk_accuracy(df_filled, lang, isAmazon=False)

Ground truth label TRIBE EMPLOYEE
Language: NULL_lang		Number of instances: 0		Accuracy=nan
Language: ar		Number of instances: 0		Accuracy=nan
Language: ca		Number of instances: 0		Accuracy=nan
Language: de		Number of instances: 0		Accuracy=nan
Language: en		Number of instances: 138		Accuracy=0.993
Language: es		Number of instances: 4		Accuracy=1.000
Language: fr		Number of instances: 1		Accuracy=1.000
Language: id		Number of instances: 0		Accuracy=nan
Language: it		Number of instances: 0		Accuracy=nan
Language: ja		Number of instances: 289		Accuracy=1.000
Language: ko		Number of instances: 20		Accuracy=1.000
Language: nl		Number of instances: 0		Accuracy=nan
Language: pl		Number of instances: 0		Accuracy=nan
Language: pt		Number of instances: 0		Accuracy=nan
Language: ro		Number of instances: 0		Accuracy=nan
Language: ru		Number of instances: 0		Accuracy=nan
Language: sv		Number of instances: 0		Accuracy=nan
Language: th		Number of instances: 12		Accuracy=1.000
Language: tr		Number of

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [26]:
# Distinct worker_id values among the rows with mturker == True.
df_amazon = df.loc[df.mturker == True]
unique_amazon_workers = np.unique(df_amazon['worker_id'].values)
len(unique_amazon_workers)

404