# Brief TribeDynamics data exploration

In [2]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.metrics import accuracy_score

# Load data

In [3]:
# Read the labelled-posts export into a DataFrame and preview its first record.
# NOTE(review): the worker_id values shown in the preview look like e-mail
# addresses, so the saved output exposes PII -- consider masking it before sharing.
df = pd.read_csv(filepath_or_buffer='tribe_dynamics_data.csv')
df.iloc[:1]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,0,0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\r\nケイト スペード ニューヨークの\r\n2017Fall Collectionに招...,False,,ja


##### Drop first two columns (unnecessary row identifiers)

In [4]:
# Remove the two unnamed row-identifier columns; they carry no information.
# `df` is rebound instead of mutated in place and errors='ignore' is passed so
# the cell is idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df.head(1)

Unnamed: 0,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\r\nケイト スペード ニューヨークの\r\n2017Fall Collectionに招...,False,,ja


### Check for duplicated rows

Exact duplicate rows (`True` means every column of the row is identical to an earlier row)
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.duplicated.html

In [4]:
# Tally exact duplicates across all columns. With keep='first' the first
# occurrence of a repeated row counts as False and only its repeats as True.
Counter(df.duplicated(subset=None, keep='first'))

Counter({False: 16142, True: 607})

## Accuracy analysis

##### Keep only relevant columns for the accuracy analysis

In [5]:
# Keep only the human label, the model prediction and the fields used to slice
# the accuracy figures. Column order matters: later cells read 'answer' and
# 'model_decision' by position (columns 0 and 1).
df_reduced = df.loc[:, ['answer', 'model_decision', 'lang', 'mturker', 'text']]
df_reduced.iloc[:3]

Unnamed: 0,answer,model_decision,lang,mturker
0,False,False,ja,False
1,False,False,ja,False
2,False,False,ja,False


###### Check for null values 
The `lang` column contains null values; `answer` and `model_decision` do not.

In [6]:
# Does the human label column contain any missing value?
df_reduced['answer'].isna().to_numpy().any()

False

In [7]:
# Does the model prediction column contain any missing value?
df_reduced['model_decision'].isna().to_numpy().any()

False

In [8]:
# Does the language column contain any missing value?
df_reduced['lang'].isna().to_numpy().any()

True

Replace null values in 'lang' field with NaN_lang

In [9]:
# Fill missing values in the 'lang' column only. A bare scalar passed to
# fillna() would also overwrite nulls in every other column (e.g. 'text') with
# the 'NaN_lang' marker, which is only meaningful as a language code.
df_filled = df_reduced.fillna({'lang': 'NaN_lang'})

### Present languages

In [33]:
# Number of posts per language code (missing languages appear as 'NaN_lang').
Counter(df_filled.lang.to_numpy())

Counter({'NaN_lang': 24,
         'ar': 151,
         'ca': 3,
         'de': 521,
         'en': 9426,
         'es': 1137,
         'fr': 615,
         'id': 3,
         'it': 469,
         'ja': 2480,
         'ko': 246,
         'nl': 18,
         'pl': 4,
         'pt': 579,
         'ro': 10,
         'ru': 251,
         'sv': 4,
         'th': 202,
         'tr': 152,
         'vi': 3,
         'zh': 451})

In [11]:
# Lookup table from the language codes seen in the 'lang' column (plus the
# 'NaN_lang' placeholder used for missing values) to human-readable names.
languages_dict = dict([
    ('NaN_lang', 'NaN_language'),
    ('ar', 'Arabic'),
    ('ca', 'Catalan/Valencian'),
    ('de', 'German'),
    ('en', 'English'),
    ('es', 'Spanish/Castilian'),
    ('fr', 'French'),
    ('id', 'Indonesian'),
    ('it', 'Italian'),
    ('ja', 'Japanese'),
    ('ko', 'Korean'),
    ('nl', 'Dutch/Flemish'),
    ('pl', 'Polish'),
    ('pt', 'Portuguese'),
    ('ro', 'Romanian/Moldavian/Moldovan'),
    ('ru', 'Russian'),
    ('sv', 'Swedish'),
    ('th', 'Thai'),
    ('tr', 'Turkish'),
    ('vi', 'Vietnamese'),
    ('zh', 'Chinese'),
])

### Overall accuracy

In [12]:
# Overall agreement between the human label ('answer') and the model output
# ('model_decision') over every row.
# The builtin `bool` replaces `np.bool`, an alias that was removed in
# NumPy 1.24, and the columns are selected by name rather than by position so
# the cell does not silently break if the column order changes.
y_true = df_filled['answer'].astype(bool).to_numpy()
y_pred = df_filled['model_decision'].astype(bool).to_numpy()
print("Number of instances: %d\t\tOverall accuracy=%.3f" % (len(y_true), accuracy_score(y_true, y_pred)))

Number of instances: 16749		Overall accuracy=0.935


### Per language accuracy

In [34]:
# Distinct language codes in the data; np.unique returns them sorted.
languages = np.unique(df_filled.lang.to_numpy())
print("Number of distinct languages in the dataset (including Nan_lang): %d" % languages.size)

Number of distinct languages in the dataset (including Nan_lang): 21


In [49]:
# Number of posts whose language is known; denominator of the per-language
# share printed below. Counted from the data instead of subtracting a
# hard-coded number of missing values, so it stays right if the dataset changes.
total_posts = int((df_filled.lang != 'NaN_lang').sum())


def language_accuracy(df, language):
    """Print instance count, share of posts and model accuracy for one language.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'answer' (human label), 'model_decision' (model output)
        and 'lang' columns.
    language : str
        Value of the 'lang' column to report on.

    Returns
    -------
    None
        The report is printed. The share is computed against the
        module-level ``total_posts``.
    """
    language_df = df[df.lang == language]
    if language_df.shape[0] == 0:
        # Nothing to score; mirrors the guard in accuracy_per_labeler.
        print("Language %s has no instances for this case" % (language))
        return
    # Builtin `bool` instead of `np.bool` (alias removed in NumPy 1.24);
    # columns are read by name rather than by position.
    y_true = language_df['answer'].astype(bool).to_numpy()
    y_pred = language_df['model_decision'].astype(bool).to_numpy()
    print("Language: %s\t\tNumber of instances: %d (%.2f percentage)\t\tAccuracy=%d (%.3f percentage)" % 
          (language, len(y_true), len(y_true)/total_posts*100,accuracy_score(y_true, y_pred, normalize=False), accuracy_score(y_true, y_pred)))

In [50]:
# Emit one report line per language code found in the data.
for lang in languages:
    language_accuracy(df=df_filled, language=lang)

Language: NaN_lang		Number of instances: 24 (0.14 percentage)		Accuracy=22 (0.917 percentage)
Language: ar		Number of instances: 151 (0.90 percentage)		Accuracy=149 (0.987 percentage)
Language: ca		Number of instances: 3 (0.02 percentage)		Accuracy=3 (1.000 percentage)
Language: de		Number of instances: 521 (3.12 percentage)		Accuracy=492 (0.944 percentage)
Language: en		Number of instances: 9426 (56.36 percentage)		Accuracy=9040 (0.959 percentage)
Language: es		Number of instances: 1137 (6.80 percentage)		Accuracy=1098 (0.966 percentage)
Language: fr		Number of instances: 615 (3.68 percentage)		Accuracy=601 (0.977 percentage)
Language: id		Number of instances: 3 (0.02 percentage)		Accuracy=3 (1.000 percentage)
Language: it		Number of instances: 469 (2.80 percentage)		Accuracy=461 (0.983 percentage)
Language: ja		Number of instances: 2480 (14.83 percentage)		Accuracy=2068 (0.834 percentage)
Language: ko		Number of instances: 246 (1.47 percentage)		Accuracy=197 (0.801 percentage)
Langua

## MTurk vs TribeDynamics labeling

In [16]:
# Count the rows flagged as labelled by an MTurk worker.
print("Instances labeled by MTurkers: %d" % df.mturker.eq(True).sum())

Instances labeled by MTurkers: 16208


In [17]:
# Count the rows whose mturker flag is False.
print("Instances labeled by Tribe's employee: %d" % df.mturker.eq(False).sum())

Instances labeled by Tribe's employee: 541


In [18]:
# Rows labelled through MTurk and the distinct worker ids behind them.
df_amazon = df.loc[df.mturker.eq(True)]
unique_amazon_workers = np.unique(df_amazon['worker_id'].to_numpy())
print("Number of unique MTurk individuals: %d" % unique_amazon_workers.size)

Number of unique MTurk individuals: 404


In [19]:
# Rows whose mturker flag is False and the distinct worker ids behind them.
df_tribe = df.loc[df.mturker.eq(False)]
unique_tribe_workers = np.unique(df_tribe['worker_id'].to_numpy())
print("Number of unique Tribe's empolyees: %d" % unique_tribe_workers.size)

Number of unique Tribe's empolyees: 3


In [20]:
def accuracy_per_labeler(df, language, isAmazon):
    """Print model accuracy for one language restricted to one kind of labeler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'answer' (human label), 'model_decision' (model output),
        'lang' and 'mturker' columns.
    language : str
        Value of the 'lang' column to report on.
    isAmazon : bool
        Value the 'mturker' column must equal for a row to be included.

    Returns
    -------
    None
        The report (or a "no instances" notice for an empty subset) is printed.
    """
    language_df = df[(df.lang == language) & (df.mturker == isAmazon)]
    if language_df.shape[0]>0:
        # Builtin `bool` instead of `np.bool` (alias removed in NumPy 1.24);
        # columns are read by name rather than by position.
        y_true = language_df['answer'].astype(bool).to_numpy()
        y_pred = language_df['model_decision'].astype(bool).to_numpy()
        print("Language: %s\t\tNumber of instances: %d\t\tCorrectly classified posts: %d\t\tAccuracy=%.3f" % 
              (language, len(y_true), accuracy_score(y_true, y_pred, normalize=False), accuracy_score(y_true, y_pred)))
    else:
        print("Language %s has no instances for this case" % (language))

In [21]:
# Model accuracy restricted to rows whose mturker flag is True.
# Builtin `bool` replaces `np.bool` (alias removed in NumPy 1.24) and the
# columns are selected by name instead of by position.
mturk_rows = df_filled[df_filled.mturker == True]
y_true = mturk_rows['answer'].astype(bool).to_numpy()
y_pred = mturk_rows['model_decision'].astype(bool).to_numpy()
print("Overall accuracy when labelers are from MTurk = %.3f" % (accuracy_score(y_true, y_pred)))

Overall accuracy when labelers are from MTurk = 0.933


In [22]:
print("Accuracy per language when labelers are from Amazon MTurk:")
# One report line per language, MTurk-labelled rows only.
for lang in languages:
    accuracy_per_labeler(df=df_filled, language=lang, isAmazon=True)

Accuracy per language when labelers are from Amazon MTurk:
Language: NaN_lang		Number of instances: 24		Correctly classified posts: 22		Accuracy=0.917
Language: ar		Number of instances: 151		Correctly classified posts: 149		Accuracy=0.987
Language: ca		Number of instances: 3		Correctly classified posts: 3		Accuracy=1.000
Language: de		Number of instances: 521		Correctly classified posts: 492		Accuracy=0.944
Language: en		Number of instances: 9288		Correctly classified posts: 8903		Accuracy=0.959
Language: es		Number of instances: 1133		Correctly classified posts: 1094		Accuracy=0.966
Language: fr		Number of instances: 614		Correctly classified posts: 600		Accuracy=0.977
Language: id		Number of instances: 3		Correctly classified posts: 3		Accuracy=1.000
Language: it		Number of instances: 469		Correctly classified posts: 461		Accuracy=0.983
Language: ja		Number of instances: 2191		Correctly classified posts: 1779		Accuracy=0.812
Language: ko		Number of instances: 226		Correctly classifie

In [23]:
# Model accuracy restricted to rows whose mturker flag is False.
# Builtin `bool` replaces `np.bool` (alias removed in NumPy 1.24) and the
# columns are selected by name instead of by position.
tribe_rows = df_filled[df_filled.mturker == False]
y_true = tribe_rows['answer'].astype(bool).to_numpy()
y_pred = tribe_rows['model_decision'].astype(bool).to_numpy()
print("Overall accuracy when labelers are Tribe's employees = %.3f" % (accuracy_score(y_true, y_pred)))

Overall accuracy when labelers are Tribe's employees = 0.998


In [24]:
print("Accuracy per language when labelers are Tribe's employees:")
# One report line per language, rows with mturker == False only.
for lang in languages:
    accuracy_per_labeler(df=df_filled, language=lang, isAmazon=False)

Accuracy per language when labelers are Tribe's employees:
Language NaN_lang has no instances for this case
Language ar has no instances for this case
Language ca has no instances for this case
Language de has no instances for this case
Language: en		Number of instances: 138		Correctly classified posts: 137		Accuracy=0.993
Language: es		Number of instances: 4		Correctly classified posts: 4		Accuracy=1.000
Language: fr		Number of instances: 1		Correctly classified posts: 1		Accuracy=1.000
Language id has no instances for this case
Language it has no instances for this case
Language: ja		Number of instances: 289		Correctly classified posts: 289		Accuracy=1.000
Language: ko		Number of instances: 20		Correctly classified posts: 20		Accuracy=1.000
Language nl has no instances for this case
Language pl has no instances for this case
Language pt has no instances for this case
Language ro has no instances for this case
Language ru has no instances for this case
Language sv has no instances for

# Plots

In [25]:
# Per-language accuracy multiplied by the number of posts in that language,
# i.e. despite the name each entry is the count of correctly classified posts.
# Entries follow the order of `languages`.
accuracy_list = []
for lang in languages:
    language_df = df_filled[df_filled.lang == lang]
    # Builtin `bool` instead of `np.bool` (alias removed in NumPy 1.24);
    # columns are read by name rather than by position.
    y_true = language_df['answer'].astype(bool).to_numpy()
    y_pred = language_df['model_decision'].astype(bool).to_numpy()
    accuracy_list.append(accuracy_score(y_true, y_pred) * language_df.shape[0])
accuracy_list = np.array(accuracy_list)

In [26]:
# Number of posts per language, in the same order as `languages`.
n_instances_list = np.array(
    [int((df_filled.lang == lang).sum()) for lang in languages]
)

## Confusion matrix