In [1]:
import pandas as pd
import numpy as np
import codecs, json
from collections import Counter

In [2]:
# Ids of the selected brands; the order matters because the loaded data is
# later accessed by position
brand_ids = [
    '13578',  # Lush
    '12003',  # La Roche-Posay
    '14680',  # Dove
    '12857',  # Vichy
]

In [3]:
# Columns written to the final dataset, in output order
columns = [
    # post content and metadata
    'text', 'lang', 'link',
    # existing annotations
    'mturker', 'model_decision', 'labels',
    # labels computed / added in this notebook
    'aggregated_label', 'capstone_label',
]

In [4]:
# Load the JSON export of every selected brand and store the parsed objects
# in a list, in the same order as brand_ids
dict_list = []

for brand_id in brand_ids:

    # codecs.open already decodes the file as UTF-8, so json.load needs no
    # `encoding` argument (that keyword was removed from json.load in
    # Python 3.9, where passing it raises a TypeError)
    with codecs.open('../../Data/CSE_20180215/' + brand_id + '_data.json', 'r', 'utf-8') as f_data:
        dict_list.append(json.load(f_data))

In [5]:
# Build one dataframe per brand from the first four loaded objects, which
# follow the order of brand_ids (Lush, La Roche-Posay, Dove, Vichy)
df_lush, df_roche, df_dove, df_vichy = map(pd.DataFrame.from_dict, dict_list[:4])

In [6]:
# Create one big dataframe as the concatenation of all the brand dataframes
df_tot = pd.concat([df_lush, df_roche, df_dove, df_vichy])
# Only keep the rows whose mturker flag is 1. The explicit copy makes df_tot
# an independent dataframe, so adding a column to it later does not raise
# pandas' SettingWithCopyWarning.
df_tot = df_tot[df_tot.mturker == 1].copy()

print(df_tot.shape)
df_tot.head(5)

(14167, 6)


Unnamed: 0,labels,lang,link,model_decision,mturker,text
81,"[1, 0, 1, 0, 0]",en,https://www.pinterest.com/pin/132504414013213465/,,1,The Elysian Fields..... Lush
84,"[1, 0, 1, 0, 0, 0]",en,http://www.savvyspice.com/2015/08/back-from-ba...,,1,Back from Bali Founder Spills Her Secrets to S...
277,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",en,http://www.100layercake.com/blog/2015/08/03/vi...,,1,Rancho Las Lomas. Obvs they had to change cour...
278,"[1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, ...",en,http://feedproxy.google.com/~r/blogspot/qwWIw/...,,1,Grey & White Wearing: Nordstrom Lush tunic | T...
281,"[1, 1, 1, 1, 0, 1, 1, 1, 1]",en,https://www.youtube.com/watch?v=3ZDlFYkUJj8,,1,"Fall Haul: H&M, Forever21, B&BW, Lush & More! ..."


In [7]:
def label_unpacking(df):
    """Aggregate each row's list of binary votes into a single boolean label.

    The votes are read from the ``labels`` column when the dataframe has one;
    otherwise the first column is used (the original positional lookup).

    A row is labelled True only when the 1 votes strictly outnumber the 0
    votes. Ties, including an empty list of votes, resolve to False: false
    negatives are preferred to false positives.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose ``labels`` column (or, failing that, first column)
        holds one iterable of 0/1 votes per row.

    Returns
    -------
    numpy.ndarray
        Boolean array with one aggregated label per row, in row order.
    """
    # Look the column up by name: relying on it being at position 0 silently
    # aggregates the wrong column as soon as the column order changes
    if 'labels' in df.columns:
        list_of_label_lists = df['labels'].values
    else:
        list_of_label_lists = df.iloc[:, 0].values

    new_labels_list = []

    for label_list in list_of_label_lists:
        labels_counter = Counter(label_list)
        # Strict majority of 1 votes is required for a True label
        new_labels_list.append(labels_counter[1] > labels_counter[0])

    # Explicit dtype so that an empty input still yields a boolean array
    return np.array(new_labels_list, dtype=bool)

In [8]:
# Aggregate the labels of every row of df_tot into one boolean per row
# (label_unpacking returns a numpy array, in row order) and store the result
# in a new 'aggregated_label' column
aggregated_label_arr = label_unpacking(df_tot)
df_tot['aggregated_label'] = aggregated_label_arr

In [9]:
# Start from an empty dataframe that has the same columns as df_tot
df_res = pd.DataFrame(columns=list(df_tot.columns))
df_res

Unnamed: 0,labels,lang,link,model_decision,mturker,text,aggregated_label


In [10]:
languages = ['it', 'es', 'fr']
N_PER_CLASS = 30   # posts sampled per (language, aggregated label) pair
SAMPLE_SEED = 42   # fixed seed so that the sample is reproducible

# Sample per language. The samples are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0, and rebuilding df_res from
# scratch keeps this cell idempotent (re-running it does not add rows twice).
sampled_parts = []

for lang in languages:
    df_lang = df_tot[df_tot.lang == lang]
    # Sample N_PER_CLASS True, then N_PER_CLASS False, in that language
    for label in (True, False):
        df_lang_label = df_lang[df_lang.aggregated_label == label]
        sampled_parts.append(
            df_lang_label.sample(n=N_PER_CLASS, replace=False, random_state=SAMPLE_SEED)
        )

df_res = pd.concat(sampled_parts)

In [11]:
# Display the sampled posts
df_res

Unnamed: 0,labels,lang,link,model_decision,mturker,text,aggregated_label
2738,"[1, 1, 1, 1]",it,https://www.facebook.com/165693913457722/posts...,,1,"settimana, ad iniziare così le mie giornate! C...",True
4807,"[1, 1, 1, 1]",it,https://www.instagram.com/p/BVmBAWugohj/,,1,#goodmorning #beauties: una tazza di #caffé pe...,True
1409,"[1, 1, 1, 1]",it,https://www.instagram.com/p/BNPVf_rAtpL/,,1,"I simboli sono fortissimi, ma non sono niente ...",True
6260,"[1, 1, 1]",it,https://www.instagram.com/p/BWz31sVh1uL/,,1,Il sole picchia forte forte qui in Sicilia #la...,True
5169,"[1, 1, 0, 1]",it,https://www.instagram.com/p/BRX6pz-hlXA/,,1,#studio #vicandersonmakeup La Porte Space Tagg...,True
4124,"[1, 1, 0, 1]",it,https://blog.cliomakeup.com/2017/05/hair-spa-f...,,1,se ne possono creare anche in casa con ingredi...,True
3074,"[1, 1, 1, 1]",it,https://www.facebook.com/251448664874526/posts...,,1,Sono arrivati in Italia i Collutori Solidi Lus...,True
4822,"[1, 1]",it,http://www.mavie.it/al-mare-lush/,,1,Al mare con Lush Conoscete i solari Lush? S...,True
1326,"[1, 1, 1, 1]",it,http://www.vanitynerd.com/2017/03/01/smokey-ey...,,1,con la compatta.\nPartiamo come sempre dalla b...,True
5403,"[1, 1, 1, 1]",it,https://www.twitter.com/BEAUTYDEAit/statuses/8...,,1,"Ragazze, oggi vi presentiamo la nuova acqua mi...",True


In [12]:
# Check the class balance: number of sampled rows per aggregated label
Counter(df_res.aggregated_label)

Counter({False: 90, True: 90})

In [13]:
# Check the language balance: number of sampled rows per language
Counter(df_res.lang)

Counter({'es': 60, 'fr': 60, 'it': 60})

In [14]:
# Add the empty 'capstone_label' column (every row is initialised to None)
# NOTE(review): presumably filled in by hand in the exported CSV files - confirm
df_res['capstone_label'] = None

In [15]:
# Divide df_res in 3 disjoint parts, balanced per language and per aggregated
# label. Each (language, label) group is shuffled once and then cut into
# consecutive slices, so no post ends up in two parts (independent sampling
# per part would produce overlapping parts and leave some posts unassigned).
N_PARTS = 3
N_PER_PART = 10   # posts per (language, aggregated label) pair in each part
SPLIT_SEED = 42   # fixed seed so that the split is reproducible

rows_per_part = [[] for _ in range(N_PARTS)]

for lang in languages:
    for label in (True, False):
        group = df_res[(df_res.lang == lang) & (df_res.aggregated_label == label)]
        if len(group) < N_PARTS * N_PER_PART:
            raise ValueError(
                'Not enough rows to split: lang=%s, aggregated_label=%s has %d rows, %d needed'
                % (lang, label, len(group), N_PARTS * N_PER_PART)
            )
        # frac=1 without replacement returns all the rows in random order
        shuffled = group.sample(frac=1, replace=False, random_state=SPLIT_SEED)
        for i in range(N_PARTS):
            rows_per_part[i].append(shuffled.iloc[i * N_PER_PART:(i + 1) * N_PER_PART])

# One dataframe per part, restricted to the output columns
list_of_df = [pd.concat(part_rows)[columns] for part_rows in rows_per_part]

In [16]:
# Number of dataframes produced by the split
len(list_of_df)

3

In [17]:
# Display the first dataframe of the split as an example
list_of_df[0]

Unnamed: 0,text,lang,link,mturker,model_decision,labels,aggregated_label,capstone_label
1409,"I simboli sono fortissimi, ma non sono niente ...",it,https://www.instagram.com/p/BNPVf_rAtpL/,1,,"[1, 1, 1, 1]",True,
4822,Al mare con Lush Conoscete i solari Lush? S...,it,http://www.mavie.it/al-mare-lush/,1,,"[1, 1]",True,
424,è caldissimo e non mi dimentico di prendermi c...,it,https://www.instagram.com/p/BHbbeiqAEiY/,1,,"[1, 1, 1, 1]",True,
3396,✿ Vlog - EVENTO LUSH PASQUA - Gelatine a form...,it,https://www.youtube.com/watch?v=o9Cgkk0yvt4,1,,"[1, 1]",True,
5169,#studio #vicandersonmakeup La Porte Space Tagg...,it,https://www.instagram.com/p/BRX6pz-hlXA/,1,,"[1, 1, 0, 1]",True,
6493,perché sarà online il video Sommersa dai Pacch...,it,https://www.instagram.com/p/BW7CxLPFrtF/,1,,"[1, 1, 1]",True,
2951,"siliconi, Ceteareth-25 (un emulsionante deriva...",it,http://ecocentrica.it/protezioni-solari-filtri...,1,,"[1, 1, 1]",True,
1353,"Dove, #RealHappiness! \n\nScopri tutti i prodo...",it,https://www.instagram.com/p/BMhSoRpgK_6/,1,,"[1, 1, 1, 1]",True,
3074,Sono arrivati in Italia i Collutori Solidi Lus...,it,https://www.facebook.com/251448664874526/posts...,1,,"[1, 1, 1, 1]",True,
2798,@lush_italia ieri ho fatto qualche danno Buo...,it,https://www.instagram.com/p/BPMoQBtFJkf/,1,,"[1, 1, 0, 0, 1, 1, 1, 1, 1]",True,


In [18]:
# Write each dataframe of the split to its own csv file (without the index)
csv_targets = [
    '../../Data/csv_for_mturk_eval/alessandro.csv',
    '../../Data/csv_for_mturk_eval/andrea.csv',
    '../../Data/csv_for_mturk_eval/ginevra.csv',
]

for idx, csv_path in enumerate(csv_targets):
    list_of_df[idx].to_csv(csv_path, index=False, encoding='utf-8')

In [19]:
# Sanity check: read one of the exported files back to verify that it loads
pd.read_csv('../../Data/csv_for_mturk_eval/alessandro.csv', encoding='utf-8')

Unnamed: 0,text,lang,link,mturker,model_decision,labels,aggregated_label,capstone_label
0,"I simboli sono fortissimi, ma non sono niente ...",it,https://www.instagram.com/p/BNPVf_rAtpL/,1,,"[1, 1, 1, 1]",True,
1,Al mare con Lush Conoscete i solari Lush? S...,it,http://www.mavie.it/al-mare-lush/,1,,"[1, 1]",True,
2,è caldissimo e non mi dimentico di prendermi c...,it,https://www.instagram.com/p/BHbbeiqAEiY/,1,,"[1, 1, 1, 1]",True,
3,✿ Vlog - EVENTO LUSH PASQUA - Gelatine a form...,it,https://www.youtube.com/watch?v=o9Cgkk0yvt4,1,,"[1, 1]",True,
4,#studio #vicandersonmakeup La Porte Space Tagg...,it,https://www.instagram.com/p/BRX6pz-hlXA/,1,,"[1, 1, 0, 1]",True,
5,perché sarà online il video Sommersa dai Pacch...,it,https://www.instagram.com/p/BW7CxLPFrtF/,1,,"[1, 1, 1]",True,
6,"siliconi, Ceteareth-25 (un emulsionante deriva...",it,http://ecocentrica.it/protezioni-solari-filtri...,1,,"[1, 1, 1]",True,
7,"Dove, #RealHappiness! \r\n\r\nScopri tutti i p...",it,https://www.instagram.com/p/BMhSoRpgK_6/,1,,"[1, 1, 1, 1]",True,
8,Sono arrivati in Italia i Collutori Solidi Lus...,it,https://www.facebook.com/251448664874526/posts...,1,,"[1, 1, 1, 1]",True,
9,@lush_italia ieri ho fatto qualche danno Buo...,it,https://www.instagram.com/p/BPMoQBtFJkf/,1,,"[1, 1, 0, 0, 1, 1, 1, 1, 1]",True,
