In [1]:
import pandas as pd
import numpy as np
import codecs, json
from collections import Counter

In [2]:
# Selected brands. The order matters: later cells address the loaded data by
# position in this list.
brand_ids = [
    '13578',  # Lush
    '12003',  # La Roche-Posay
    '14680',  # Dove
    '12857',  # Vichy
]

In [3]:
# Output schema: the columns kept in the final dataset, in export order
columns = [
    'brand_name', 'text', 'lang', 'link',
    'mturker', 'model_decision', 'labels',
    'aggregated_label', 'capstone_label',
]

In [4]:
# Load, for every selected brand, the content of <id>_data.json and the
# 'brand_name' field of <id>_metadata.json.
data_dir = '../../Data/CSE_20180215/'

dict_list = []         # parsed <id>_data.json contents, in brand_ids order
brand_names_list = []  # brand names, aligned position-by-position with dict_list

for brand_id in brand_ids:
    brand_prefix = data_dir + str(brand_id)

    # Decode as UTF-8 when opening the file (both files, consistently).
    # json.load() must not be given an `encoding` keyword: it was ignored in
    # Python 3 and removed in Python 3.9, where passing it raises TypeError.
    with codecs.open(brand_prefix + '_data.json', 'r', 'utf-8') as f_data:
        dict_list.append(json.load(f_data))
    with codecs.open(brand_prefix + '_metadata.json', 'r', 'utf-8') as f_metadata:
        brand_names_list.append(json.load(f_metadata)['brand_name'])

In [5]:
# Build one dataframe per brand from the loaded dictionaries and tag every
# row with the brand's name. Positions follow the order of brand_ids.
def _brand_frame(position):
    """Return the dataframe for the brand stored at `position`, with a 'brand_name' column."""
    frame = pd.DataFrame.from_dict(dict_list[position])
    frame['brand_name'] = brand_names_list[position]
    return frame


df_lush = _brand_frame(0)
df_roche = _brand_frame(1)
df_dove = _brand_frame(2)
df_vichy = _brand_frame(3)

In [6]:
# Stack the per-brand dataframes into a single one. The original row labels
# of each brand are kept, so the resulting index is not guaranteed to be unique.
df_all_brands = pd.concat([df_lush, df_roche, df_dove, df_vichy])

# Keep only the rows whose mturker flag equals 1. The explicit .copy() makes
# df_tot an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df_tot = df_all_brands[df_all_brands.mturker == 1].copy()

print(df_tot.shape)
df_tot.head(5)

(14167, 7)


Unnamed: 0,labels,lang,link,model_decision,mturker,text,brand_name
81,"[1, 0, 1, 0, 0]",en,https://www.pinterest.com/pin/132504414013213465/,,1,The Elysian Fields..... Lush,Lush
84,"[1, 0, 1, 0, 0, 0]",en,http://www.savvyspice.com/2015/08/back-from-ba...,,1,Back from Bali Founder Spills Her Secrets to S...,Lush
277,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",en,http://www.100layercake.com/blog/2015/08/03/vi...,,1,Rancho Las Lomas. Obvs they had to change cour...,Lush
278,"[1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, ...",en,http://feedproxy.google.com/~r/blogspot/qwWIw/...,,1,Grey & White Wearing: Nordstrom Lush tunic | T...,Lush
281,"[1, 1, 1, 1, 0, 1, 1, 1, 1]",en,https://www.youtube.com/watch?v=3ZDlFYkUJj8,,1,"Fall Haul: H&M, Forever21, B&BW, Lush & More! ...",Lush


In [7]:
def label_unpacking(df):
    """Collapse each row's list of 0/1 votes into a single boolean label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one iterable of votes per row. The votes are read from
        the column named 'labels' when it exists; otherwise from the first
        column (the original, position-based behaviour).

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row: True when the 1s strictly
        outnumber the 0s, False otherwise (ties and empty vote lists included).
    """
    # Look the column up by name so the result does not depend on column
    # order; fall back to the first column for frames without a 'labels' column.
    if 'labels' in df.columns:
        votes_per_row = df['labels'].values
    else:
        votes_per_row = df.iloc[:, 0].values

    aggregated = []
    for votes in votes_per_row:
        tally = Counter(votes)
        # Ties resolve to False: prefer false negatives to false positives
        aggregated.append(tally[1] > tally[0])

    # Explicit dtype keeps the result boolean even when df has no rows
    return np.array(aggregated, dtype=bool)

In [8]:
# Reduce each row's list of votes to a single boolean (True only when the 1s
# strictly outnumber the 0s) and store it in a new 'aggregated_label' column
aggregated_label_arr = label_unpacking(df_tot)
df_tot['aggregated_label'] = aggregated_label_arr

In [9]:
# Empty result frame that shares df_tot's column names
result_columns = list(df_tot.columns)
df_res = pd.DataFrame(columns=result_columns)
df_res

Unnamed: 0,labels,lang,link,model_decision,mturker,text,brand_name,aggregated_label


In [10]:
languages = ['it', 'es', 'fr']
n_per_class = 30    # rows drawn for each (language, label) pair
sampling_seed = 42  # fixed so that re-running the notebook draws the same rows

# Draw a balanced sample: for every language, n_per_class rows labelled True
# followed by n_per_class rows labelled False. The pieces are collected in a
# list and concatenated once (DataFrame.append was removed in pandas 2.0), and
# df_res is rebuilt from scratch so re-running this cell does not grow it.
sampled_parts = []
for lang in languages:
    for label in (True, False):
        candidates = df_tot[(df_tot.lang == lang) & (df_tot.aggregated_label == label)]
        sampled_parts.append(
            candidates.sample(n=n_per_class, replace=False, random_state=sampling_seed)
        )

df_res = pd.concat(sampled_parts)

In [11]:
# Inspect the sampled rows
df_res

Unnamed: 0,labels,lang,link,model_decision,mturker,text,brand_name,aggregated_label
2827,"[1, 0, 1, 0, 1, 1, 1, 1, 1, 1]",it,https://www.instagram.com/p/BPXGFwNjIbC/,,1,pronto a sciogliersi per amore? Cupido ha scoc...,Lush,True
1193,"[1, 1, 1, 1]",it,https://www.twitter.com/Laura_theoldnow/status...,,1,Cicaplast Baume B5 SPF 50 di #LaRochePosay è l...,La Roche-Posay,True
4702,"[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, ...",it,https://www.instagram.com/p/BXxQgDYlLta/,,1,la cura delle ciglia naturali: con questo spe...,Vichy,True
1268,"[1, 1, 1, 0]",it,https://www.youtube.com/watch?v=XItKOoQWY2Q,,1,Dove son finita Come promesso sono tornata!\n ...,Dove,True
4543,"[1, 1, 1, 1]",it,https://www.instagram.com/p/BQU0tPdjAci/,,1,al rientro. Più comoda e pratica la nuova pomp...,La Roche-Posay,True
5588,"[1, 1, 1]",it,https://www.instagram.com/p/BWMhQu3FsZo/,,1,#giveaway \nQuesto giovedì sarà più allegro! \...,Lush,True
3172,"[1, 1]",it,https://blog.cliomakeup.com/2017/03/arco-di-cu...,,1,"da averle morbide, idratate e prive di pellici...",Lush,True
2563,"[1, 1, 1]",it,https://www.vanityfair.it/beauty/viso-e-corpo/...,,1,Sun Pharma. Mentre per mantenere la pelle semp...,Vichy,True
3591,"[1, 1, 1, 1]",it,https://www.instagram.com/p/BTdwzLnhUZJ/,,1,buonanotte della mamma prima di andare a dormi...,Lush,True
3236,"[1, 1, 1, 1, 1]",it,https://www.instagram.com/p/BSWqwr-BQAt/,,1,MY BEAUTY PROTOCOLE \n#pink #girlsnightout #h...,Lush,True


In [12]:
# Class balance check: number of sampled rows per aggregated label
Counter(df_res.aggregated_label)

Counter({False: 90, True: 90})

In [13]:
# Language balance check: number of sampled rows per language
Counter(df_res.lang)

Counter({'es': 60, 'fr': 60, 'it': 60})

In [14]:
# Add a 'capstone_label' column with every value set to None
# (presumably to be filled in later by the annotators -- confirm)
df_res['capstone_label'] = None

In [15]:
# Divide the sampled rows into 3 disjoint sets, each balanced per language and
# per aggregated label.
n_splits = 3
rows_per_group = 10  # rows of each (language, label) pair placed in every split
split_seed = 42      # fixed so that the split is reproducible

split_parts = [[] for _ in range(n_splits)]

for lang in languages:
    for label in (True, False):
        group = df_res[(df_res.lang == lang) & (df_res.aggregated_label == label)]
        # Draw the rows for all splits in one go (without replacement) and cut
        # consecutive slices. Sampling each split independently from the same
        # pool would let the same row land in several splits and leave others
        # unused. Raises ValueError if the group has too few rows.
        shuffled = group.sample(n=n_splits * rows_per_group, replace=False,
                                random_state=split_seed)
        for split_idx in range(n_splits):
            start = split_idx * rows_per_group
            # Positional slicing: index labels may repeat across brands
            split_parts[split_idx].append(shuffled.iloc[start:start + rows_per_group])

# One concat per split (DataFrame.append was removed in pandas 2.0), restricted
# to the output columns
list_of_df = [pd.concat(parts)[columns] for parts in split_parts]

In [16]:
# Number of splits produced
len(list_of_df)

3

In [17]:
# Display the first split as an example
list_of_df[0]

Unnamed: 0,brand_name,text,lang,link,mturker,model_decision,labels,aggregated_label,capstone_label
3172,Lush,"da averle morbide, idratate e prive di pellici...",it,https://blog.cliomakeup.com/2017/03/arco-di-cu...,1,,"[1, 1]",True,
1268,Dove,Dove son finita Come promesso sono tornata!\n ...,it,https://www.youtube.com/watch?v=XItKOoQWY2Q,1,,"[1, 1, 1, 0]",True,
1497,Vichy,"un momento critico per la nostra pelle, ma con...",it,https://www.instagram.com/p/BSybl-nAlvG/,1,,"[1, 1, 1, 1]",True,
3236,Lush,MY BEAUTY PROTOCOLE \n#pink #girlsnightout #h...,it,https://www.instagram.com/p/BSWqwr-BQAt/,1,,"[1, 1, 1, 1, 1]",True,
3814,La Roche-Posay,"@avon.italia , Gimme Brow di @benefitcosmetics...",it,https://www.facebook.com/1555633388011833/post...,1,,"[1, 1, 1, 1]",True,
3952,La Roche-Posay,"CI 75470, con un notevole grado di tossicità e...",it,http://ecocentrica.it/cipria-attenzione-ai-sil...,1,,"[1, 1, 1, 1]",True,
4702,Vichy,la cura delle ciglia naturali: con questo spe...,it,https://www.instagram.com/p/BXxQgDYlLta/,1,,"[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, ...",True,
3061,Vichy,"tre prodotti che, ovviamente, ho subito acquis...",it,http://www.mammeaspillo.it/stare-bene/estate-b...,1,,"[1, 1, 1, 1, 1, 1, 1]",True,
2563,Vichy,Sun Pharma. Mentre per mantenere la pelle semp...,it,https://www.vanityfair.it/beauty/viso-e-corpo/...,1,,"[1, 1, 1]",True,
1296,La Roche-Posay,Grazie a #LaRochePosay per Respectissime Multi...,it,https://www.twitter.com/KiaScricc/statuses/780...,1,,"[1, 1, 1, 1]",True,


In [18]:
# Write each split to its own CSV file (UTF-8, without the index column)
csv_targets = [
    '../../Data/csv_for_mturk_eval/alessandro.csv',
    '../../Data/csv_for_mturk_eval/andrea.csv',
    '../../Data/csv_for_mturk_eval/ginevra.csv',
]
for split_position, csv_path in enumerate(csv_targets):
    list_of_df[split_position].to_csv(csv_path, index=False, encoding='utf-8')

In [19]:
# Sanity check: read one of the exported files back as UTF-8 and display it
pd.read_csv('../../Data/csv_for_mturk_eval/alessandro.csv', encoding='utf-8')

Unnamed: 0,brand_name,text,lang,link,mturker,model_decision,labels,aggregated_label,capstone_label
0,Lush,"da averle morbide, idratate e prive di pellici...",it,https://blog.cliomakeup.com/2017/03/arco-di-cu...,1,,"[1, 1]",True,
1,Dove,Dove son finita Come promesso sono tornata!\r\...,it,https://www.youtube.com/watch?v=XItKOoQWY2Q,1,,"[1, 1, 1, 0]",True,
2,Vichy,"un momento critico per la nostra pelle, ma con...",it,https://www.instagram.com/p/BSybl-nAlvG/,1,,"[1, 1, 1, 1]",True,
3,Lush,MY BEAUTY PROTOCOLE \r\n#pink #girlsnightout ...,it,https://www.instagram.com/p/BSWqwr-BQAt/,1,,"[1, 1, 1, 1, 1]",True,
4,La Roche-Posay,"@avon.italia , Gimme Brow di @benefitcosmetics...",it,https://www.facebook.com/1555633388011833/post...,1,,"[1, 1, 1, 1]",True,
5,La Roche-Posay,"CI 75470, con un notevole grado di tossicità e...",it,http://ecocentrica.it/cipria-attenzione-ai-sil...,1,,"[1, 1, 1, 1]",True,
6,Vichy,la cura delle ciglia naturali: con questo spe...,it,https://www.instagram.com/p/BXxQgDYlLta/,1,,"[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, ...",True,
7,Vichy,"tre prodotti che, ovviamente, ho subito acquis...",it,http://www.mammeaspillo.it/stare-bene/estate-b...,1,,"[1, 1, 1, 1, 1, 1, 1]",True,
8,Vichy,Sun Pharma. Mentre per mantenere la pelle semp...,it,https://www.vanityfair.it/beauty/viso-e-corpo/...,1,,"[1, 1, 1]",True,
9,La Roche-Posay,Grazie a #LaRochePosay per Respectissime Multi...,it,https://www.twitter.com/KiaScricc/statuses/780...,1,,"[1, 1, 1, 1]",True,
