In [1]:
import pandas as pd
import numpy as np
import codecs, json
from collections import Counter

In [2]:
# Schema of the per-brand summary table. Columns are grouped by the subset of
# posts they describe: every post first, then one group per language code.
columns = (
    # Brand identity
    ['brand_id', 'brand_name']
    # All posts
    + ['number_total_posts', 'imbalance_ratio_on_total_posts',
       'number_instagram_on_total_posts', 'number_twitter_on_total_posts',
       'number_facebook_on_total_posts', 'number_youtube_on_total_posts',
       'number_pinterest_on_total_posts', 'number_others_on_total_posts']
    # Posts with lang == 'en'
    + ['number_en_posts', 'imbalance_ratio_on_en_posts',
       'number_instagram_on_en_posts', 'number_twitter_on_en_posts',
       'number_facebook_on_en_posts', 'number_youtube_on_en_posts',
       'number_pinterest_on_en_posts', 'number_others_on_en_posts']
    # Posts with lang == 'it'
    + ['number_it_posts', 'imbalance_ratio_on_it_posts',
       'number_instagram_on_it_posts', 'number_twitter_on_it_posts',
       'number_facebook_on_it_posts', 'number_youtube_on_it_posts',
       'number_pinterest_on_it_posts', 'number_others_on_it_posts']
    # Posts with lang == 'fr'
    + ['number_fr_posts', 'imbalance_ratio_on_fr_posts',
       'number_instagram_on_fr_posts', 'number_twitter_on_fr_posts',
       'number_facebook_on_fr_posts', 'number_youtube_on_fr_posts',
       'number_pinterest_on_fr_posts', 'number_others_on_fr_posts']
    # Posts with lang == 'es'
    + ['number_es_posts', 'imbalance_ratio_on_es_posts',
       'number_instagram_on_es_posts', 'number_twitter_on_es_posts',
       'number_facebook_on_es_posts', 'number_youtube_on_es_posts',
       'number_pinterest_on_es_posts', 'number_others_on_es_posts']
    # Label provenance and model-decision flag
    + ['number_labels_from_mturk', 'number_labels_from_tribe',
       'model_decision_always_empty']
)

# Empty frame that already carries the full schema; displayed as the cell output
df_res = pd.DataFrame(columns=columns)
df_res

Unnamed: 0,brand_id,brand_name,number_total_posts,imbalance_ratio_on_total_posts,number_instagram_on_total_posts,number_twitter_on_total_posts,number_facebook_on_total_posts,number_youtube_on_total_posts,number_pinterest_on_total_posts,number_others_on_total_posts,...,imbalance_ratio_on_es_posts,number_instagram_on_es_posts,number_twitter_on_es_posts,number_facebook_on_es_posts,number_youtube_on_es_posts,number_pinterest_on_es_posts,number_others_on_es_posts,number_labels_from_mturk,number_labels_from_tribe,model_decision_always_empty


In [3]:
def label_unpacking(df):
    """Collapse each row's list of labels into a single boolean.

    The label lists are read from the first column of ``df``. A row becomes
    ``True`` only when label ``1`` occurs strictly more often than label
    ``0``; ties (including lists that contain neither value) become
    ``False``, i.e. false negatives are preferred to false positives.

    Returns:
        A NumPy array with one entry per row of ``df``.
    """
    # Positional access on purpose: the original author noted that
    # df.labels.values did not work here.
    per_row_labels = df.iloc[:, 0].values

    def positive_majority(votes):
        tally = Counter(votes)
        return tally[1] > tally[0]

    return np.array([positive_majority(votes) for votes in per_row_labels])

In [4]:
def imbalance_by_df(df):
    """Measure how unbalanced the boolean labels derived from ``df`` are.

    The labels come from ``label_unpacking(df)``. The result is
    ``1 - minority / majority``: 0 when both classes are equally frequent,
    1 when only one class is present.

    Returns:
        The ratio described above, or ``0`` when ``df`` yields no labels.
    """
    tally = Counter(label_unpacking(df))
    positives, negatives = tally[True], tally[False]

    # No labels at all: report 0 instead of dividing by zero
    if positives == 0 and negatives == 0:
        return 0

    minority = min(positives, negatives)
    majority = max(positives, negatives)
    return 1 - (minority / majority)

In [5]:
def remove_links(corpus, link_start):
    """Delete every occurrence of ``link_start`` from each string in ``corpus``.

    Args:
        corpus: iterable of strings.
        link_start: literal substring to remove (not a regular expression).

    Returns:
        A NumPy array holding the cleaned strings, in the same order.
    """
    # str.replace(old, new): an empty replacement drops the substring
    return np.array([text.replace(link_start, '') for text in corpus])

In [6]:
def remove_regexp(corpus, regexp_str):
    """Strip every match of a regular expression from each string in ``corpus``.

    Args:
        corpus: iterable of strings.
        regexp_str: pattern, in ``re`` syntax, whose matches are removed.

    Returns:
        A NumPy array holding the cleaned strings, in the same order.
    """
    import re

    # Compile once and reuse for the whole corpus
    pattern = re.compile(regexp_str)

    # re.sub(replacement, string): an empty replacement deletes each match
    return np.array([pattern.sub('', text) for text in corpus])

In [7]:
def count_social_by_df(df):
    """Count the rows of ``df`` per social network, based on the ``link`` column.

    Each link is reduced to a bare site name by removing the scheme /
    ``www.`` prefix and everything from ``.com`` onwards. The reduced names
    are then tallied; only exact matches ('instagram', 'twitter', ...) are
    attributed to a network, everything else falls into ``others``.

    Args:
        df: DataFrame with a ``link`` column of URL strings.

    Returns:
        Tuple ``(instagram, twitter, facebook, youtube, pinterest, others)``
        of row counts, where ``others`` is the number of rows not attributed
        to any of the five networks.
    """
    corpus = df.link.values.copy()

    # The 'www.' variants go first so that they are removed as a whole
    # instead of leaving a dangling 'www.' after the bare scheme is stripped.
    for prefix in ('http://www.', 'https://www.', 'http://', 'https://'):
        corpus = remove_links(corpus, prefix)

    # Raw string with an escaped dot: matches a literal '.com' and whatever
    # follows it. '\S*' (rather than '\S+') also strips a bare 'site.com'
    # link that has no path, which previously ended up in 'others'.
    corpus = remove_regexp(corpus, r'\.com\S*')

    counter = Counter(corpus)
    ig = counter['instagram']
    tw = counter['twitter']
    fb = counter['facebook']
    yt = counter['youtube']
    pin = counter['pinterest']
    others = df.shape[0] - (ig + tw + fb + yt + pin)

    return ig, tw, fb, yt, pin, others

In [8]:
from pathlib import Path

# Folder holding the per-brand JSON exports and the range of brand ids to probe
DATA_DIR = Path("../../Data/CSE_20180215")
BRAND_IDS = range(8009, 19151)
# Language codes that get their own group of columns in the summary
LANGUAGES = ('en', 'it', 'fr', 'es')


def _load_json(path):
    """Parse a JSON file, always decoding it as UTF-8."""
    # The encoding belongs to open(); json.load() no longer accepts an
    # `encoding` keyword (removed in Python 3.9).
    with open(str(path), 'r', encoding='utf-8') as f:
        return json.load(f)


def _scope_stats(posts, scope):
    """Build the eight ``*_<scope>_posts`` summary values for one subset of posts."""
    ig, tw, fb, yt, pin, others = count_social_by_df(posts)
    return {
        'number_{}_posts'.format(scope): posts.shape[0],
        'imbalance_ratio_on_{}_posts'.format(scope): imbalance_by_df(posts),
        'number_instagram_on_{}_posts'.format(scope): ig,
        'number_twitter_on_{}_posts'.format(scope): tw,
        'number_facebook_on_{}_posts'.format(scope): fb,
        'number_youtube_on_{}_posts'.format(scope): yt,
        'number_pinterest_on_{}_posts'.format(scope): pin,
        'number_others_on_{}_posts'.format(scope): others,
    }


def _summarise_brand(df, metadata_dict):
    """Return one summary row (a dict keyed by column name) for a single brand."""
    row = {
        'brand_id': metadata_dict['brand_id'],
        'brand_name': metadata_dict['brand_name'],
    }

    # Statistics over every post, then over each language subset
    row.update(_scope_stats(df, 'total'))
    for lang in LANGUAGES:
        row.update(_scope_stats(df[df.lang == lang], lang))

    row['number_labels_from_mturk'] = df[df.mturker == 1].shape[0]
    row['number_labels_from_tribe'] = df[df.mturker != 1].shape[0]

    # Counter has one key if there is only 'None', otherwise has more
    row['model_decision_always_empty'] = len(Counter(df.model_decision.values)) <= 1

    return row


# One dict per brand; the frame is built once at the end instead of being
# grown row by row (DataFrame.append was removed in pandas 2.0).
records = []

# Loop over all the file ids provided
for brand_id in BRAND_IDS:

    # Ids without a data file are skipped
    data_path = DATA_DIR / '{}_data.json'.format(brand_id)
    if not data_path.is_file():
        continue

    # Open json files
    tweets_dict_list = _load_json(data_path)
    metadata_dict = _load_json(DATA_DIR / '{}_metadata.json'.format(brand_id))

    # Import as dataframe
    df = pd.DataFrame.from_dict(tweets_dict_list)

    records.append(_summarise_brand(df, metadata_dict))

# Rebuilt from scratch on every run, so re-executing the cell does not duplicate rows
df_res = pd.DataFrame(records, columns=columns)

In [9]:
# Display the per-brand summary table (one row per brand)
df_res

Unnamed: 0,brand_id,brand_name,number_total_posts,imbalance_ratio_on_total_posts,number_instagram_on_total_posts,number_twitter_on_total_posts,number_facebook_on_total_posts,number_youtube_on_total_posts,number_pinterest_on_total_posts,number_others_on_total_posts,...,imbalance_ratio_on_es_posts,number_instagram_on_es_posts,number_twitter_on_es_posts,number_facebook_on_es_posts,number_youtube_on_es_posts,number_pinterest_on_es_posts,number_others_on_es_posts,number_labels_from_mturk,number_labels_from_tribe,model_decision_always_empty
0,8009,Caudalie,1188,0.992366,587,171,62,169,11,188,...,1.000000,63,4,0,2,0,2,468,720,True
1,8033,Mary Kay,3917,0.947609,1624,491,728,379,148,547,...,0.994633,187,81,147,87,2,58,3342,575,True
2,8499,Too Faced,1299,0.997685,734,29,58,427,13,38,...,0.983871,43,0,7,13,0,0,368,931,True
3,9305,Avon,4107,0.553873,1401,761,443,357,147,998,...,0.935223,147,25,38,13,0,40,1690,2417,True
4,9337,Kérastase,821,0.997558,387,44,88,91,9,202,...,1.000000,35,1,13,4,0,12,133,688,True
5,9932,Givenchy,355,1.000000,74,19,23,59,11,169,...,1.000000,0,0,2,0,0,2,259,96,True
6,9996,Lancôme,1418,0.997171,728,46,66,422,12,144,...,1.000000,27,0,0,10,0,1,303,1115,True
7,10118,NARS,1391,0.994942,686,22,43,451,18,171,...,1.000000,49,0,1,10,0,7,300,1091,True
8,10195,Neutrogena,716,0.997199,283,50,55,113,5,210,...,1.000000,39,0,14,8,0,6,231,485,True
9,10918,Anastasia Beverly Hills,1245,0.943124,778,2,39,397,3,26,...,1.000000,23,0,0,5,0,0,58,1187,True


In [10]:
# Output df to Excel file.
# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('../../Data/CSE_20180215_data_info.xlsx') as writer:
    df_res.to_excel(writer, sheet_name='Sheet1')