# Import libraries

In [1]:
import codecs, json
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

# Function to import json files

In [2]:
def brand_dataframe_to_append(brand_id, data_dir='../../Data/CSE_20180215/',
                              languages=('en', 'it', 'fr', 'es')):
    """Load one brand's JSON export and return its rows in the kept languages.

    Reads ``<brand_id>_data.json`` (a list of records) and
    ``<brand_id>_metadata.json`` (a dict providing ``brand_id`` and
    ``brand_name``) from ``data_dir``, tags every record with the brand id
    and name taken from the metadata, and keeps only the records whose
    ``lang`` value is in ``languages``.

    Parameters
    ----------
    brand_id : int or str
        Identifier used as the file-name prefix.
    data_dir : str or pathlib.Path, optional
        Directory containing the JSON files. Defaults to the location the
        notebook has always read from.
    languages : iterable of str, optional
        Values of the ``lang`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``brand_id, brand_name, text, lang, model_decision, labels,
        mturker, link`` in that order. The row index of the unfiltered data
        is preserved, so it is not contiguous after filtering.

    Raises
    ------
    OSError
        If either JSON file cannot be opened.
    KeyError
        If the metadata lacks ``brand_id``/``brand_name`` or the data lacks
        one of the returned columns.
    """
    brand_id_str = str(brand_id)
    data_dir = Path(data_dir)

    # Decode explicitly as UTF-8 when opening both files. json.load() is
    # called without `encoding`: that argument was ignored on Python 3 and
    # removed in 3.9, where passing it raises TypeError.
    with open(str(data_dir / (brand_id_str + '_data.json')), 'r', encoding='utf-8') as f_data:
        tweets_dict_list = json.load(f_data)
    with open(str(data_dir / (brand_id_str + '_metadata.json')), 'r', encoding='utf-8') as f_metadata:
        metadata_dict = json.load(f_metadata)

    # Import as dataframe
    df = pd.DataFrame(tweets_dict_list)

    # Add brand_id and brand_name columns (a scalar is broadcast to every row)
    df['brand_id'] = metadata_dict['brand_id']
    df['brand_name'] = metadata_dict['brand_name']

    # Keep only the requested languages and return this column layout
    output_columns = ['brand_id', 'brand_name', 'text', 'lang',
                      'model_decision', 'labels', 'mturker', 'link']
    return df.loc[df['lang'].isin(list(languages)), output_columns]

# Combine all the brand files into a single dataframe

In [3]:
# Load brand 8009 on its own; its frame seeds the combined result.
first_df = brand_dataframe_to_append(8009)
# Work on a copy so `first_df` stays untouched while `df_res` is extended.
df_res = first_df.copy()

In [4]:
# Gather the frame of every brand id that has a data file, starting from the
# frame already held in `df_res`. The frames are concatenated once at the
# end: calling pd.concat inside the loop re-copies the whole accumulated
# result on every iteration.
brand_frames = [df_res]

# Loop over all the file ids provided
for brand_id in range(8033, 19151):

    # Check existence of file (ids in the range without a file are skipped)
    try_file = Path("../../Data/CSE_20180215/" + str(brand_id) + "_data.json")

    if try_file.is_file():
        # file exists
        brand_frames.append(brand_dataframe_to_append(brand_id))

# Each brand keeps its own row index, so index values repeat across brands.
df_res = pd.concat(brand_frames)

# Output the result

In [5]:
# Preview the first five rows of the combined dataframe (head() defaults to 5).
df_res.head()

Unnamed: 0,brand_id,brand_name,text,lang,model_decision,labels,mturker,link
0,8009,Caudalie,Favorites Summer Crushes - $30This is another ...,en,,"[1, 1, 1, 1]",1,http://www.thebeautyisle.com/2015/01/sephora-u...
1,8009,Caudalie,.@CaudalieUSA’s new C15 line takes charge of o...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/Sephora/statuses/46565...
2,8009,Caudalie,"I love a simple, cheap and easy to find produc...",en,,"[1, 1]",1,http://www.la-fille-en-rose.com/2012/10/dry-sk...
3,8009,Caudalie,Yay it's @CaudalieUSA turn #sephora #weareseph...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/CaitlinAPearl/statuses...
4,8009,Caudalie,@LadyAndTheLook I started using a face oil fro...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/stylecontext/statuses/...


In [6]:
# Preview the last five rows of the combined dataframe (tail() defaults to 5).
df_res.tail()

Unnamed: 0,brand_id,brand_name,text,lang,model_decision,labels,mturker,link
17,19150,Embelleze,Spring is in the air! #novexhaircare #embellez...,en,,"[1, 1, 1, 1]",1,https://www.instagram.com/p/BRIo0Zwlhqb/
43,19150,Embelleze,Happy International Women's Day! #novexhaircar...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/Novex_haircare/statuse...
51,19150,Embelleze,Rosto de neve shared Embelleze's live video.,en,,"[1, 1, 1]",1,https://www.facebook.com/286138848152798/posts...
54,19150,Embelleze,Good Morning! It's Friday! Try our coconut hai...,en,,"[1, 1, 1]",1,https://www.instagram.com/p/BSTNmShFL7A/
70,19150,Embelleze,http://womenspleasuresandtreasures.blogspot.pt...,en,,"[1, 1, 1, 1]",1,https://www.instagram.com/p/BTWzYfigMyu/


In [7]:
# (rows, columns) of the combined dataframe
df_res.shape

(121939, 8)

In [8]:
# Number of rows per language code
Counter(df_res['lang'])

Counter({'en': 98616, 'es': 9858, 'fr': 7272, 'it': 6193})

In [9]:
# Number of rows per value of the `mturker` column
Counter(df_res['mturker'])

Counter({0: 67934, 1: 54005})

In [10]:
# Output csv file: UTF-8 encoded, without the row index (index=False).
# Optional: add quoting=csv.QUOTE_NONNUMERIC (needs `import csv`) to quote
# every non-numeric field instead of using the default quoting.
df_res.to_csv("../../Data/CSE_20180215/tribe_new_brand_data_en_es_fr_it.csv", index=False, encoding='utf-8')

In [11]:
# Check if it is readable: load the CSV written above and preview it.
# The file was written with index=False, so read_csv assigns a fresh
# 0..n-1 index.
# NOTE(review): the list-valued `labels` column presumably comes back as
# strings such as "[1, 1, 1, 1]" - confirm and parse if lists are needed.
df = pd.read_csv("../../Data/CSE_20180215/tribe_new_brand_data_en_es_fr_it.csv")
df.head(5)

Unnamed: 0,brand_id,brand_name,text,lang,model_decision,labels,mturker,link
0,8009,Caudalie,Favorites Summer Crushes - $30This is another ...,en,,"[1, 1, 1, 1]",1,http://www.thebeautyisle.com/2015/01/sephora-u...
1,8009,Caudalie,.@CaudalieUSA’s new C15 line takes charge of o...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/Sephora/statuses/46565...
2,8009,Caudalie,"I love a simple, cheap and easy to find produc...",en,,"[1, 1]",1,http://www.la-fille-en-rose.com/2012/10/dry-sk...
3,8009,Caudalie,Yay it's @CaudalieUSA turn #sephora #weareseph...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/CaitlinAPearl/statuses...
4,8009,Caudalie,@LadyAndTheLook I started using a face oil fro...,en,,"[1, 1, 1, 1]",1,https://www.twitter.com/stylecontext/statuses/...
