### Imports

In [48]:
import pandas as pd
import preprocessor as p
from emoji import demojize
import capir_transfronteriza2_2023.data.load as load

import warnings
warnings.filterwarnings("ignore")

### Set data directory

In [46]:
# Location of the processed datasets, taken from the project's `load` module;
# it is used in this notebook as the directory prefix when building CSV paths
data_processed = load.data_processed

### Read data

In [49]:
# Load the full tweets table from the processed-data directory
tweets_csv = f'{data_processed}/tweets.csv'
df = pd.read_csv(tweets_csv)

# NOTE(review): in the recorded output `id` is shown as a float and there is an
# `Unnamed: 0` column — presumably a saved index and IDs parsed as float64;
# confirm before relying on `id` as an exact identifier

# Report the table size as (rows, columns)
print(df.shape)

# Preview the first three rows
df.head(3)

(200827, 62)


Unnamed: 0,query,id,timestamp_utc,local_time,user_screen_name,text,possibly_sensitive,retweet_count,like_count,reply_count,...,media_files,media_types,media_alt_texts,mentioned_names,mentioned_ids,hashtags,intervention_type,intervention_text,intervention_url,country
0,from:TommyZambranoM,1.638175e+18,1679406309,2023-03-21T13:45:09,TommyZambranoM,Los Nacionalistas para lograr la renovación de...,0.0,30.0,117.0,58.0,...,1638174913906327552_n59p6Id5p1YViXtP.mp4,video,,pnh_oficial,201589327.0,librenuncamas,,,,Honduras
1,from:TommyZambranoM,1.6378e+18,1679316989,2023-03-20T12:56:29,TommyZambranoM,#LasÑangaradas de la Semana:\n1) Sin comer tie...,0.0,78.0,292.0,178.0,...,1637800275900604418_Frqiq2uWwAAq6ZM.jpg,photo,,,,lasñangaradas,,,,Honduras
2,from:TommyZambranoM,1.637795e+18,1679315747,2023-03-20T12:35:47,TommyZambranoM,"Mira el futuro con Fe , llénate de esperanza y...",,12.0,61.0,14.0,...,,,,,,tommyzambrano,,,,Honduras


### Process data

In [16]:
# Number of tweets per country. value_counts() leaves out missing values by
# default, so rows without a country label are not represented in the result
df['country'].value_counts()

country
Colombia    57741
Honduras    56212
Brasil      52155
Ecuador     34717
Name: count, dtype: int64

In [39]:
# Distinct values of the country column, in order of first appearance
country_list = df['country'].unique().tolist()

# Keep only real labels: anything that is not a string (e.g. a missing value) is skipped
countries = [label for label in country_list if isinstance(label, str)]

countries

['Honduras', 'Brasil', 'Ecuador', 'Colombia']

In [50]:
def clean_tweet_text(texts):
    """Return a cleaned version of a Series of raw tweet texts.

    The passes run in this order:
      1. remove URLs (``p.clean`` with the URL option),
      2. tokenize mentions and hashtags (``p.tokenize``),
      3. replace emojis with their descriptions (``demojize``).

    Parameters
    ----------
    texts : pandas.Series
        Raw tweet texts; every value must be a string.

    Returns
    -------
    pandas.Series
        Cleaned texts, aligned on the same index as ``texts``.
    """
    # The preprocessor options are set through a module-level call, so they
    # have to be (re)selected right before each pass that depends on them
    p.set_options(p.OPT.URL)
    without_urls = texts.apply(p.clean)

    p.set_options(p.OPT.MENTION, p.OPT.HASHTAG)
    tokenized = without_urls.apply(p.tokenize)

    return tokenized.apply(demojize)


for country_name in countries:

    # Build the per-country table as a new frame instead of mutating a filtered
    # slice of `df` in place (which is chained assignment and only "worked"
    # silently because warnings are suppressed in this notebook)
    country_tweets = (
        # Only the columns that end up in the output file are carried along
        df.loc[df['country'] == country_name, ['user_screen_name', 'country', 'text']]
        # Rows without text cannot be cleaned (presumably p.clean expects a
        # str — TODO confirm), so they are dropped up front
        .dropna(subset=['text'])
        # Keep the first occurrence of each distinct tweet text
        .drop_duplicates(subset='text', keep='first')
        # Add the cleaned text next to the original one
        .assign(text_clean=lambda frame: clean_tweet_text(frame['text']))
    )

    # Print dataframe shape
    print(f'{country_name}: {country_tweets.shape}')

    # Save dataframe as 'csv' file, named after the country in lower case
    country_tweets.to_csv(f'{data_processed}/{country_name.lower()}.csv', index=False)


Honduras: (55733, 4)
Brasil: (47697, 4)
Ecuador: (34693, 4)
Colombia: (57477, 4)
