In [1]:
import pandas as pd
# Load the raw election tweets; `id` and `user_id` are read with object dtype.
df_tweets = pd.read_csv(
    "data/tweets_elections_raw.csv",
    sep=";",
    encoding="utf-8",
    quotechar='"',
    dtype={"id": object, "user_id": object},
)

# Tag every row with its origin, keep only rows whose language is "es",
# align the column names with the WhatsApp frame and keep the shared columns.
df_tweets = (
    df_tweets
    .assign(source="twitter")
    .loc[lambda frame: frame["language"] == "es"]
    .rename(columns={"tweet": "text", "cluster": "candidato"})
    .loc[:, ["user_id", "date", "text", "candidato", "source"]]
    .reset_index(drop=True)
)

# Cluster 1 is relabelled "Boric" and cluster 2 is relabelled "Kast".
df_tweets["candidato"] = df_tweets["candidato"].replace({1: "Boric", 2: "Kast"})

df_tweets.head(3)


Unnamed: 0,user_id,date,text,candidato,source
0,1442114017238028298,2021-11-03 00:00:02,@joseantoniokast @DanyJaneLpez1 Aquí lo que fr...,Kast,twitter
1,1442114017238028298,2021-11-03 00:03:01,@joseantoniokast @DanyJaneLpez1 Tenemos que un...,Kast,twitter
2,1442114017238028298,2021-11-03 00:06:37,@joseantoniokast @DanyJaneLpez1 Somos libres s...,Kast,twitter


In [2]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df_wsp = pd.read_pickle("data/wsp_elections_raw.pkl")
df_wsp = df_wsp.rename(
    columns={"remote_resource": "user_id", "data": "text", "key_remote_jid": "group_id"}
)

# Keep rows with key_from_me == 0 whose text is neither null nor empty.
# `group_id` is renamed above but is not part of the column selection below.
not_from_me = df_wsp["key_from_me"] == 0
has_text = df_wsp["text"].notnull() & (df_wsp["text"] != "")
df_wsp = (
    df_wsp
    .loc[not_from_me & has_text, ["user_id", "date", "text", "candidato"]]
    .reset_index(drop=True)
    .assign(source="whatsapp")
)

df_wsp.head(3)


Unnamed: 0,user_id,date,text,candidato,source
0,56973794459@s.whatsapp.net,2021-11-03 21:34:46,*AtreviDos*: Resumen Diario ✌🏼\n\n1️⃣ Por nues...,Kast,whatsapp
1,56973794459@s.whatsapp.net,2021-11-04 20:45:55,🔴 *AHORA*\n\nExisten prejuicios y caricaturas ...,Kast,whatsapp
2,56973794459@s.whatsapp.net,2021-11-04 21:45:32,*AtreviDos*: Resumen Diario ✌🏼\n\n1️⃣ *Atrévet...,Kast,whatsapp


In [3]:
# Stack the Twitter and WhatsApp frames row-wise and renumber the index from 0.
df = pd.concat((df_tweets, df_wsp), axis=0, ignore_index=True)

In [4]:
from preprocess import SpanishPreprocess 
import swifter 

# Text normaliser applied to every message before classification.
sp = SpanishPreprocess(
    lower=True,
    remove_url=True,
    remove_hashtags=True,
    convert_emoticons=False,
    convert_emojis=False,
    normalize_inclusive_language=False,
    reduce_spam=True,
    remove_vowels_accents=True,
    remove_punctuation=True,
    remove_unprintable=True,
    remove_numbers=True,
    remove_stopwords=False,
    stopwords_list=None,
    stem=False,
)




In [5]:
# Run `sp.transform` over every text through swifter's `apply` and overwrite
# the "text" column with the result. Because the column is replaced in place,
# re-running this cell applies the transform to already-transformed text.
df["text"] = df["text"].swifter.apply(sp.transform)

Pandas Apply:   0%|          | 0/1190797 [00:00<?, ?it/s]

In [6]:
# Drop rows whose text is null or empty after preprocessing, then keep only
# rows whose text is exactly of type `str`, and renumber the index.
text_col = df["text"]
df = df[text_col.notnull() & (text_col != "")]
df = df[df["text"].apply(lambda value: type(value) is str)]
df = df.reset_index(drop=True)



In [7]:
import csv
# Write the preprocessed frame to data/data_pp.csv: ";"-separated, UTF-8,
# without the index, quoting every non-numeric field and using a backslash
# as the escape character.
df.to_csv(
    "data/data_pp.csv",
    index=False,
    sep=";",
    encoding="UTF-8",
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='"',
    escapechar="\\",
)

In [8]:
from classifiers import SpanishClassifier
from tqdm import tqdm
import datetime 
import os
# Debugging aid: the CUDA error message recorded further down in this notebook
# suggests setting CUDA_LAUNCH_BLOCKING=1.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

# Enable the `progress_apply` pandas methods with a labelled progress bar.
tqdm.pandas(desc="Classifying texts")

def predict_label(text, model, file_log):
    """Classify ``text`` with ``model``, logging failures instead of raising.

    Args:
        text: Input passed unchanged to ``model.predict``.
        model: Any object exposing a ``predict(text)`` method.
        file_log: Path of the log file; error entries are appended to it.

    Returns:
        Whatever ``model.predict(text)`` returns, or ``None`` if it raised.
    """
    try:
        return model.predict(text)
    except Exception as e:
        # `datetime` is imported as a module in this notebook, so the class has
        # to be reached as `datetime.datetime`. The previous `datetime.now()`
        # raised AttributeError inside this handler and masked the real error.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Explicit UTF-8 so texts with accents or emojis can always be written,
        # whatever the platform's default encoding is.
        with open(file_log, "a", encoding="utf-8") as f:
            f.write(f"{timestamp}. Error: {e}\n")
            f.write(f"{timestamp}. Text: {text}\n")
        return None

# Failed predictions are appended to this log file by `predict_label`.
file_log = "data/classification_log.txt"

# Classifiers to run. Currently disabled: "toxic_speech", "sentiment_analysis",
# "emotion_analysis", "irony_analysis", "sexist_analysis", "racism_analysis".
classifiers_names = ["hate_speech"]

# One classifier instance per name, each created with device=0.
classifiers = {
    name: SpanishClassifier(model_name=name, device=0)
    for name in classifiers_names
}

for cl_name, classifier in classifiers.items():
    # The column is created up front, so it exists (filled with None) even if
    # the apply below is interrupted.
    df[cl_name] = None
    df[cl_name] = df["text"].progress_apply(
        lambda x: predict_label(x, classifier, file_log)
    )


    # batch = 100_000
    # window = (df.shape[0] // batch)+1
    # for i in tqdm(range(window)):
    #     i_init = i * batch
    #     i_final = i_init + batch
    #     if i_final > df.shape[0]-1:
    #         i_final = df.shape[0]-1
    #     texts = df["text"].values[i_init:i_final].tolist()
    #     predictions = classifiers[cl_name].predict(texts)

    #     # Save prediction in column cl_name of df, with index i_init:i_final
    #     # Create a new dataframe with predictions (list) and index between i_init and i_final. 
    #     df_p = pd.DataFrame(predictions, index=range(i_init:i_final))#index=df.index[i_init:i_final])

    #     #For each column in df_p add prefix "prediction_" to the column name.
    #     df_p.columns = [f"{cl_name}__{col}" for col in df_p.columns]
        
    #     # Merge df_p to df
    #     df = df.merge(df_p, how="left", left_index=True, right_index=True)
        



Classifying texts:   0%|          | 2154/1096848 [00:29<4:08:19, 73.47it/s]


KeyboardInterrupt: 

In [None]:
# Ask PyTorch to empty its CUDA cache.
# NOTE(review): the output recorded for this cell is a RuntimeError
# ("CUDA error: device-side assert triggered"); presumably the CUDA context was
# already broken by the interrupted classification run and the kernel has to
# be restarted -- confirm before relying on this cell.
import torch
torch.cuda.empty_cache()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Display the combined frame.
# NOTE(review): the recorded output contains many suffixed duplicates of the
# hate_speech score columns (`..._x`, `..._y`), which no active cell creates;
# presumably they are leftovers from repeated merges in an earlier kernel
# session -- confirm by re-running the notebook from a fresh kernel.
df

Unnamed: 0,user_id,date,text,candidato,source,hate_speech,hate_speech__hateful_x,hate_speech__aggressive_x,hate_speech__targeted_x,hate_speech__hateful_y,...,hate_speech__targeted_x.1,hate_speech__aggressive_y,hate_speech__hateful_y.1,hate_speech__targeted_y,hate_speech__hateful_x.1,hate_speech__aggressive_x.1,hate_speech__targeted_x.2,hate_speech__hateful_y.2,hate_speech__aggressive_y.1,hate_speech__targeted_y.1
0,1442114017238028298,2021-11-03 00:00:02,aqui lo que frenara y terminara con estos acto...,Kast,twitter,,0.185040,0.100800,0.006035,0.185040,...,,,,,,,,,,
1,1442114017238028298,2021-11-03 00:03:01,tenemos que unirnos ahora mas que nunca cuidar...,Kast,twitter,,0.111021,0.049120,0.006238,0.111021,...,,,,,,,,,,
2,1442114017238028298,2021-11-03 00:06:37,somos libres somos chile un pais luchador que ...,Kast,twitter,,0.307856,0.109812,0.005927,0.307856,...,,,,,,,,,,
3,1442114017238028298,2021-11-06 08:06:12,y le falto cafiches acomplejados frustrados etc,Kast,twitter,,0.609422,0.277636,0.010943,0.609422,...,,,,,,,,,,
4,1442114017238028298,2021-11-06 08:07:10,aaaaaah mi niño igual ud que tengo un maravill...,Kast,twitter,,0.020156,0.019424,0.021199,0.020156,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096843,56963532690@s.whatsapp.net,2021-12-20 02:46:24,que diga que es en el extranjero,Kast,whatsapp,,,,,,...,,,,,,,,,,
1096844,56999884716@s.whatsapp.net,2021-12-20 02:50:37,se viene,Kast,whatsapp,,,,,,...,,,,,,,,,,
1096845,56999884716@s.whatsapp.net,2021-12-20 02:50:42,una dictadura,Kast,whatsapp,,,,,,...,,,,,,,,,,
1096846,56963532690@s.whatsapp.net,2021-12-20 02:51:08,tranqui,Kast,whatsapp,,,,,,...,,,,,,,,,,
