In [None]:
import pandas as pd 
pd.set_option("display.max_colwidth", 400)
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk # for text manipulation
from nltk.corpus import stopwords
import preprocessor as p
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [None]:
# Load the tweet export used throughout this notebook. The path is relative
# to the directory the notebook is run from.
depression_df = pd.read_csv(
    filepath_or_buffer="./tweets_2019/depression/hate_depression.csv",
)

In [None]:
#depression_df.info()

In [None]:
# Remove the tweet metadata columns that the visible cells of this notebook
# never read. `renderedContent` is not in the list: it is the text column the
# later cleaning cells work on. The frame is modified in place, so re-running
# this cell after the columns are gone raises a KeyError.
depression_df.drop(
    columns=[
        "content",
        "id",
        "user",
        "outlinks",
        "tcooutlinks",
        "replyCount",
        "retweetCount",
        "likeCount",
        "quoteCount",
        "conversationId",
        "lang",
        "source",
        "sourceUrl",
        "sourceLabel",
        "media",
        "retweetedTweet",
        "quotedTweet",
        "mentionedUsers",
    ],
    inplace=True,
)

In [None]:
# Preview the first ten rows that remain after the column drop.
depression_df.head(n=10)

In [None]:
# Sanity check of tweet-preprocessor on a sample string, with only the URL and
# EMOJI options selected. The cell output is whatever p.clean returns for it.
demo_options = (p.OPT.URL, p.OPT.EMOJI)
p.set_options(*demo_options)
p.clean('Preprocessor is #awesome 👍 https://github.com/s/preprocessor')

In [None]:
# Select the URL, EMOJI, MENTION and NUMBER options, run every tweet's
# rendered text through p.clean, then lowercase the result.
# The column name 'pre_proccessed' (double "c") is kept as-is because the
# following cells read the column under exactly this spelling.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.NUMBER)
depression_df['pre_proccessed'] = (
    depression_df['renderedContent']
    .apply(p.clean)
    .str.lower()
)

In [None]:
# Remove punctuation, digits and special characters: every character that is
# not a-z, A-Z or '#' is replaced by a single space.
# regex=True is required here. On pandas versions where str.replace defaults
# to regex=False, the pattern would be matched as a literal string and nothing
# would be replaced.
depression_df['pre_proccessed'] = depression_df['pre_proccessed'].str.replace(
    "[^a-zA-Z#]", " ", regex=True
)

# Remove short words: keep only whitespace-separated tokens longer than two
# characters, re-joined with single spaces.
depression_df['pre_proccessed'] = depression_df['pre_proccessed'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 2])
)

depression_df.head(10)

In [None]:
# Fetch the NLTK 'stopwords' corpus so stopwords.words(...) can be used below.
nltk.download(info_or_id='stopwords')

In [None]:
# English stop words from NLTK, used later to filter the tokenized tweets.
stopwords_english = stopwords.words('english')

# Print a heading, a blank line, then the full stop-word list.
print('Stop words\n', stopwords_english, sep='\n')

In [None]:
# Tokenize: split each cleaned tweet on whitespace, giving one list of words
# per row.
tokenized_tweet = depression_df['pre_proccessed'].apply(str.split)

In [None]:
# Drop English stop words from every tokenized tweet.
# The result maps each tweet's position in `tokenized_tweet` (0, 1, 2, ...)
# to the list of words that were kept, in their original order.
# Membership is tested against a set, so each lookup is constant-time instead
# of a scan over the whole stop-word list for every word.
stopword_lookup = set(stopwords_english)
tweets_clean_dict = {
    num: [word for word in sentence if word not in stopword_lookup]
    for num, sentence in enumerate(tokenized_tweet)
}
tweets_clean_dict

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [None]:
# Spot check: join the cleaned tokens stored under key 1 back into one string.
TreebankWordDetokenizer().detokenize(tokens=tweets_clean_dict[1])

In [None]:
# Join every cleaned token list back into a single string, keyed by the same
# position keys as tweets_clean_dict.
# One detokenizer instance is created up front and reused for all tweets.
detokenizer = TreebankWordDetokenizer()
detokenized_dict = {
    key: detokenizer.detokenize(tokens)
    for key, tokens in tweets_clean_dict.items()
}


In [None]:
# Spot check: show the detokenized text stored under key 0 (the first tweet).
detokenized_dict[0]

In [None]:
# Turn the {position: text} dict into a DataFrame with one row per key.
# The text column is renamed from 0 to 'pre_processed' in the next cell.
detokenized_df = pd.DataFrame.from_dict(data=detokenized_dict, orient='index')

In [None]:
# Give the text column (labelled 0) a descriptive name, then preview.
# The preview is the cell's last expression so that it is actually displayed,
# and it comes after the rename so it shows the final column name.
# Note: this column is 'pre_processed', while the column in depression_df is
# spelled 'pre_proccessed'.
detokenized_df.rename(columns={0: 'pre_processed'}, inplace=True)
detokenized_df.head(10)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# TweetEval task name. It is interpolated into the model id here and into the
# label-mapping URL in the next cell.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer matching the pretrained checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)

In [None]:
# Download the label mapping for `task`: a tab-separated text file whose
# second field on each row is used as the label name.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")

# The response body is fully read into `html` above, so the reader can be
# built after the connection is closed.
csvreader = csv.reader(html, delimiter='\t')
labels = []
for row in csvreader:
    # Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
    if len(row) > 1:
        labels.append(row[1])
labels

In [None]:
# Load the pretrained TensorFlow sequence-classification model named by MODEL.
model = TFAutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL
)

In [None]:
def return_labels(text):
    """Return the label the model scores highest for ``text``.

    Written to be used element-wise with pandas ``apply``. According to the
    original author's note the labels are negative / neutral / positive; the
    actual values come from the module-level ``labels`` list.

    Args:
        text: Input text, passed to the module-level ``tokenizer``.

    Returns:
        The entry of ``labels`` at the index of the highest softmax score.
    """
    model_inputs = tokenizer(text, return_tensors='tf')
    raw_scores = model(model_inputs)[0][0].numpy()
    probabilities = softmax(raw_scores)
    # argsort is ascending, so its last element is the top-scoring index.
    best_index = np.argsort(probabilities)[-1]
    return labels[best_index]

In [None]:
def return_prob(text):
    """Return the highest softmax probability the model assigns for ``text``.

    This is the score of the top-ranked label, i.e. a measure of how
    "confident" the model is in its prediction. Written to be used
    element-wise with pandas ``apply``.

    Args:
        text: Input text, passed to the module-level ``tokenizer``.

    Returns:
        The largest value of the softmax over the model's output scores
        (a NumPy floating-point scalar).
    """
    encoded_input = tokenizer(text, return_tensors='tf')
    output = model(encoded_input)
    scores = softmax(output[0][0].numpy())
    # Only the maximum is needed, so no ranking or full sort is computed.
    return np.max(scores)

In [None]:
# Score every cleaned tweet with the sentiment model.
# return_labels and return_prob each tokenize the text and call the model, so
# every row is run through the model twice (once per new column).
detokenized_df['label_sentiment'] = detokenized_df['pre_processed'].apply(return_labels)
detokenized_df['probability_sentiment'] = detokenized_df['pre_processed'].apply(return_prob)
detokenized_df

In [None]:
# Put the original rendered tweet text next to its cleaned text, predicted
# label and probability. Rows are matched by index label.
# NOTE(review): this assumes depression_df still has its default 0..n-1 index,
# matching the position keys of detokenized_df - confirm.
official_df = pd.concat(
    [depression_df.loc[:, ['renderedContent']], detokenized_df],
    axis="columns",
)
official_df