In [7]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
# English stop words from the NLTK stopwords corpus, held in a set for fast
# membership tests; strip_all_entities drops every token found in this set.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed locally.
stop_words = set(stopwords.words('english'))

We will use the GPU: NVIDIA GeForce RTX 3060 Ti


In [2]:
#Clean emojis from text
def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` no longer exists in emoji >= 2.0, so the
    supported ``emoji.replace_emoji`` helper is used when the installed
    package provides it. The regexp-based path is kept only as a fallback
    for older installs that predate ``replace_emoji``.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    # Legacy emoji releases: substitute every match of the emoji regexp with nothing.
    return re.sub(emoji.get_emoji_regexp(), r"", text)

#Remove punctuation, links, stopwords, mentions and carriage-return / newline characters
def strip_all_entities(text):
    """Normalise a tweet to lowercase ASCII words without noise tokens.

    Steps, in order: drop carriage returns and turn newlines into spaces,
    lowercase, remove @mentions and http(s) links, remove non-ASCII
    characters, delete every character in ``string.punctuation``, drop tokens
    found in the module-level ``stop_words`` set, and finally keep only tokens
    shorter than 14 characters. The result is joined with single spaces.
    """
    lowered = text.replace('\r', '').replace('\n', ' ').lower()
    # A mention or link runs from "@" / "http(s)://" up to the next whitespace.
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", lowered)
    # Anything outside the 7-bit ASCII range is discarded.
    ascii_only = re.sub(r'[^\x00-\x7f]',r'', without_links)
    without_punct = ascii_only.translate(str.maketrans('', '', string.punctuation))
    content_words = [token for token in without_punct.split() if token not in stop_words]
    # Tokens of 14 or more characters are dropped.
    return ' '.join(token for token in content_words if len(token) < 14)

#remove contractions
def decontract(text):
    """Expand common English contractions in ``text``.

    The rules run in the order listed, so the specific "can't" rule fires
    before the generic "n't" rule. Matching is case-sensitive.
    """
    expansions = (
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and strip the '#' / '_' symbols from the rest.

    A run of hashtags at the very end of ``tweet`` is removed entirely, except
    for the literal tag "#hashtag". Any remaining '#' or '_' is then treated as
    a separator: the text is split on it and the pieces are re-joined with a
    single space.
    """
    # Raw strings are required here: in a normal string literal the word-boundary
    # escape is read as a backspace character, so the "#hashtag" exclusion could
    # never match, and the other regex escapes are invalid string escapes.
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_tags, tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    """Blank out every space-separated word that contains '$' or '&'.

    Removed words are replaced by an empty string rather than dropped, so the
    surrounding single spaces are preserved in the result.
    """
    return ' '.join(
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    )

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    untouched.
    """
    # Raw string: the whitespace escape is a regex escape, not a valid string
    # escape, and newer Python versions warn about it in a normal literal.
    return re.sub(r"\s\s+", " ", text)

#Stemming
def stemmer(text):
    """Tokenize ``text`` with NLTK and return the Porter stems joined by spaces."""
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in nltk.word_tokenize(text)]
    return ' '.join(stems)

#Lemmatization
def lemmatize(text):
    """Tokenize ``text`` with NLTK and return the WordNet lemmas joined by spaces."""
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in nltk.word_tokenize(text)]
    return ' '.join(lemmas)

def deep_clean(text):
    """Run the full tweet-cleaning pipeline on ``text`` and return the result.

    The steps are applied in a fixed order: emoji removal, contraction
    expansion, entity/punctuation/stop-word stripping, hashtag cleaning,
    '$'/'&' word filtering, whitespace collapsing and Porter stemming.
    ``lemmatize`` is not part of this pipeline.

    NOTE(review): strip_all_entities deletes every ``string.punctuation``
    character (which includes '#', '_', '$' and '&') before clean_hashtags and
    filter_chars run, so those two steps appear to have nothing left to match.
    Confirm whether this order is intended before changing it, since it
    determines the text the downstream model sees.
    """
    pipeline = (
        strip_emoji,
        decontract,
        strip_all_entities,
        clean_hashtags,
        filter_chars,
        remove_mult_spaces,
        stemmer,
    )
    for step in pipeline:
        text = step(text)
    return text

In [3]:
df = pd.read_csv("./2021_07.csv")

#Get the cleaned tweets
texts_new = []
for i, t in enumerate(df.text, start=1):
    # Print a progress marker every 100,000 tweets.
    if i % 100000 == 0:
        print(i)
    texts_new.append(deep_clean(t))
df['text_clean'] = texts_new
# Keep one row per distinct cleaned text, then drop rows with a missing value in any column.
df = df.drop_duplicates("text_clean").dropna()

print('Number of dataset sentences: {:,}\n'.format(df.shape[0]))
df.head()

Number of dataset sentences: 23,912



Unnamed: 0,tweets_id,text,author_id,username,user_created_time,user_location,user_name,user_description,geo_place_id,geo_country_code,geo_place_type,geo_full_name,geo_bbox,geo_country,geo_name,date,text_clean
0,1410511826878672896,"Today you r tweeting you, that is twitter true...",784159502107017216,SoooCommonSense,2016-10-06T22:32:49.000Z,LIVERPOOL UNITED KINGDOM,FreedomOfTweet,I believe in freedom 2 b whatever and talk and...,151b9e91272233d1,GB,city,"Liverpool, England","[-3.008791, 53.36489, -2.822063, 53.474867]",Royaume-Uni,Liverpool,2021-07-01,today r tweet twitter truer tweetest true one ...
1,1410511823409926144,Dido Harding has a dismal track record of fail...,460178045,doggypicks,2012-01-10T13:34:53.000Z,England; North Devon.,Nick White 🇨🇦🇬🇧🇪🇺🦮🦡,"Anglo-Canadian, proud European, Disabled OAP; ...",75331c30c4d4ec21,GB,city,"Bickington, England","[-4.106334, 51.061596, -4.056901, 51.078212]",Royaume-Uni,Bickington,2021-07-01,dido hard dismal track record failur yet appli...
2,1410511821849694213,@SueSmithDN @UHMBT @CumbriaUHealth @LawrenceDu...,1887762985,phil_woodford,2013-09-20T20:27:46.000Z,Usually in bed or a bike shop,Phil woodford,"Director in the NHS and Stroke Survivor, chart...",00d86ee11c1c8559,GB,city,"Catterall, England","[-2.7711841, 53.873757, -2.757798, 53.88333]",Royaume-Uni,Catterall,2021-07-01,even better know dont like egg
3,1410511821048590340,🏏 @AecProtection Stadium Pitch Incursion Respo...,1022490061215096832,AecProtection,2018-07-26T14:33:22.000Z,"Romsey, Hampshire",AEC Protection,"AEC Protection is a well Established, Professi...",07d9c97757886001,GB,poi,The Ageas Bowl,"[-1.3222217559814453, 50.923975582580454, -1.3...",Royaume-Uni,The Ageas Bowl,2021-07-01,stadium pitch incurs respons team member discu...
4,1410511821048401923,Just posted a photo @ The Gold Bar https://t.c...,74503262,felipehponce,2009-09-15T17:34:08.000Z,"Bristol, England",FelipePonce,Marketing Strategist 😀 Content Creator,7f15dd80ac78ef40,GB,city,"Bristol, England","[-2.659936, 51.399367, -2.510844, 51.516387]",Royaume-Uni,Bristol,2021-07-01,post photo gold bar


In [4]:
# Number of whitespace-separated tokens in each cleaned tweet.
text_len = [len(text.split()) for text in df.text_clean]
df['text_len'] = text_len
# Keep tweets with more than 3 and fewer than 100 tokens.
df = df[(df['text_len'] > 3) & (df['text_len'] < 100)]
# Longest remaining tweet, in tokens.
max_len = np.max(df['text_len'])

In [8]:
def Tokenize(column, seq_len, vocab_to_int=None, unk_index=0):
    """Convert whitespace-separated texts into a left-padded matrix of word ids.

    Parameters
    ----------
    column : iterable of str
        Texts to encode; each one is split on whitespace.
    seq_len : int
        Width of the output matrix. Shorter texts are padded with 0 on the
        left; longer texts keep only their first ``seq_len`` tokens.
    vocab_to_int : dict, optional
        Existing word -> id mapping to encode with. Pass the mapping that was
        used when the model was trained so the ids line up with its embedding
        rows. When omitted, a mapping is built from ``column`` itself: words
        are ranked by descending frequency and numbered from 1 (0 is padding).
    unk_index : int, optional
        Id assigned to words that are missing from a supplied ``vocab_to_int``.

    Returns
    -------
    sorted_words : list of (word, count)
        Words of ``column`` with their counts, most frequent first.
    features : numpy.ndarray
        Integer array of shape ``(number of texts, seq_len)``.
    """
    token_lists = [text.split() for text in column]

    ##Create vocabulary of words from column
    count_words = Counter(word for tokens in token_lists for word in tokens)
    sorted_words = count_words.most_common()
    if vocab_to_int is None:
        vocab_to_int = {word: rank for rank, (word, _) in enumerate(sorted_words, start=1)}

    ##Tokenize the texts and left-pad (or truncate) each one to seq_len
    features = np.zeros((len(token_lists), seq_len), dtype=int)
    for row, tokens in enumerate(token_lists):
        ids = [vocab_to_int.get(word, unk_index) for word in tokens][:seq_len]
        if ids:
            # Writing into the tail of the zero row leaves the padding on the left.
            features[row, seq_len - len(ids):] = ids

    return sorted_words, features

In [9]:
# Encode every cleaned tweet as a row of word ids padded to the longest tweet.
# NOTE(review): the word -> id mapping is derived from this dataset's own word
# frequencies; the checkpoint loaded further down presumably expects the ids
# used at training time — confirm the two vocabularies agree.
vocabulary, tokenized_column = Tokenize(column=df["text_clean"], seq_len=max_len)
# Show the first cleaned tweet followed by its encoded row.
print(df["text_clean"].iloc[0], tokenized_column[0], sep="\n")

today r tweet twitter truer tweetest true one tweet aliv tweet yourer tweet dare2diff mha21 rt ifb
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0   26
 1287  188  228 6771 6772  249    5  188 1048  188 9724  188 9725 9726
 1686 6773]


In [11]:
BATCH_SIZE = 32
# Device the batches and LSTM states are sent to.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

tokenized_column = np.array(tokenized_column)
data = TensorDataset(torch.from_numpy(tokenized_column))
# No shuffling and no dropped final batch: batches come out in row order and cover every row.
data_loader = DataLoader(dataset=data, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [12]:
class BiLSTM_Sentiment_Classifier(nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> linear -> log-softmax classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector (LSTM input size).
    hidden_dim : int
        LSTM hidden size per direction.
    num_classes : int
        Number of output classes.
    lstm_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool
        Whether the LSTM runs in both directions.
    batch_size : int
        Stored on the instance; ``forward`` overwrites it with the actual batch size.
    dropout : float
        Passed to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, lstm_layers, bidirectional,batch_size, dropout):
        super().__init__()

        self.lstm_layers = lstm_layers
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.batch_size = batch_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            dropout=dropout,
                            bidirectional=bidirectional,
                            batch_first=True)

        # Forward and backward hidden states are concatenated, hence the factor.
        self.fc = nn.Linear(hidden_dim*self.num_directions, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Classify a batch of token-id sequences.

        ``x`` is indexed batch-first (its first dimension is the batch) and
        ``hidden`` is an ``(h0, c0)`` pair such as the one from ``init_hidden``.
        Returns ``(log_probs, hidden)`` where ``log_probs`` has one row of
        ``num_classes`` log-probabilities per input sequence.
        """
        self.batch_size = x.size(0)
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        # Classify from the LSTM output at the last time step only.
        # NOTE(review): with bidirectional=True the backward half of that step
        # has only processed the final token — confirm this matches training.
        out = out[:,-1,:]
        out = self.fc(out)
        out = self.softmax(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` LSTM states for ``batch_size`` sequences.

        The states are created on the same device and with the same dtype as
        the model's own weights, so the method does not depend on any
        module-level device variable.
        """
        reference = self.fc.weight
        shape = (self.lstm_layers*self.num_directions, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return (h0, c0)

In [19]:
# Hyperparameters passed to BiLSTM_Sentiment_Classifier below.
# NOTE(review): these presumably have to match the values the saved checkpoint
# was trained with, otherwise load_state_dict will reject the weights — confirm.
# Number of output classes.
NUM_CLASSES = 5 
# Number of rows in the embedding table.
VOCAB_SIZE = 33009
# LSTM hidden size per direction.
HIDDEN_DIM = 100 
# Number of stacked LSTM layers.
LSTM_LAYERS = 1
# Forwarded to nn.LSTM as its dropout argument.
DROPOUT = 0.5
# Run the LSTM in both directions.
BIDIRECTIONAL = True 
# Size of each word embedding vector.
EMBEDDING_DIM = 200

In [20]:
model = BiLSTM_Sentiment_Classifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,NUM_CLASSES, LSTM_LAYERS,BIDIRECTIONAL, BATCH_SIZE, DROPOUT)
# map_location remaps the saved tensors onto DEVICE, so a checkpoint written on
# a GPU machine still loads when only the CPU is available.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load("./models/LSTM.pt", map_location=DEVICE))
# Run the model on the same device the input batches are sent to.
model.to(DEVICE)

BiLSTM_Sentiment_Classifier(
  (embedding): Embedding(33009, 200)
  (lstm): LSTM(200, 100, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=5, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [37]:
y_val_list = []
p_threshold = 0.9
labels = []

model.eval()
with torch.no_grad():
    for batch in data_loader:
        inputs = batch[0].to(DEVICE)
        val_h = model.init_hidden(inputs.size(0))

        output, val_h = model(inputs, val_h)
        # Turn the model output into per-class probabilities as a NumPy array.
        p = nn.functional.softmax(output, dim=1).detach().cpu().numpy()

        # # 'religion':0,'age':1,'ethnicity':2,'gender':3,'not_cyberbullying':4
        # A row gets the first of classes 0-3 whose probability exceeds the
        # threshold; rows where none does fall back to class 4.
        above = p[:, :4] > p_threshold
        batch_labels = np.where(above.any(axis=1), above.argmax(axis=1), 4)
        labels.extend(batch_labels.tolist())

In [38]:
# Row count of the frame and number of predicted labels, one per line.
print(len(df), len(labels), sep="\n")

17710
17710


In [40]:
# Attach the predicted class id to each row and write the labelled frame to disk.
df['label'] = labels
df.to_csv(path_or_buf="2021_07_LSTM.csv", encoding='utf-8')