In [46]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

We will use the GPU: NVIDIA GeForce RTX 3060 Ti


In [47]:
#Clean emojis from text
def strip_emoji(text):
    return re.sub(emoji.get_emoji_regexp(), r"", text) #remove emoji

#Remove punctuations, links, stopwords, mentions and \r\n new line characters
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    text =' '.join(word for word in text.split() if len(word) < 14) # remove words longer than 14 characters
    return text

#remove contractions
def decontract(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    return re.sub("\s\s+" , " ", text)

#Stemming
def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

#Lemmatization 
def lemmatize(text):
    tokenized = nltk.word_tokenize(text)
    lm = WordNetLemmatizer()
    return ' '.join([lm.lemmatize(words) for words in tokenized])

def deep_clean(text):
    text = strip_emoji(text)
    text = decontract(text)
    text = strip_all_entities(text)
    text = clean_hashtags(text)
    text = filter_chars(text)
    text = remove_mult_spaces(text)
    text = stemmer(text)
    return text

In [48]:
df = pd.read_csv("./2022-02-24.csv")

#Get the cleaned tweets
texts_new = []
i = 0
for t in df.text:
    i = i+1
    if i%100000==0:
        print(i)
    texts_new.append(deep_clean(t))
df['text_clean'] = texts_new
df.drop_duplicates("text_clean", inplace=True)
df = df.dropna()

print('Number of dataset sentences: {:,}\n'.format(df.shape[0]))
sentences = df.text_clean.values
df.head()

Number of dataset sentences: 9,502



Unnamed: 0,tweets_id,text,author_id,username,user_created_time,user_location,user_name,user_description,geo_place_id,geo_country_code,geo_place_type,geo_full_name,geo_bbox,geo_country,geo_name,date,text_clean
0,1496649802364375041,If you took away every other amazing thing fro...,387460294,AngiNicole722,2011-10-09T03:16:25.000Z,"Springfield, MO",Angi B the writing machine,Director of Illumination Dance Company at SLT....,2526edd24c06e60c,US,admin,"Missouri, USA","[-95.774704, 35.995476, -89.098843, 40.613641]",Etats-Unis,Missouri,2022-02-24,took away everi amaz thing music effect storyt...
1,1496649802297188352,"@hauiebeast And by people, do you mean.. https...",16579078,SimoneAJordan,2008-10-03T15:27:20.000Z,"Sydney, AU",Simone Amelia Jordan,Writer | Journalist\nDirector Of Special Proje...,0073b76548e5984f,AU,city,"Sydney, New South Wales","[150.520928608, -34.1183470085, 151.343020992,...",Australia,Sydney,2022-02-24,peopl mean
2,1496649801227665411,indubitably https://t.co/RZlllweleI,381884964,AmardeepMcFly,2011-09-29T03:21:39.000Z,"san francisco, ca",OD God Shammgod,air dropper extraordinaire. #overdraft,5a110d312052166f,US,city,"San Francisco, CA","[-122.514926, 37.708075, -122.357031, 37.833238]",Etats-Unis,San Francisco,2022-02-24,indubit
3,1496649801185869824,"Just posted a photo @ Detroit, Michigan https:...",1353915508987469825,GezzusKrice,2021-01-26T04:00:24.000Z,"Detroit, MI",Dr. UMark Johnson II #7mileCade #SmokingOnTop5s,#GezzusKrice \n#everybodyhatessleeppodcast\n#h...,b463d3bd6064861b,US,city,"Detroit, MI","[-83.288056, 42.255085, -82.91052, 42.450488]",Etats-Unis,Detroit,2022-02-24,post photo detroit michigan
4,1496649800195973121,@DUDEwipes might be your biggest fan. 🤣 https:...,147783195,jlovell5,2010-05-25T01:39:55.000Z,"College Station, TX",Jeremy Lovell,Alabama ✈️ Texas,00a4bbcc0dcd7572,US,city,"College Station, TX","[-96.37703, 30.520359, -96.206267, 30.6505837]",Etats-Unis,College Station,2022-02-24,might biggest fan


In [49]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  took away everi amaz thing music effect storytel use would still make one best show tv
Tokenized:  ['took', 'away', 'ever', '##i', 'ama', '##z', 'thing', 'music', 'effect', 'story', '##tel', 'use', 'would', 'still', 'make', 'one', 'best', 'show', 'tv']
Token IDs:  [2165, 2185, 2412, 2072, 25933, 2480, 2518, 2189, 3466, 2466, 9834, 2224, 2052, 2145, 2191, 2028, 2190, 2265, 2694]


In [None]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

In [50]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 80,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [51]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  took away everi amaz thing music effect storytel use would still make one best show tv
Token IDs: tensor([  101,  2165,  2185,  2412,  2072, 25933,  2480,  2518,  2189,  3466,
         2466,  9834,  2224,  2052,  2145,  2191,  2028,  2190,  2265,  2694,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [52]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataset = TensorDataset(input_ids, attention_masks)
batch_size = 32
dataloader = DataLoader(
            dataset,  
            batch_size = batch_size
        )


In [53]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 6, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    return_dict=False,
)

model.load_state_dict(torch.load("./models/BERT_best_with_5.pt"))
# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [54]:
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
labels = []
p_threshold = 0.9

for batch in dataloader:

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():        
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask)[0]
        p = nn.functional.softmax(logits, dim=1)
        p = p.detach().cpu().numpy()
        
        # 0 for negative， 1 for positive, 2 for neutral
        for line in p:
            if line[0] > p_threshold:
                labels.append(0)
            elif line[1] > p_threshold:
                labels.append(1)
            elif line[2] > p_threshold:
                labels.append(1)
            elif line[3] > p_threshold:
                labels.append(1)
            elif line[5] > p_threshold:
                labels.append(5)    
            else:
                labels.append(4)

In [55]:
print(len(df))
print(len(sentences))
print(len(labels))

9502
9502
9502


In [56]:
df['labels'] = labels
df.head()

Unnamed: 0,tweets_id,text,author_id,username,user_created_time,user_location,user_name,user_description,geo_place_id,geo_country_code,geo_place_type,geo_full_name,geo_bbox,geo_country,geo_name,date,text_clean,labels
0,1496649802364375041,If you took away every other amazing thing fro...,387460294,AngiNicole722,2011-10-09T03:16:25.000Z,"Springfield, MO",Angi B the writing machine,Director of Illumination Dance Company at SLT....,2526edd24c06e60c,US,admin,"Missouri, USA","[-95.774704, 35.995476, -89.098843, 40.613641]",Etats-Unis,Missouri,2022-02-24,took away everi amaz thing music effect storyt...,4
1,1496649802297188352,"@hauiebeast And by people, do you mean.. https...",16579078,SimoneAJordan,2008-10-03T15:27:20.000Z,"Sydney, AU",Simone Amelia Jordan,Writer | Journalist\nDirector Of Special Proje...,0073b76548e5984f,AU,city,"Sydney, New South Wales","[150.520928608, -34.1183470085, 151.343020992,...",Australia,Sydney,2022-02-24,peopl mean,4
2,1496649801227665411,indubitably https://t.co/RZlllweleI,381884964,AmardeepMcFly,2011-09-29T03:21:39.000Z,"san francisco, ca",OD God Shammgod,air dropper extraordinaire. #overdraft,5a110d312052166f,US,city,"San Francisco, CA","[-122.514926, 37.708075, -122.357031, 37.833238]",Etats-Unis,San Francisco,2022-02-24,indubit,4
3,1496649801185869824,"Just posted a photo @ Detroit, Michigan https:...",1353915508987469825,GezzusKrice,2021-01-26T04:00:24.000Z,"Detroit, MI",Dr. UMark Johnson II #7mileCade #SmokingOnTop5s,#GezzusKrice \n#everybodyhatessleeppodcast\n#h...,b463d3bd6064861b,US,city,"Detroit, MI","[-83.288056, 42.255085, -82.91052, 42.450488]",Etats-Unis,Detroit,2022-02-24,post photo detroit michigan,4
4,1496649800195973121,@DUDEwipes might be your biggest fan. 🤣 https:...,147783195,jlovell5,2010-05-25T01:39:55.000Z,"College Station, TX",Jeremy Lovell,Alabama ✈️ Texas,00a4bbcc0dcd7572,US,city,"College Station, TX","[-96.37703, 30.520359, -96.206267, 30.6505837]",Etats-Unis,College Station,2022-02-24,might biggest fan,4


In [57]:
df.labels.value_counts()

4    8941
1     274
0     151
5     136
Name: labels, dtype: int64