In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set for fast membership tests;
# strip_all_entities drops every token that appears in this set.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand - confirm.
stop_words = set(stopwords.words('english'))

We will use the GPU: NVIDIA GeForce RTX 3060 Ti


In [2]:
#Clean emojis from text
def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated and later removed from the
    ``emoji`` package (2.0), so the supported ``emoji.replace_emoji`` helper is
    used whenever the installed version provides it. The regexp accessor is
    kept only as a fallback for older installations.

    Args:
        text: The string to clean.

    Returns:
        The input string with emoji characters deleted (nothing is inserted
        in their place, so surrounding whitespace is left untouched).
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    # Legacy path for emoji versions that still ship get_emoji_regexp().
    return re.sub(emoji.get_emoji_regexp(), r"", text)

#Remove punctuations, links, stopwords, mentions and \r\n new line characters
def strip_all_entities(text):
    """Normalise a tweet down to lowercase ASCII words.

    Steps, in order:
      1. delete ``\\r``, turn ``\\n`` into a space, and lowercase everything;
      2. remove @mentions and http(s) links (up to the next whitespace);
      3. delete every non-ASCII character;
      4. delete every character in ``string.punctuation``;
      5. drop tokens found in the module-level ``stop_words`` set and tokens
         that are 14 or more characters long.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.replace('\r', '').replace('\n', ' ').lower()
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", lowered)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_links)
    no_punct = ascii_only.translate(str.maketrans('', '', string.punctuation))
    # Stop-word and length filters are applied to the same whitespace tokens,
    # so a single pass over the tokens is enough.
    kept = [
        token
        for token in no_punct.split()
        if token not in stop_words and len(token) < 14
    ]
    return ' '.join(kept)

#remove contractions
def decontract(text):
    """Expand common English contractions written with a straight apostrophe.

    The irregular forms "won't" and "can't" are handled first (with either a
    lowercase or capital first letter) because the generic ``n't`` rule would
    otherwise turn them into "wo not" / "Ca not". The remaining rules are
    applied in a fixed order; note that ``'s`` is always expanded to " is",
    including for possessives.

    Args:
        text: The string to expand.

    Returns:
        The string with the recognised contractions replaced.
    """
    # Irregular negations: must run before the generic "n't" rule below.
    text = re.sub(r"([Ww])on\'t", r"\1ill not", text)
    text = re.sub(r"([Cc])an\'t", r"\1an not", text)
    # Generic suffix expansions.
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and un-hash the ones inside the sentence.

    First pass: every ``#tag`` that is followed only by more hashtags and/or
    whitespace up to the end of the string is removed entirely. The negative
    lookahead exempts the literal tag ``#hashtag`` from this removal.
    Second pass: the remaining ``#`` and ``_`` characters are replaced by a
    space-separated join, so ``#word`` in mid-sentence becomes ``word`` and
    ``snake_case`` becomes ``snake case``.

    The patterns are raw strings on purpose: in a normal string literal
    ``\\b`` is a backspace character (so the word-boundary test never matched)
    and ``\\w`` / ``\\s`` are invalid escape sequences.

    Args:
        tweet: The text to clean.

    Returns:
        The cleaned text.
    """
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_tags, tweet))  # remove last hashtags
    # Remove the hashtag symbol (and underscores) from words in the middle of the sentence.
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet))
    return new_tweet2

#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    """Blank out every space-separated word that contains ``$`` or ``&``.

    The text is split on single spaces; offending words are replaced by an
    empty string (not removed), so the result can contain runs of spaces.
    """
    return ' '.join(
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    )

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character is left as it is. The pattern is a raw
    string because ``\\s`` inside a normal string literal is an invalid escape
    sequence.
    """
    return re.sub(r"\s\s+", " ", text)

#Stemming
def stemmer(text):
    """Tokenize ``text`` with NLTK and return the Porter stems joined by spaces."""
    stem = PorterStemmer().stem
    return ' '.join(map(stem, nltk.word_tokenize(text)))

#Lemmatization 
def lemmatize(text):
    """Tokenize ``text`` with NLTK and return the WordNet lemmas joined by spaces."""
    to_lemma = WordNetLemmatizer().lemmatize
    return ' '.join(map(to_lemma, nltk.word_tokenize(text)))

def deep_clean(text):
    """Run the full tweet-cleaning pipeline and return the stemmed result.

    The steps are applied strictly in the order listed in ``pipeline``; each
    one receives the output of the previous one.
    """
    # NOTE(review): strip_all_entities deletes every string.punctuation
    # character (which includes '#', '_', '$' and '&') before clean_hashtags
    # and filter_chars run, so those two steps appear to have nothing left to
    # act on - confirm whether this ordering is intended.
    pipeline = (
        strip_emoji,
        decontract,
        strip_all_entities,
        clean_hashtags,
        filter_chars,
        remove_mult_spaces,
        stemmer,
    )
    for step in pipeline:
        text = step(text)
    return text

In [6]:
# Load the raw tweets for the month.
df = pd.read_csv("./2021_07.csv")

# Clean every tweet, printing the running count every 100,000 rows.
texts_new = []
for i, t in enumerate(df.text, start=1):
    if i % 100000 == 0:
        print(i)
    texts_new.append(deep_clean(t))
df['text_clean'] = texts_new

# Keep one row per distinct cleaned text, then drop rows with any missing value.
df = df.drop_duplicates("text_clean").dropna()

print('Number of dataset sentences: {:,}\n'.format(df.shape[0]))
sentences = df.text_clean.values
df.head()

Number of dataset sentences: 23,912



Unnamed: 0,tweets_id,text,author_id,username,user_created_time,user_location,user_name,user_description,geo_place_id,geo_country_code,geo_place_type,geo_full_name,geo_bbox,geo_country,geo_name,date,text_clean
0,1410511826878672896,"Today you r tweeting you, that is twitter true...",784159502107017216,SoooCommonSense,2016-10-06T22:32:49.000Z,LIVERPOOL UNITED KINGDOM,FreedomOfTweet,I believe in freedom 2 b whatever and talk and...,151b9e91272233d1,GB,city,"Liverpool, England","[-3.008791, 53.36489, -2.822063, 53.474867]",Royaume-Uni,Liverpool,2021-07-01,today r tweet twitter truer tweetest true one ...
1,1410511823409926144,Dido Harding has a dismal track record of fail...,460178045,doggypicks,2012-01-10T13:34:53.000Z,England; North Devon.,Nick White 🇨🇦🇬🇧🇪🇺🦮🦡,"Anglo-Canadian, proud European, Disabled OAP; ...",75331c30c4d4ec21,GB,city,"Bickington, England","[-4.106334, 51.061596, -4.056901, 51.078212]",Royaume-Uni,Bickington,2021-07-01,dido hard dismal track record failur yet appli...
2,1410511821849694213,@SueSmithDN @UHMBT @CumbriaUHealth @LawrenceDu...,1887762985,phil_woodford,2013-09-20T20:27:46.000Z,Usually in bed or a bike shop,Phil woodford,"Director in the NHS and Stroke Survivor, chart...",00d86ee11c1c8559,GB,city,"Catterall, England","[-2.7711841, 53.873757, -2.757798, 53.88333]",Royaume-Uni,Catterall,2021-07-01,even better know dont like egg
3,1410511821048590340,🏏 @AecProtection Stadium Pitch Incursion Respo...,1022490061215096832,AecProtection,2018-07-26T14:33:22.000Z,"Romsey, Hampshire",AEC Protection,"AEC Protection is a well Established, Professi...",07d9c97757886001,GB,poi,The Ageas Bowl,"[-1.3222217559814453, 50.923975582580454, -1.3...",Royaume-Uni,The Ageas Bowl,2021-07-01,stadium pitch incurs respons team member discu...
4,1410511821048401923,Just posted a photo @ The Gold Bar https://t.c...,74503262,felipehponce,2009-09-15T17:34:08.000Z,"Bristol, England",FelipePonce,Marketing Strategist 😀 Content Creator,7f15dd80ac78ef40,GB,city,"Bristol, England","[-2.659936, 51.399367, -2.510844, 51.516387]",Royaume-Uni,Bristol,2021-07-01,post photo gold bar


In [7]:
from transformers import BertTokenizer

# Load the uncased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Show the first cleaned sentence three ways: raw text, sub-word tokens,
# and the vocabulary ids of those tokens.
sample_sentence = sentences[0]
sample_tokens = tokenizer.tokenize(sample_sentence)
print(' Original: ', sample_sentence)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  today r tweet twitter truer tweetest true one tweet aliv tweet yourer tweet dare2diff mha21 rt ifb
Tokenized:  ['today', 'r', 't', '##wee', '##t', 'twitter', 'true', '##r', 't', '##wee', '##test', 'true', 'one', 't', '##wee', '##t', 'ali', '##v', 't', '##wee', '##t', 'your', '##er', 't', '##wee', '##t', 'dare', '##2', '##di', '##ff', 'm', '##ha', '##21', 'rt', 'if', '##b']
Token IDs:  [2651, 1054, 1056, 28394, 2102, 10474, 2995, 2099, 1056, 28394, 22199, 2995, 2028, 1056, 28394, 2102, 4862, 2615, 1056, 28394, 2102, 2115, 2121, 1056, 28394, 2102, 8108, 2475, 4305, 4246, 1049, 3270, 17465, 19387, 2065, 2497]


In [8]:
# Length (in token ids, including the special tokens added by the tokenizer)
# of every cleaned sentence.
encoded_lengths = [
    len(tokenizer.encode(sent, add_special_tokens=True))
    for sent in sentences
]

# Longest encoded sentence; 0 when there are no sentences at all.
max_len = max(encoded_lengths, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  75


In [12]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence, `encode_plus` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks for [PAD] tokens.
#
# `padding='max_length'` + `truncation=True` replace the deprecated
# `pad_to_max_length=True`, and make the truncation explicit instead of
# relying on the library's implicit fallback.
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=80,                 # Fixed length every sentence is padded/truncated to.
        padding='max_length',          # Pad shorter sentences up to max_length.
        truncation=True,               # Cut longer sentences down to max_length.
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

In [13]:
# Concatenate the per-sentence tensors along dim 0 so that each list becomes
# a single tensor with one row per sentence.
input_ids, attention_masks = (
    torch.cat(per_sentence, dim=0)
    for per_sentence in (input_ids, attention_masks)
)

# Sanity check: sentence 0 next to its row of token ids.
text_0, ids_0 = sentences[0], input_ids[0]
print('Original: ', text_0)
print('Token IDs:', ids_0)

Original:  today r tweet twitter truer tweetest true one tweet aliv tweet yourer tweet dare2diff mha21 rt ifb
Token IDs: tensor([  101,  2651,  1054,  1056, 28394,  2102, 10474,  2995,  2099,  1056,
        28394, 22199,  2995,  2028,  1056, 28394,  2102,  4862,  2615,  1056,
        28394,  2102,  2115,  2121,  1056, 28394,  2102,  8108,  2475,  4305,
         4246,  1049,  3270, 17465, 19387,  2065,  2497,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [52]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Pair every row of token ids with its attention mask.
dataset = TensorDataset(input_ids, attention_masks)

# Batches are served in dataset order (no shuffling, which is also the
# DataLoader default), so predictions come back in the same order as the rows.
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)


In [53]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2,                 # The number of output labels--2 for binary classification.
    output_attentions=False,      # Whether the model returns attentions weights.
    output_hidden_states=False,   # Whether the model returns all hidden-states.
    return_dict=False,            # Return plain tuples, so logits are element [0].
)

# Overwrite the weights with the fine-tuned checkpoint. `map_location` remaps
# the stored tensors onto the device selected earlier, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load("./model/BERT.pt", map_location=device)
model.load_state_dict(state_dict)

# Move the model to the selected device (GPU when available, otherwise CPU)
# instead of unconditionally calling .cuda().
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [64]:
# Switch to evaluation mode so the dropout layers are disabled.
model.eval()

# A class is assigned only when its softmax probability exceeds this value.
p_threshold = 0.9

# Label codes, as given by the original notebook comment:
# 0 for negative, 1 for positive, 2 for neutral (neither class is confident enough).
NEGATIVE, POSITIVE, NEUTRAL = 0, 1, 2

labels = []

# No gradients are needed for inference, so skip building the compute graph.
with torch.no_grad():
    for b_input_ids, b_input_mask in dataloader:
        logits = model(b_input_ids.to(device),
                       token_type_ids=None,
                       attention_mask=b_input_mask.to(device))[0]
        probs = nn.functional.softmax(logits, dim=1).cpu().numpy()

        # np.select takes the first condition that holds for each row, which
        # mirrors an if / elif / else chain: class 0 is checked before class 1,
        # and rows matching neither get the default.
        batch_labels = np.select(
            [probs[:, 0] > p_threshold, probs[:, 1] > p_threshold],
            [NEGATIVE, POSITIVE],
            default=NEUTRAL,
        )
        labels.extend(batch_labels.tolist())

23912

In [65]:
# The frame, the sentence array and the predicted labels should all have the
# same length before the labels are attached as a column.
for collection in (df, sentences, labels):
    print(len(collection))

23912
23912
23912


In [67]:
# Attach the predictions as a new column; values are matched to rows by position.
df = df.assign(labels=labels)
df.head()

Unnamed: 0,tweets_id,text,author_id,username,user_created_time,user_location,user_name,user_description,geo_place_id,geo_country_code,geo_place_type,geo_full_name,geo_bbox,geo_country,geo_name,date,text_clean,labels
0,1410511826878672896,"Today you r tweeting you, that is twitter true...",784159502107017216,SoooCommonSense,2016-10-06T22:32:49.000Z,LIVERPOOL UNITED KINGDOM,FreedomOfTweet,I believe in freedom 2 b whatever and talk and...,151b9e91272233d1,GB,city,"Liverpool, England","[-3.008791, 53.36489, -2.822063, 53.474867]",Royaume-Uni,Liverpool,2021-07-01,today r tweet twitter truer tweetest true one ...,2
1,1410511823409926144,Dido Harding has a dismal track record of fail...,460178045,doggypicks,2012-01-10T13:34:53.000Z,England; North Devon.,Nick White 🇨🇦🇬🇧🇪🇺🦮🦡,"Anglo-Canadian, proud European, Disabled OAP; ...",75331c30c4d4ec21,GB,city,"Bickington, England","[-4.106334, 51.061596, -4.056901, 51.078212]",Royaume-Uni,Bickington,2021-07-01,dido hard dismal track record failur yet appli...,1
2,1410511821849694213,@SueSmithDN @UHMBT @CumbriaUHealth @LawrenceDu...,1887762985,phil_woodford,2013-09-20T20:27:46.000Z,Usually in bed or a bike shop,Phil woodford,"Director in the NHS and Stroke Survivor, chart...",00d86ee11c1c8559,GB,city,"Catterall, England","[-2.7711841, 53.873757, -2.757798, 53.88333]",Royaume-Uni,Catterall,2021-07-01,even better know dont like egg,1
3,1410511821048590340,🏏 @AecProtection Stadium Pitch Incursion Respo...,1022490061215096832,AecProtection,2018-07-26T14:33:22.000Z,"Romsey, Hampshire",AEC Protection,"AEC Protection is a well Established, Professi...",07d9c97757886001,GB,poi,The Ageas Bowl,"[-1.3222217559814453, 50.923975582580454, -1.3...",Royaume-Uni,The Ageas Bowl,2021-07-01,stadium pitch incurs respons team member discu...,1
4,1410511821048401923,Just posted a photo @ The Gold Bar https://t.c...,74503262,felipehponce,2009-09-15T17:34:08.000Z,"Bristol, England",FelipePonce,Marketing Strategist 😀 Content Creator,7f15dd80ac78ef40,GB,city,"Bristol, England","[-2.659936, 51.399367, -2.510844, 51.516387]",Royaume-Uni,Bristol,2021-07-01,post photo gold bar,1


In [68]:
# Save the labelled tweets. The row index is not written: writing it adds a
# nameless leading column that shows up as "Unnamed: 0" when the file is read
# back with pd.read_csv (the input file already carries one such column).
df.to_csv("2021_07_labeled.csv", encoding='utf-8', index=False)