# Twitter Sentiment Analysis

**By Neuromatch Academy**

__Content creators:__  Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri

__Production editors:__ Amita Kapoor, Spiros Chavlis

---
# Step 1: Questions and goals

* Can we infer emotion from a tweet text?
* How words are distributed accross the dataset?
* Are words related to one kind of emotion?

---
# Step 2: Literature review

[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)

[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)

---
# Step 3: Load and explore the dataset

In [None]:
# @title Install dependencies
!pip install pandas --quiet
!pip install torchtext --quiet


In [None]:
# We import some libraries to load the dataset
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

import json
from tqdm import tqdm

import torchtext
from torchtext.data import get_tokenizer

from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm


In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


BERT models are trained for punctuations and uppercase alphabets, dont remove it during preprocessing


Remove links and usernames , check for the function needed

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import RobertaModel, RobertaTokenizer
from scipy.special import softmax
import csv
import urllib.request

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Sentiment_Analysis_Project/Tweets.csv')
df_prep = df[df['sentiment']=='negative']
df_prep.head()

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
12,74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative


In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    # Convert to string if the datatype is different
    text = str(text)
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [None]:
df_prep.text = df_prep.text.apply(lambda x: preprocess(x))
df_prep.text
df_prep

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
12,74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative
...,...,...,...,...
27464,e7eecffdc8,rec game....trying not to cry...the pain is to...,breakingg,negative
27470,778184dff1,lol i know and haha..did you fall asleep?? o...,t bored,negative
27472,8f5adc47ec,http - Wanted to visit the animals but we were...,were too late,negative
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative


In [None]:
# Randomly selecting subset of the texts in the df
subset = df_prep.sample(frac= 0.2, random_state=1, replace = True)

In [None]:
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
    
labels = [row[1] for row in csvreader if len(row) > 1]


In [None]:
task='emotion'
MODEL = "cardiffnlp/twitter-roberta-base-emotion"

tokenizer = AutoTokenizer.from_pretrained(MODEL,model_max_length=512)


# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

In [None]:
anger = []
joy = []
optimism = []
sadness = []

for text in subset.text.tolist():
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    anger.append(scores[0])
    joy.append(scores[1]) 
    optimism.append(scores[2])
    sadness.append(scores[3])   

In [None]:
new_df = pd.DataFrame({"Original_Index": subset.textID.tolist(), "Text": subset.text.tolist(),"Predicted_Angry_Values": anger})
new_df

Unnamed: 0,Original_Index,Text,Predicted_Angry_Values
0,fc8b4e690e,Damnit Day 26 is at Northpark mall..I use to l...,0.029002
1,66521b4652,i have no idea what im doing and i am complete...,0.035865
2,b5fa3b05b0,http - *sniff* i feel so left out! *grin*,0.017373
3,895d099dd1,is ur boyfriend better than mine? hell no!,0.874202
4,68c06541ce,Bored and lonely at work,0.010366
...,...,...,...
1551,e6dd04089e,yeah exactly the fans overpower the haters an...,0.900337
1552,baa5ccb680,im home alone in the house and imma scared x,0.014016
1553,795b7b9ec3,Poor . The flowers are fornicating with HIS n...,0.085109
1554,8472323928,no so sad about that i`m from MALTA have you ...,0.006065


In [None]:
# Threshold
# Greater than 0.7 = Angry (1) 
# Less than 0.7 = Not Angry (0)
labels = []
for i in new_df.Predicted_Angry_Values:
    if i < 0.7:
        labels.append(0)
    else:
        labels.append(1)

In [None]:
labels

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,


In [None]:
new_df['labels'] = labels

In [None]:
new_df

Unnamed: 0,Original_Index,Text,Predicted_Angry_Values,labels
0,fc8b4e690e,Damnit Day 26 is at Northpark mall..I use to l...,0.029002,0
1,66521b4652,i have no idea what im doing and i am complete...,0.035865,0
2,b5fa3b05b0,http - *sniff* i feel so left out! *grin*,0.017373,0
3,895d099dd1,is ur boyfriend better than mine? hell no!,0.874202,1
4,68c06541ce,Bored and lonely at work,0.010366,0
...,...,...,...,...
1551,e6dd04089e,yeah exactly the fans overpower the haters an...,0.900337,1
1552,baa5ccb680,im home alone in the house and imma scared x,0.014016,0
1553,795b7b9ec3,Poor . The flowers are fornicating with HIS n...,0.085109,0
1554,8472323928,no so sad about that i`m from MALTA have you ...,0.006065,0


In [None]:
new_df[new_df.labels == 0].count()

Original_Index            1288
Text                      1288
Predicted_Angry_Values    1288
labels                    1288
dtype: int64

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05 #0.01 or 0.05 or 0.001 or 0.025
tokenizer = RobertaTokenizer.from_pretrained(MODEL, truncation=True, do_lower_case=True)

In [None]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_size = 0.7
train_data=new_df.sample(frac=train_size,random_state=1)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1556, 4)
TRAIN Dataset: (1089, 4)
TEST Dataset: (467, 4)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Creating NN

In [None]:
#model.to(device)

In [None]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained(MODEL)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = RobertaClass()
model.to(device)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [None]:
def get_class_weights(y_train):
    
    dict_train_classes = Counter(y_train)

    class_count = [dict_train_classes[i] for i in range(len(dict_train_classes))]
    class_weights = 1./torch.tensor(class_count, dtype=torch.float32)
    class_weights.type(torch.float32)
    
    return class_weights

In [None]:
## Fine Tune Model
import torch.nn as nn

#class weights for binary class classification
# class_weights = [0.8277635, 0.172236]
# Creating the loss function and optimizer
loss_function = nn.CrossEntropyLoss(weight = get_class_weights(new_df.labels).to(device,dtype=torch.float32))

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [None]:
print(len(list(enumerate(training_loader, 0))))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


69


In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epochs):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for epoch in range(epochs):
        for _,data in tqdm(enumerate(training_loader, 0)):
            ids = data['ids'].to(device,dtype=torch.long)
            mask = data['mask'].to(device,dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device,dtype=torch.long)
            targets = data['targets'].to(device,dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            # if _%150==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 150 steps: {loss_step}")
            print(f"Training Accuracy per 150 steps: {accu_step}")

            optimizer.zero_grad()
            loss.backward()
            # # When using GPU
            optimizer.step()
        
    nb_tr_examples = train_size
    nb_tr_steps = (train_size // TRAIN_BATCH_SIZE)+1
    print(f'The Total Accuracy for Epoch {epochs}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [None]:
EPOCHS = 3 # make it 100 
# import transformers
# transformers.TrainingArguments(num_train_epochs=2,output_dir="")
for epoch in range(EPOCHS):
    train(epoch)

The Total Accuracy for Epoch 0: 0.0
Training Loss Epoch: 0.0
Training Accuracy Epoch: 0.0


0it [00:00, ?it/s]


RuntimeError: ignored

In [None]:
### Validation

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%50==0: # 70-80
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 50 steps: {loss_step}")
                print(f"Validation Accuracy per 50 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [None]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

1it [00:04,  4.01s/it]

Validation Loss per 50 steps: 0.04947724938392639
Validation Accuracy per 50 steps: 100.0


51it [02:33,  2.89s/it]

Validation Loss per 50 steps: 0.1444336045716031
Validation Accuracy per 50 steps: 95.09803921568627


101it [05:00,  2.87s/it]

Validation Loss per 50 steps: 0.1451419404071599
Validation Accuracy per 50 steps: 94.05940594059406


151it [07:25,  2.87s/it]

Validation Loss per 50 steps: 0.15357861595092626
Validation Accuracy per 50 steps: 94.5364238410596


201it [09:51,  2.87s/it]

Validation Loss per 50 steps: 0.14655140867058317
Validation Accuracy per 50 steps: 94.65174129353234


251it [12:16,  2.88s/it]

Validation Loss per 50 steps: 0.1340377499123078
Validation Accuracy per 50 steps: 95.11952191235059


292it [14:15,  2.93s/it]

Validation Loss Epoch: 0.13453946218902424
Validation Accuracy Epoch: 95.11568123393316
Accuracy on test data = 95.12%





In [None]:
from sklearn.metrics import f1_score
f1_score()