In [1]:
import pandas as pd

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import gensim.downloader
from scipy.sparse import hstack
import joblib
from torchsummary import summary

import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

import time

In [2]:
annotated_data = pd.read_csv('data/incomplete_annotations_data2.csv')

# annotated_data = full_data[full_data['Subjectivity'].notnull()]
# unannotated_data = full_data[full_data['Subjectivity'].isnull()]

In [3]:
annotated_data['Subjectivity'].value_counts()

Subjectivity
1.0    1560
0.0    1123
Name: count, dtype: int64

In [4]:
annotated_data['Polarity'].value_counts()

Polarity
1.0    876
0.0    684
Name: count, dtype: int64

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
print(torch.__version__)

2.2.2


# Preprocessing Data
- Lowercasing
- Removing stopwords
- Replacing emoji and slang/abbreviations with their text counterparts

- Mispellings

In [7]:
annotated_data[annotated_data['Comment'].isnull()]

Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,Subjectivity,Polarity,Subjectivity 2,Polarity 2
2590,JW Anderson,JW Anderson,,Instagram,{'Likes_and_timestamp': '0 likes on 2023-11-20...,0.0,,1,0


In [8]:
annotated_data = annotated_data.dropna(subset=['Comment'])
print(annotated_data.isnull().sum())

Brand              103
Search Term        174
Comment              0
Source               0
Metadata           234
Subjectivity         0
Polarity          1122
Subjectivity 2       0
Polarity 2           0
dtype: int64


In [9]:
with open('abbreviations_list.pkl', 'rb') as file:
    abbreviations = pickle.load(file)

print(abbreviations)

{"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'migh

In [10]:
# Creating extra column for preprocessed text
annotated_data['Preprocessed Comment'] = annotated_data['Comment']

In [11]:
# Normalizing emojis
import emoji

def demojize_with_delimiters(text):
    return emoji.demojize(text, delimiters=(" ", " "))

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: demojize_with_delimiters(x) if isinstance(x, str) else x)

In [12]:
# Lowercasing

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: x.lower() if isinstance(x, str) else x)


In [13]:
# Removing stopwords
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    # Ensure the input is a string
    if isinstance(text, str):
        # Tokenize the text into words
        words = nltk.word_tokenize(text)
        
        # Get the list of stopwords
        stop_words = set(stopwords.words('english'))
        
        # Remove stopwords from the tokenized words
        filtered_words = [word for word in words if word.lower() not in stop_words]
        
        # Join the filtered words back into a single string
        filtered_text = ' '.join(filtered_words)
        
        return filtered_text
    else:  
        return text

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Function to manually tokenize text including punctuations
def custom_tokenize(text):
    # Regex pattern to match words (including contractions) and separate punctuation
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    return tokens

# Normalize slangs and abbreviations
def normalize_slangs_abbreviations_custom(text, slang_dict):
    if isinstance(text, str):
        tokens = custom_tokenize(text)
        normalized_tokens = [slang_dict.get(token.lower(), token) for token in tokens]
        # Reconstruct the text
        normalized_text = ' '.join(normalized_tokens).replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?")
        return normalized_text
    else:
        return text

annotated_data['Preprocessed Comment'] = annotated_data['Preprocessed Comment'].apply(lambda x: normalize_slangs_abbreviations_custom(x, abbreviations))

In [15]:
# annotated_data = full_data[full_data['Subjectivity'].notnull()]
# unannotated_data = full_data[full_data['Subjectivity'].isnull()]

In [16]:
annotated_data

Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,Subjectivity,Polarity,Subjectivity 2,Polarity 2,Preprocessed Comment
0,Nike,waste,Designing products with sustainability in mind...,Twitter,"{'Name': 'Angla Sicurella', 'Handle': '@AnglaS...",0.0,,1,0,"designing products sustainability mind, like n..."
1,Nike,waste,Kirby would have been a waste of time - why ev...,Twitter,"{'Name': 'LisaKingWheless', 'Handle': '@Lisapc...",1.0,0.0,1,1,kirby would waste time even ask? plus adds coa...
2,Nike,waste,I wouldn’t spend another dollar at that theate...,Twitter,"{'Name': 'Sheila McSheilerton', 'Handle': '@sh...",1.0,0.0,1,1,spend another dollar theater. like buy nike gr...
3,Nike,waste,Call them back and tell them they’re lying bec...,Twitter,"{'Name': 'UncleChrissy', 'Handle': '@uncle_chr...",1.0,0.0,1,1,call back tell lying already. trying get real ...
4,Nike,waste,I’m really sitting here going in on myself..li...,Twitter,"{'Name': 'Jade ☥', 'Handle': '@jmerarity', 'Ti...",1.0,1.0,0,0,really sitting going.. like really going let b...
...,...,...,...,...,...,...,...,...,...,...
2678,Louis Vuitton,Louis Vuitton,❤️❤️❤️,Instagram,{'Likes_and_timestamp': '0 likes on 2024-01-17...,1.0,1.0,0,0,red_heart red_heart red_heart
2679,Tory Burch,Tory Burch,The pale pink in the 6th look is EVERYTHINGGGG...,Instagram,{'Likes_and_timestamp': '0 likes on 2023-09-16...,1.0,1.0,1,1,pale pink 6th look everythinggggg. cherry_blossom
2680,Yeezy,Yeezy,He said it himself this isn't the real Kanye s...,Instagram,{'Likes_and_timestamp': '0 likes on 2024-02-27...,0.0,,1,0,said n't real kanye care imposter saying face_...
2681,Gucci,Gucci,😍😍😍,Instagram,{'Likes_and_timestamp': '3 likes on 2023-09-23...,1.0,1.0,1,1,smiling_face_with_heart eyes smiling_face_with...


In [17]:
print(annotated_data['Comment'].iloc[20])
print(annotated_data['Preprocessed Comment'].iloc[20])

WHY is Hermes even getting involved at the Lotus casino, seems like a damn waste of time – tho I know they're probably trying to give Luke more backstory before the finale
hermes even getting involved lotus casino, seems like damn waste time though know 're probably trying give luke backstory finale


# RoBERTa

## Subjectivity Detection

In [None]:
annotated_texts = annotated_data['Comment'].tolist()
annotated_labels = annotated_data['Polarity'].tolist()

# Tokenize texts using BERT tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoded_data = tokenizer(annotated_texts, padding=True, truncation=True, return_tensors='pt')

# Extract attention masks
attention_masks = encoded_data['attention_mask']

# Convert labels to tensor
labels = torch.tensor(annotated_labels)

# Split the annotated data into train and validation sets
train_texts, val_texts, train_labels, val_labels, train_masks, val_masks = train_test_split(encoded_data['input_ids'], labels, attention_masks, test_size=0.3, random_state=42)

# Define DataLoader for training and validation sets
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
val_dataset = TensorDataset(val_texts, val_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [3]:
# Initialize BERT model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.to(device)
summary(model)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       590,592
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       1,538
Total params: 124,647,170
Trainable params: 124,647,170
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       590,592
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       1,538
Total params: 124,647,170
Trainable params: 124,647,170
Non-trainable params: 0

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

best_val_loss = float('inf')  # Initialize with positive infinity
best_val_accuracy = 0.0
best_epoch = 0
patience = 5  # Number of epochs to wait for improvement

no_improvement_count = 0

start_time = time.time()

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)  # No need to pass labels here
        logits = outputs.logits

        labels = labels.long()
        
        loss = F.cross_entropy(logits, labels)  # Compute cross-entropy loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)  # No need to pass labels during evaluation
            logits = outputs.logits

            labels = labels.long()

            val_loss += F.cross_entropy(logits, labels).item()
            
            val_preds.extend(torch.argmax(logits, dim=1).tolist())
            val_targets.extend(labels.tolist())
    
    val_loss /= len(val_loader)
    val_accuracy = sum(1 for p, t in zip(val_preds, val_targets) if p == t) / len(val_preds)

    precision = precision_score(val_targets, val_preds)
    recall = recall_score(val_targets, val_preds)
    f1 = f1_score(val_targets, val_preds)
    
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}, Val Precision: {precision}, Val Recall: {recall}, Val F1: {f1}")

    # Update best validation loss and accuracy
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_accuracy = val_accuracy
        best_val_precision = precision
        best_val_recall = recall
        best_val_f1 = f1

        best_epoch_loss = epoch + 1
        no_improvement_count = 0

        best_model_path = 'models/best_roberta_adamw_subjectivity2.pth'
        torch.save(model.state_dict(), best_model_path)
    else:
        no_improvement_count += 1
    
    if no_improvement_count >= patience:
        print(f"No improvement for {patience} epochs. Early stopping...")
        break

end_time = time.time()

print(f"Best Validation Loss: {best_val_loss} at Epoch {best_epoch_loss}")
print(f"Best Validation Accuracy: {best_val_accuracy} at Epoch {best_epoch_loss}")
print(f"Best Validation Precision: {best_val_precision} at Epoch {best_epoch_loss}")
print(f"Best Validation Recall: {best_val_recall} at Epoch {best_epoch_loss}")
print(f"Best Validation F1: {best_val_f1} at Epoch {best_epoch_loss}")

print(f"Time taken to train the model: {end_time - start_time:.2f} seconds")


- Best Validation Loss: 0.5012361325469672 at Epoch 3
- Best Validation Accuracy: 0.7763975155279503 at Epoch 3
- Best Validation Precision: 0.8590078328981723 at Epoch 3
- Best Validation Recall: 0.7230769230769231 at Epoch 3
- Best Validation F1: 0.7852028639618138 at Epoch 3
- Time taken to train the model: 35304.90 seconds

## Polarity Detection

In [18]:
# Preprocess the annotated data (assuming it has columns 'text' and 'polarity')
annotated_polarity_data = annotated_data[annotated_data['Subjectivity']==1]

annotated_texts = annotated_polarity_data['Comment'].tolist()
annotated_labels = annotated_polarity_data['Polarity'].tolist()

# Tokenize texts using BERT tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoded_data = tokenizer(annotated_texts, padding=True, truncation=True, return_tensors='pt')

# Extract attention masks
attention_masks = encoded_data['attention_mask']

# Convert labels to tensor
labels = torch.tensor(annotated_labels)

# Split the annotated data into train and validation sets
train_texts, val_texts, train_labels, val_labels, train_masks, val_masks = train_test_split(encoded_data['input_ids'], labels, attention_masks, test_size=0.3, random_state=42)

# Define DataLoader for training and validation sets
train_dataset = TensorDataset(train_texts, train_masks, train_labels)
val_dataset = TensorDataset(val_texts, val_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize BERT model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

best_val_loss = float('inf')  # Initialize with positive infinity
best_val_accuracy = 0.0
best_epoch = 0
patience = 5  # Number of epochs to wait for improvement

no_improvement_count = 0

start_time = time.time()

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)  # No need to pass labels here
        logits = outputs.logits

        labels = labels.long()
        
        loss = F.cross_entropy(logits, labels)  # Compute cross-entropy loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)  # No need to pass labels during evaluation
            logits = outputs.logits

            labels = labels.long()

            val_loss += F.cross_entropy(logits, labels).item()
            
            val_preds.extend(torch.argmax(logits, dim=1).tolist())
            val_targets.extend(labels.tolist())
    
    val_loss /= len(val_loader)
    val_accuracy = sum(1 for p, t in zip(val_preds, val_targets) if p == t) / len(val_preds)

    precision = precision_score(val_targets, val_preds)
    recall = recall_score(val_targets, val_preds)
    f1 = f1_score(val_targets, val_preds)
    
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}, Val Precision: {precision}, Val Recall: {recall}, Val F1: {f1}")

    # Update best validation loss and accuracy
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_accuracy = val_accuracy
        best_val_precision = precision
        best_val_recall = recall
        best_val_f1 = f1

        best_epoch_loss = epoch + 1
        no_improvement_count = 0

        best_model_path = 'models/best_roberta_adamw_polarity2.pth'
        torch.save(model.state_dict(), best_model_path)
    else:
        no_improvement_count += 1
    
    if no_improvement_count >= patience:
        print(f"No improvement for {patience} epochs. Early stopping...")
        break

end_time = time.time()

print(f"Best Validation Loss: {best_val_loss} at Epoch {best_epoch_loss}")
print(f"Best Validation Accuracy: {best_val_accuracy} at Epoch {best_epoch_loss}")
print(f"Best Validation Precision: {best_val_precision} at Epoch {best_epoch_loss}")
print(f"Best Validation Recall: {best_val_recall} at Epoch {best_epoch_loss}")
print(f"Best Validation F1: {best_val_f1} at Epoch {best_epoch_loss}")

print(f"Time taken to train the model: {end_time - start_time:.2f} seconds")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 69/69 [38:17<00:00, 33.29s/it] 


Epoch 1: Train Loss: 45.723883748054504, Val Loss: 0.662416026989619, Val Accuracy: 0.5662393162393162, Val Precision: 0.5577342047930284, Val Recall: 1.0, Val F1: 0.7160839160839161


Epoch 2: 100%|██████████| 69/69 [37:17<00:00, 32.43s/it]


Epoch 2: Train Loss: 42.73330116271973, Val Loss: 0.5174851556619008, Val Accuracy: 0.7735042735042735, Val Precision: 0.7906976744186046, Val Recall: 0.796875, Val F1: 0.7937743190661478


Epoch 3: 100%|██████████| 69/69 [37:21<00:00, 32.48s/it]


Epoch 3: Train Loss: 31.557389616966248, Val Loss: 0.4945437297224998, Val Accuracy: 0.811965811965812, Val Precision: 0.7781456953642384, Val Recall: 0.91796875, Val F1: 0.8422939068100358


Epoch 4: 100%|██████████| 69/69 [37:19<00:00, 32.46s/it]


Epoch 4: Train Loss: 24.202057898044586, Val Loss: 0.47943010727564495, Val Accuracy: 0.7692307692307693, Val Precision: 0.8303571428571429, Val Recall: 0.7265625, Val F1: 0.775


Epoch 5: 100%|██████████| 69/69 [37:20<00:00, 32.48s/it]


Epoch 5: Train Loss: 22.492990363389254, Val Loss: 0.49837560604015985, Val Accuracy: 0.7692307692307693, Val Precision: 0.7283950617283951, Val Recall: 0.921875, Val F1: 0.8137931034482758


Epoch 6: 100%|██████████| 69/69 [37:18<00:00, 32.45s/it]


Epoch 6: Train Loss: 18.950306314975023, Val Loss: 0.5018869072198868, Val Accuracy: 0.8012820512820513, Val Precision: 0.8146718146718147, Val Recall: 0.82421875, Val F1: 0.8194174757281554


Epoch 7: 100%|██████████| 69/69 [37:16<00:00, 32.42s/it]


Epoch 7: Train Loss: 20.494622353464365, Val Loss: 0.5270366872350375, Val Accuracy: 0.7692307692307693, Val Precision: 0.743421052631579, Val Recall: 0.8828125, Val F1: 0.8071428571428572


Epoch 8: 100%|██████████| 69/69 [37:17<00:00, 32.42s/it]


Epoch 8: Train Loss: 10.931127285584807, Val Loss: 0.8575857601128518, Val Accuracy: 0.7414529914529915, Val Precision: 0.8857142857142857, Val Recall: 0.60546875, Val F1: 0.7192575406032483


Epoch 9: 100%|██████████| 69/69 [37:18<00:00, 32.45s/it]


Epoch 9: Train Loss: 9.367206977214664, Val Loss: 0.6195476113508145, Val Accuracy: 0.8076923076923077, Val Precision: 0.8051470588235294, Val Recall: 0.85546875, Val F1: 0.8295454545454546
No improvement for 5 epochs. Early stopping...
Best Validation Loss: 0.47943010727564495 at Epoch 4
Best Validation Accuracy: 0.7692307692307693 at Epoch 4
Best Validation Precision: 0.8303571428571429 at Epoch 4
Best Validation Recall: 0.7265625 at Epoch 4
Best Validation F1: 0.775 at Epoch 4
Time taken to train the model: 21695.17 seconds
