# BERT

Author: Alikhan Semembayev

## 1. Perform necessary data preprocessing, e.g. removing punctuation and stop words, stemming, lemmatizing. You may use the outputs from previous weekly assignments. (10 points)

In [None]:
from collections import defaultdict
import demoji
import svgling
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from autocorrect import Speller
import re

# Initialize tools
spell = Speller()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

email_re = r"\b[A-Za-z]+@\S+\b"
ssn_re = r"\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b"
ip_re = r"\b\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}\b"

street_number_re = r"^\d{1,}"
street_name_re = r"[a-zA-Z0-9\s]+,?"
city_name_re = r" [a-zA-Z]+(\,)?"
state_abbrev_re = r" [A-Z]{2}"
postal_code_re = r" [0-9]{5}$"
address_pattern_re = r"" + street_number_re + street_name_re + city_name_re + state_abbrev_re + postal_code_re


def clean_text(text):
    # Replace emojis
    text = demoji.replace(text)

    # Remove smart quotes and dashes
    text = text.replace("“", "\"").replace("”", "\"").replace("-", " ").replace("'", " ")

    # Lowercase text
    text = text.lower()

    # Tokenize text
    words = word_tokenize(text)
    # print(words)

    # Spelling correction + replace all t with not
    words = ['not' if word == 't' else (
        'ADDRESS' if re.match(address_pattern_re, word)
        else (
            'EMAIL' if re.match(email_re, word)
            else (
                'SSN' if re.match(ssn_re, word)
                else (
                    'IP' if re.match(ip_re, word)
                    else spell(word)
                )
            )
        )
    ) for word in words]

    # Remove stop words and non-alphabetic tokens and punctuation
    words = [word for word in words if word.isalnum() and word not in stop_words or word in ['not', 'no']]

    # POS tagging and Lemmatization
    tagged_words = pos_tag(words)

    tag_map = defaultdict(lambda: "n")
    tag_map["N"] = "n"
    tag_map["V"] = "v"
    tag_map["J"] = "a"
    tag_map["R"] = "r"

    words = [lemmatizer.lemmatize(word, pos=tag_map[tag[0]]) for word, tag in tagged_words]

    # Return cleaned words as a single string
    return ' '.join(words)

In [None]:
import pandas as pd

data = (pd.read_csv('../../../../data/text/combined_raw.csv'))
data = data.dropna(how='any')

for row in data.values:
    row[0] = clean_text(row[0])

data.to_csv('../../../../data/text/combined_cleaned.csv', index=False)

In [43]:
import pandas as pd

data = (pd.read_csv('../../../../data/text/combined_cleaned.csv'))
data = data.dropna(how='any')

print(data.head(10))

                                                text    emotion
0  freshwater fish drink water skin via osmosis s...      happy
1  think everyone must use daily become grained e...    neutral
2  agree google headquarters mountain view califo...    neutral
3  thats funny current ceo sunday ficha didnt kno...    neutral
4  oh yeah not know either also want go google al...  surprised
5                                                say  surprised
6        yeah apparently lol instead hire people row      happy
7  thats funny guess imaginative leave huge tech ...  surprised
8  yeah exactly sure cheap one thing bet not expl...  surprised
9  remember hearing immortality waste jellyfish h...    neutral


## 2. For the binary classification problem you came up previously, build your own model by combining BERT with a classifier.  (30 points)

In [45]:
import pandas as pd

# Set the maximum rows per label
max_rows_per_label = 10000

# Sample rows for each label
balanced_data = data.groupby('emotion', group_keys=False).apply(lambda x: x.sample(n=min(len(x), max_rows_per_label)))

# Save or use the balanced data
balanced_data.to_csv("../../../../data/text/combined_cleaned_balanced_dataset.csv", index=False)

In [62]:
import pandas as pd

data = (pd.read_csv('../../../../data/text/combined_cleaned_multilabel.csv'))
data = data.dropna(how='any')

print(data.head(10))

                                                text  is_happy  is_surprised  \
0  freshwater fish drink water skin via osmosis s...         1             0   
1  think everyone must use daily become grained e...         0             0   
2  agree google headquarters mountain view califo...         0             0   
3  thats funny current ceo sunday ficha didnt kno...         0             0   
4  oh yeah not know either also want go google al...         0             1   
5                                                say         0             1   
6        yeah apparently lol instead hire people row         1             0   
7  thats funny guess imaginative leave huge tech ...         0             1   
8  yeah exactly sure cheap one thing bet not expl...         0             1   
9  remember hearing immortality waste jellyfish h...         0             0   

   is_neutral  is_sad  is_fear  is_angry  is_disgust  
0           0       0        0         0           0  
1        

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from transformers import BertModel

# Custom Dataset Class
class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.texts[idx],
            # 'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

class EmotionClassifier(nn.Module):
    def __init__(self, num_classes):
        super(EmotionClassifier, self).__init__()
        
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
        
        # Freeze BERT parameters
        for param in self.bert.bert.parameters():
            param.requires_grad = False
        
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits

In [75]:
# Preprocess labels
label_encoder = LabelEncoder()
data['label'] = data['is_happy']
num_classes = 2

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 50

def tokenize_texts(texts):
    encodings = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings['input_ids'].tolist()

# Convert texts to token IDs
data['input_ids'] = tokenize_texts(data['text'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    data['input_ids'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42
)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [76]:
# Create PyTorch datasets
train_dataset = EmotionDataset(X_train, y_train)
test_dataset = EmotionDataset(X_test, y_test)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

## 3. Train your own model by fine-tuning BERT. And save your model and use it to classify sentences (50 points)

In [77]:
# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model, loss, and optimizer
model = EmotionClassifier(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        
        # Move data to device
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        if batch_idx % 10 == 0 or batch_idx == len(train_loader):
            print(f"Batch {batch_idx}/{len(train_loader)}: Loss = {loss.item()}")
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch 10/922: Loss = 0.5630530714988708
Batch 20/922: Loss = 0.6131772994995117
Batch 30/922: Loss = 0.6327952146530151
Batch 40/922: Loss = 0.5326545834541321
Batch 50/922: Loss = 0.6016024947166443
Batch 60/922: Loss = 0.6733378767967224
Batch 70/922: Loss = 0.5222554802894592
Batch 80/922: Loss = 0.6361863613128662
Batch 90/922: Loss = 0.6165667772293091
Batch 100/922: Loss = 0.6207923889160156
Batch 110/922: Loss = 0.6651461124420166
Batch 120/922: Loss = 0.5742948651313782
Batch 130/922: Loss = 0.607204258441925
Batch 140/922: Loss = 0.6520662903785706
Batch 150/922: Loss = 0.6315717697143555
Batch 160/922: Loss = 0.5946286916732788
Batch 170/922: Loss = 0.6228736639022827
Batch 180/922: Loss = 0.6310967206954956
Batch 190/922: Loss = 0.6357423067092896
Batch 200/922: Loss = 0.6597950458526611
Batch 210/922: Loss = 0.6262190341949463
Batch 220/922: Loss = 0.6458306312561035
Batch 230/922: Loss = 0.5219413042068481
Batch 240/922: Loss = 0.5974372625350952
Batch 250/922: Loss = 0.56

In [79]:
# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        # Move data to device
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1)
        
        # Move predictions and labels back to CPU for evaluation
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# # Classification Report
# print("Classification Report:")
# # print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division="warn"))
# print("Accuracy:", accuracy_score(all_labels, all_preds))

from sklearn.metrics import classification_report

print("Classification Report:")
print("Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=['Not happy', 'Happy']))


Classification Report:
Accuracy: 0.7088139503324739
              precision    recall  f1-score   support

   Not happy       0.71      1.00      0.83     20896
       Happy       0.46      0.00      0.00      8580

    accuracy                           0.71     29476
   macro avg       0.58      0.50      0.42     29476
weighted avg       0.64      0.71      0.59     29476



In [80]:
# Define a path to save the model
model_save_path = "emotion_classifier_model.pth"

# Save the model state dictionary
torch.save(model.state_dict(), model_save_path)

In [81]:
# Instantiate a new model instance
loaded_model = EmotionClassifier(num_classes).to(device)

# Load the saved state dictionary
loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))

# Set the model to evaluation mode if you’re planning to evaluate or make predictions
loaded_model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EmotionClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_feature

In [82]:
# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        # Move data to device
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1)
        
        # Move predictions and labels back to CPU for evaluation
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        
from sklearn.metrics import classification_report

print("Classification Report:")
print("Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=['Not happy', 'Happy']))

Classification Report:
Accuracy: 0.7088139503324739
              precision    recall  f1-score   support

   Not happy       0.71      1.00      0.83     20896
       Happy       0.46      0.00      0.00      8580

    accuracy                           0.71     29476
   macro avg       0.58      0.50      0.42     29476
weighted avg       0.64      0.71      0.59     29476



## 4. Summarize what you have learned and discovered from Task 1-3. (10 points)

1. Preprocessing is very important stage of development. Using stemming and lemmatization, along with removing stop words, helps improve text data representation. It decreases the size of dataset by removing unnecessary information, and optimizes it for training.
2. Bart can be used as a layer of MLP. It has its own dropout. But we added another dropout layer. Also, we removed all other fully connected layers.
3. We have very big amount of data. To save time we used high learning rate and small number of epochs for training. THe accuracy of the model predictions is 71%. 
4. Binary classification has better accuracy, because of fewer labels(possible outputs). Multicalss problem is more difficult and requires more time.
5. It is possible to save the model with Bert as one of layers and load it for later use.