In [None]:
# Install necessary libraries
!pip install torch torchtext transformers

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [None]:


# Load IMDb dataset
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz



--2024-11-19 07:02:51--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2024-11-19 07:03:18 (3.07 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



# Part 1: Fine tuning a pre-trained BERT

This code is fine-tuning a pre-trained BERT model on the IMDb dataset for sentiment analysis. It starts with a BERT model that is already pre-trained on a large text corpus (from bert-base-uncased), which has learned general language representations. The code loads this pre-trained model and then trains it further on the IMDb dataset specifically for sentiment analysis (a binary classification task). This process adjusts the weights of the model to better predict positive or negative sentiment based on the IMDb data.

Training BERT from scratch would mean initializing a BERT model with random weights and training it on a very large corpus (like Wikipedia or BooksCorpus) to learn language representations, which is computationally intensive. Fine-tuning, on the other hand, starts from a pre-trained model and requires much less data and computation. So, this code fine-tunes the model rather than training from scratch. All layers of BERT are trainable during fine-tuning, it doesn’t freeze the pre-trained layers. This means that the weights of the entire BERT model (not just the final classification layer) are updated during training on the IMDb dataset.

In [None]:
# Import libraries and download the data

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, BertForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from transformers import pipeline

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Training function
def train_model(model, train_loader, optimizer, scheduler, epochs=3):
    model.train()
    for epoch in range(epochs):

        # Listing trainable parameters
        trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
        print(f"Number of trainable parameters: {len(trainable_params)}")

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        print(f'Epoch {epoch + 1}/{epochs} Loss: {loss.item()}')

# Evaluation
def evaluate_model(model, test_loader):
    model.eval()
    correct_predictions = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    accuracy = correct_predictions.double() / len(test_loader.dataset)
    print(f'Accuracy: {accuracy:.4f}')


def load_imdb_data(path):
    data = []
    for label in ['pos', 'neg']:
        dir_path = f"{path}/{label}"
        for file in os.listdir(dir_path):
            with open(f"{dir_path}/{file}", 'r', encoding='utf-8') as f:
                text = f.read()
                data.append((1 if label == 'pos' else 0, text))
    return pd.DataFrame(data, columns=['label', 'text'])


# Data Preparation - Tokenizing and padding
class IMDBDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, text = self.data.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


train_data = load_imdb_data('aclImdb/train')
test_data = load_imdb_data('aclImdb/test')

# Tokenizer and pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)


# Create DataLoaders
BATCH_SIZE = 8
train_dataset = IMDBDataset(train_data, tokenizer)
test_dataset = IMDBDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Training setup
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * EPOCHS  # 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



# Training the model
train_model(model, train_loader, optimizer, scheduler, EPOCHS)


# Evaluating the model
evaluate_model(model, test_loader)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 201
Epoch 1/3 Loss: 0.3474839925765991
Number of trainable parameters: 201
Epoch 2/3 Loss: 0.7089992165565491
Number of trainable parameters: 201
Epoch 3/3 Loss: 0.835295557975769
Accuracy: 0.9160


# Part 2: Gradual unfreezing


Gradual unfreezing is a training strategy used in transfer learning. The idea is to fine-tune a pretrained model by initially freezing most of its layers, training the top (or newly added) layers, and progressively unfreezing more layers as training progresses. This allows the model to first learn the new task in a stable manner before updating the weights of the earlier, more general layers.

When you load a pretrained model (like BERT), it has many layers, often with millions of parameters. These pretrained weights are optimized for a general task (e.g., language modeling in BERT). Initially, you freeze all layers except for the task-specific classifier (e.g., the final linear layer) by setting requires_grad=False for those layers. This ensures only the classifier is updated during the initial training phase.


As training progresses, you progressively unfreeze earlier layers in the model, allowing their weights to be fine-tuned. This gradual unfreezing avoids destabilizing the model by ensuring that earlier, more general layers are only updated after the task-specific layers have been refined.

* Each epoch (or after a certain number of epochs), you unfreeze one or more layers.
* This can be done sequentially, layer by layer, or in groups (e.g., unfreezing an entire block of layers).


In [None]:
# Gradual unfreezing for sentiment analysis and question answering with BERT on IMDb

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, BertForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_imdb_data(path):
    data = []
    for label in ['pos', 'neg']:
        dir_path = f"{path}/{label}"
        for file in os.listdir(dir_path):
            with open(f"{dir_path}/{file}", 'r', encoding='utf-8') as f:
                text = f.read()
                data.append((1 if label == 'pos' else 0, text))
    return pd.DataFrame(data, columns=['label', 'text'])


# Freeze all layers initially
for param in model.parameters():
    param.requires_grad = False

# keep the classifier layer trainable
for name, param in model.named_parameters():
    if "classifier" in name:
        param.requires_grad = True


# Unfreeze layers gradually
def unfreeze_layers(model, epoch):
    # Gradually unfreeze one encoder layer per epoch, starting from the last layer
    if epoch < len(model.bert.encoder.layer):
        for param in model.bert.encoder.layer[-(epoch + 1)].parameters():
            param.requires_grad = True

# Training function
def train_model(model, train_loader, optimizer, scheduler, epochs=3):
    model.train()
    for epoch in range(epochs):
        unfreeze_layers(model, epoch)  # Gradual unfreezing

        # Debugging trainable parameters
        trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
        print(f"Number of trainable parameters: {len(trainable_params)}")

        # Define optimizer with trainable parameters
        optimizer = AdamW(trainable_params, lr=2e-5)

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        print(f'Epoch {epoch + 1}/{epochs} Loss: {loss.item()}')


# Evaluation
def evaluate_model(model, test_loader):
    model.eval()
    correct_predictions = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    accuracy = correct_predictions.double() / len(test_loader.dataset)
    print(f'Accuracy: {accuracy:.4f}')


# Data Preparation - Tokenizing and padding
class IMDBDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, text = self.data.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


train_data = load_imdb_data('aclImdb/train')
test_data = load_imdb_data('aclImdb/test')

# Initialize the BERT tokenizer and model for sentiment analysis
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)


# Create DataLoaders
BATCH_SIZE = 8
train_dataset = IMDBDataset(train_data, tokenizer)
test_dataset = IMDBDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Training setup
EPOCHS = 3
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * EPOCHS  # 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



# Train the model
train_model(model, train_loader, optimizer, scheduler, EPOCHS)

# Evaluate the model
evaluate_model(model, test_loader)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 201
Epoch 1/3 Loss: 0.481783926486969
Number of trainable parameters: 201
Epoch 2/3 Loss: 0.014164907857775688
Number of trainable parameters: 201


# Part 3: Question answering

In [None]:
import torch
import random
import os
import tarfile
from transformers import BertTokenizer, BertForSequenceClassification, BertForQuestionAnswering
from torch.utils.data import DataLoader, Dataset

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and models
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentiment_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
qa_model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad").to(device)

# Prepare data: extract 5 random reviews and labels
def load_reviews(path, label):
    reviews = []
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), "r", encoding="utf-8") as f:
            reviews.append((f.read(), label))
    return reviews

# Function to ask questions about sentiment
def ask_bert_review(text, question):
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt").to(device)
    answer_start_scores, answer_end_scores = qa_model(**inputs, return_dict=False)

    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer


# Load positive and negative reviews
positive_reviews = load_reviews("aclImdb/test/pos", label=1)
negative_reviews = load_reviews("aclImdb/test/neg", label=0)


# Combine and sample reviews
all_reviews = positive_reviews + negative_reviews
random.shuffle(all_reviews)
sample_reviews = random.sample(all_reviews, 5)

# Analyze reviews
results = []

for review, label in sample_reviews:
    # Sentiment classification using pretrained bert - not fine tuned
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    sentiment_output = sentiment_model(**inputs)
    sentiment = torch.argmax(sentiment_output.logits, dim=1).item()
    sentiment_label = "positive" if sentiment == 1 else "negative"
    actual_label = "positive" if label == 1 else "negative"


    # Store results
    results.append({
        "review": review,
        "predicted_sentiment": sentiment_label,
        "actual_sentiment": actual_label,
        "reason": ask_bert_review(review, f"Why is this review {sentiment_label}?")
    })

# Print results
for i, result in enumerate(results):
    print(f"Review {i + 1}:")
    print(f"Text: {result['review'][:300]}...")  # Truncate for display
    print(f"Predicted Sentiment: {result['predicted_sentiment']}")
    print(f"Actual Sentiment: {result['actual_sentiment']}")
    print(f"Reason: {result['reason']}")
    print("\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Review 1:
Text: By 1971 it was becoming more and more obvious that Hammer film studios were on the way out . HANDS OF THE RIPPER is a case in point where even the idea smacks of desperation - The spirit of Jack The Ripper posses his own daughter ! Yeah okay no one was expecting a documentary but this plot seems to ...
Predicted Sentiment: positive
Actual Sentiment: negative
Reason: 


Review 2:
Text: I'm one of those gluttons for punishment when it comes to sitcoms these days-I still will check them out every once in a while.My observation is that most of them aren't very funny even the ones on major networks that are getting high ratings ,I just don't get who is finding them gut busting funny. ...
Predicted Sentiment: positive
Actual Sentiment: negative
Reason: [CLS] why is this review positive ? [SEP]


Review 3:
Text: Ernesto is a man that makes a living out of duping other solid citizens of their hard earned money. Together with Manco, an older man with a lot of experience, he pull

# TO-DO



---

## Parts 1 and 2

* **Experiment with Different Pretrained Models**:
   - Try other transformer-based models like `distilbert-base-uncased` or `roberta-base`. Compare their performance with `bert-base-uncased`.

* **Adjust Hyperparameters**:
   - Experiment with learning rates (e.g., `1e-5`, `5e-5`, `2e-6`) and batch sizes to observe their impact on model performance.
   - Evaluate how changing the number of epochs affects validation accuracy and loss.


* **Comparative Analysis**:
   - Compare the performance of fine-tuning with gradual unfreezing to fine-tuning with all layers unfrozen from the start.
   - Discuss the computational trade-offs (time vs. accuracy) and performance differences.


---

## Part 3

* **Manual Review**:
   - Manually analyze a few QA outputs to assess whether the model’s reasoning aligns with human judgment. Highlight discrepancies and discuss their causes.

* **Fine-tuned BERT**:
   - Replace the simple pretrained BERT with your best fine-tuned model and/or replace BERT with other transformer-based model for Q&A

* **Prompt Engineering**:
   - Experiment with different phrasing for the question (e.g., "What makes this review positive/negative?" or "Explain the sentiment of this review."). Discuss how the phrasing affects the model's answers.

* **Entity-Specific Analysis**:
   - Modify the question to focus on specific entities or aspects in the review (e.g., "What does the review say about the acting?" or "Why is the plot criticized?"). Discuss the model’s ability to handle nuanced questions.
