# Task 2: Subjectivity Classification

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [7]:
# Load datasets
train_data = pd.read_csv("train_en.tsv", sep="\t")
test_data = pd.read_csv("test_en_gold.tsv", sep="\t")

# Inspect the data
print(train_data.head())
print(test_data.head())
print(train_data.columns)


                            sentence_id  \
0  b9e1635a-72aa-467f-86d6-f56ef09f62c3   
1  f99b5143-70d2-494a-a2f5-c68f10d09d0a   
2  4076639c-aa56-4202-ae0f-9d9217f8da68   
3  b057c366-698e-419d-a284-9b16d835c64e   
4  a5a9645e-7850-41ba-90a2-5def725cd5b8   

                                            sentence label  solved_conflict  
0  Gone are the days when they led the world in r...  SUBJ             True  
1  The trend is expected to reverse as soon as ne...   OBJ            False  
2             But there is the specious point again.   OBJ            False  
3  He added he wouldn’t be surprised to see a new...   OBJ            False  
4  Not less government, you see; the same amount ...  SUBJ            False  
                            sentence_id  \
0  8745d4da-91c9-4538-acee-b0e7b1c413fd   
1  43de04ad-d0ac-4852-9b4e-cf0bca066188   
2  e00b66ee-720a-47e3-a0fb-0e2445b89af6   
3  0b95d635-f821-45dd-9f33-b05d63629195   
4  5ba3117b-3ef9-4815-acb4-a263d3c816bc   

              

In [16]:
from transformers import BertTokenizer

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(data, tokenizer, max_len=128):
    inputs = tokenizer(
        list(data["sentence"]),
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = torch.tensor(data["label"].values)
    return inputs, labels

# Preprocess training and testing data
train_inputs, train_labels = preprocess_data(train_data, tokenizer)
test_inputs, test_labels = preprocess_data(test_data, tokenizer)


In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_data["label"] = label_encoder.fit_transform(train_data["label"])
test_data["label"] = label_encoder.transform(test_data["label"])


In [14]:
print(train_data.isnull().sum())
print(test_data.isnull().sum())


sentence_id        0
sentence           0
label              0
solved_conflict    0
dtype: int64
sentence_id    0
sentence       0
label          0
dtype: int64


In [15]:
from torch.utils.data import Dataset

class SubjectivityDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.texts = data["sentence"].values
        self.labels = data["label"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


In [17]:
from torch.utils.data import DataLoader

# Create Dataset objects
train_dataset = SubjectivityDataset(train_data, tokenizer, max_len=128)
test_dataset = SubjectivityDataset(test_data, tokenizer, max_len=128)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [18]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)


In [24]:
from sklearn.metrics import accuracy_score

# Training Loop with Accuracy Evaluation
epochs = 5
for epoch in range(epochs):
    lstm_model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = lstm_model(input_ids)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Store predictions and labels for accuracy calculation
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    epoch_accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")


Epoch 1/5, Loss: 34.1264, Accuracy: 0.6410
Epoch 2/5, Loss: 34.0484, Accuracy: 0.6410
Epoch 3/5, Loss: 34.0725, Accuracy: 0.6410
Epoch 4/5, Loss: 34.0623, Accuracy: 0.6410
Epoch 5/5, Loss: 33.9617, Accuracy: 0.6410


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

lstm_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)
        outputs = lstm_model(input_ids)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Calculate Metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average="macro")
recall = recall_score(all_labels, all_preds, average="macro")
f1 = f1_score(all_labels, all_preds, average="macro")

# Print Metrics in a Clear Format
print("="*40)
print("LSTM Model Performance Metrics:")
print("="*40)
print(f"{'Metric':<15} {'Score':<10}")
print("-"*40)
print(f"{'Accuracy':<15} {accuracy:.4f}")
print(f"{'Precision':<15} {precision:.4f}")
print(f"{'Recall':<15} {recall:.4f}")
print(f"{'F1-Score':<15} {f1:.4f}")
print("="*40)


LSTM Model Performance Metrics:
Metric          Score     
----------------------------------------
Accuracy        0.4774
Precision       0.2387
Recall          0.5000
F1-Score        0.3231


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Transfer Learning with Pretrained Transformers**

In [26]:
from transformers import BertForSequenceClassification

bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import AdamW
from transformers import get_scheduler

# Define Optimizer
optimizer = AdamW(bert_model.parameters(), lr=2e-5)

# Define Learning Rate Scheduler
num_training_steps = len(train_loader) * epochs
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [34]:
from sklearn.metrics import accuracy_score

# Training loop
epochs = 5
bert_model.train()

for epoch in range(epochs):
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in train_loader:
        optimizer.zero_grad()

        # Move data to the GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Forward pass
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Evaluate accuracy on the test/validation set after the epoch
    bert_model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = bert_model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Switch back to training mode
    bert_model.train()


Epoch 1/5, Loss: 0.3809, Accuracy: 0.7325
Epoch 2/5, Loss: 0.4171, Accuracy: 0.7325
Epoch 3/5, Loss: 0.4158, Accuracy: 0.7325
Epoch 4/5, Loss: 0.3599, Accuracy: 0.7325
Epoch 5/5, Loss: 0.3990, Accuracy: 0.7325


In [33]:
bert_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Forward pass
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Calculate Metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average="macro")
recall = recall_score(all_labels, all_preds, average="macro")
f1 = f1_score(all_labels, all_preds, average="macro")

print(f"BERT Model Performance:")
print(f"Accuracy: {accuracy:.4f} \nPrecision: {precision:.4f} \nRecall: {recall:.4f} \nF1-Score: {f1:.4f}")


BERT Model Performance:
Accuracy: 0.7325 
Precision: 0.7584 
Recall: 0.7392 
F1-Score: 0.7289
