In [1]:
import pandas as pd
import os
import tqdm as tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Define environment-specific paths
WORKING_ENV = 'local'  # Change as needed

if WORKING_ENV == 'cluster':
    content_path = '/vol/bitbucket/pvr24/nlp_cw_et1224_pvr24/'
    data_path = f'{content_path}data'
    cache_dir = f'{content_path}huggingface_cache'  # Define a cache directory on Bitbucket

elif WORKING_ENV == 'local':
    content_path = './'
    data_path = './data/'
    cache_dir = './huggingface_cache'
    os.makedirs(data_path, exist_ok=True)

else:
    raise NotImplementedError()

# Ensure cache directory exists
os.makedirs(cache_dir, exist_ok=True)

# Set Hugging Face cache directory
os.environ["HF_HOME"] = cache_dir

In [6]:
from transformers import DistilBertTokenizer, DistilBertModel
# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", cache_dir=cache_dir)
model = DistilBertModel.from_pretrained("distilbert-base-uncased", cache_dir=cache_dir)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [3]:
def load_task1(tsv_path):
    """
    Load the full dataset from the TSV file, skipping the disclaimer header.
    Extracts 'par_id', 'text', and assigns binary labels:
    - Labels 0 or 1 -> Negative (0)
    - Labels 2, 3, or 4 -> Positive (1)
    """
    rows = []
    with open(tsv_path, encoding='utf-8') as f:
        lines = f.readlines()[7:]  # Skip disclaimer lines
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) < 5:
                continue  # Skip malformed lines
            par_id, text, label = int(parts[0]), parts[4], int(parts[-1])
            lbin = 0 if label in {0, 1} else 1
            rows.append({'par_id': par_id, 'text': text, 'label': lbin})
    return pd.DataFrame(rows)

def load_splits(train_csv, val_csv):
    train_df = pd.read_csv(train_csv)
    val_df = pd.read_csv(val_csv)
    return set(train_df['par_id']), set(val_df['par_id'])

def split_data(full_df, train_ids, val_ids):
    train_df = full_df[full_df['par_id'].isin(train_ids)][['text', 'label']].reset_index(drop=True)
    val_df = full_df[full_df['par_id'].isin(val_ids)][['text', 'label']].reset_index(drop=True)
    return train_df, val_df


In [4]:
tsv_path = data_path + '/dontpatronizeme_pcl.tsv'
train_csv = data_path + '/train_semeval_parids-labels.csv'
val_csv = data_path + '/dev_semeval_parids-labels.csv'

full_df = load_task1(tsv_path)

train_ids, val_ids = load_splits(train_csv, val_csv)

train_df, val_df = split_data(full_df, train_ids, val_ids)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

def train_naive_bayes(train_df, max_features=5000):
    """
    Train a Naïve Bayes classifier using TF-IDF features from the given DataFrame.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = vectorizer.fit_transform(train_df['text'])
    model = MultinomialNB()
    model.fit(X_train, train_df['label'])
    return model, vectorizer

def evaluate_naive_bayes(model, vectorizer, val_df):
    """
    Evaluate Naïve Bayes classifier on validation DataFrame.
    """
    X_val = vectorizer.transform(val_df['text'])
    preds = model.predict(X_val)
    accuracy = accuracy_score(val_df['label'], preds)
    f1 = f1_score(val_df['label'], preds, pos_label=1)
    return accuracy, f1

In [6]:
maxf1 = -1
best_max_features = 0
best_model = None
best_vectorizer = None
for max_features in range(1,1001, 25):
    model, vectorizer = train_naive_bayes(train_df, max_features=max_features)
    accuracy, f1 = evaluate_naive_bayes(model, vectorizer, val_df)
    if f1 > maxf1:
        best_model = model
        best_vectorizer = vectorizer
        maxf1 = f1
        best_max_features = max_features

for max_features in range(max(1, best_max_features-25), best_max_features+25):
    model, vectorizer = train_naive_bayes(train_df, max_features=max_features)
    accuracy, f1 = evaluate_naive_bayes(model, vectorizer, val_df)
    if f1 > maxf1:
        best_model = model
        best_vectorizer = vectorizer
        maxf1 = f1
        best_max_features = max_features
print(f'Best max_features: {best_max_features} with F1: {maxf1}')
accuracy, f1 = evaluate_naive_bayes(best_model, best_vectorizer, val_df)
print(f'Validation F1: {f1}')

Best max_features: 751 with F1: 0.01990049751243781
Validation F1: 0.01990049751243781


In [7]:
best_model, best_vectorizer = train_naive_bayes(full_df, max_features=751)
accuracy, f1 = evaluate_naive_bayes(best_model, best_vectorizer, val_df)
print(f'Validation F1: {f1} and accuracy {accuracy}')

Validation F1: 0.0297029702970297 and accuracy 0.9063992359121299


In [8]:
# Get the feature names
feature_names = best_vectorizer.get_feature_names_out()

# Get log probabilities for each class (rows are classes, columns are words)
log_probs = best_model.feature_log_prob_

# If it's a binary classifier (0/1 labels), inspect words associated with class 1
class_index = 1  # Adjust based on your class labels
word_importance = sorted(zip(feature_names, log_probs[class_index]), key=lambda x: x[1], reverse=True)

# Print the most important words for class 1
for word, importance in word_importance[:20]:  # Top 20 words
    print(f"{word}: {importance}")

need: -4.359718396284812
people: -4.426272282013459
poor: -4.426592691173605
homeless: -4.451958668639371
families: -4.523391483693496
children: -4.584229906393141
help: -4.639374283094673
said: -4.6909599177469765
women: -4.903129600119044
disabled: -5.036705507003807
life: -5.059289670084718
hope: -5.104903461030114
hopeless: -5.107040921054615
refugees: -5.117035405959012
vulnerable: -5.2100570441668115
lives: -5.253165138430214
food: -5.302379744122222
poverty: -5.365066695829686
country: -5.401952870518165
world: -5.424014584172876


In [None]:
#highlight wrongly classified examples
X_val = best_vectorizer.transform(val_df['text'])
preds = best_model.predict(X_val)
val_df['pred'] = preds
#print(val_df[val_df['label'] == 0 && val_df['pred'] == 1].head(20))        
#print(val_df[(val_df['label'] == 1) & (val_df['pred'] == 0)].head(20))
#print examples with highest probability of being positive
probs = best_model.predict_proba(X_val)
val_df['prob'] = probs[:,1]
print(val_df.sort_values(by='prob', ascending=False).head(20))

                                                 text  label  pred
0   "His present "" chambers "" may be quite humbl...      1     0
1   Krueger recently harnessed that creativity to ...      1     0
2   10:41am - Parents of children who died must ge...      1     0
3   When some people feel causing problem for some...      1     0
4   We are alarmed to learn of your recently circu...      1     0
5   """ We share a global responsibility to respon...      1     0
6   The former Chelsea star through his foundation...      1     0
7   It can not be right to allow homes to sit empt...      1     0
8   """ People do n't understand the hurt , people...      1     0
9   He depicts demonstrations by refugees at the b...      1     0
10  18 December should serve as a time when we loo...      1     0
11  Almost apocalyptic in its devastation , the wr...      1     0
12  It 's calculated that over 204,000 days of pur...      1     0
13  """ I and my daughter Monica are excited about...      1  

In [12]:
def get_embedding(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    
    embedding = outputs.last_hidden_state[:, 0, :]
    return embedding.squeeze(0).cpu()

In [14]:
# Directory for saving embeddings
embeddings_dir = 'embeddings'

# Ensure the embeddings directory exists
os.makedirs(embeddings_dir, exist_ok=True)

# Check if embeddings are already saved
embeddings_exist = False
try:
    # Try to load existing embeddings from the embeddings folder
    train_embeddings = torch.load(os.path.join(embeddings_dir, 'train_embeddings.pt'))
    val_embeddings = torch.load(os.path.join(embeddings_dir, 'val_embeddings.pt'))
    train_labels = torch.load(os.path.join(embeddings_dir, 'train_labels.pt'))
    val_labels = torch.load(os.path.join(embeddings_dir, 'val_labels.pt'))
    embeddings_exist = True
except FileNotFoundError:
    # Embeddings do not exist, calculate them
    train_embeddings = [get_embedding(text) for text in train_df["text"]]
    train_labels = torch.tensor(train_df["label"].tolist(), dtype=torch.long)

    val_embeddings = [get_embedding(text) for text in val_df["text"]]
    val_labels = torch.tensor(val_df["label"].tolist(), dtype=torch.long)

    # Save the embeddings for future use in the embeddings folder
    torch.save(train_embeddings, os.path.join(embeddings_dir, 'train_embeddings.pt'))
    torch.save(val_embeddings, os.path.join(embeddings_dir, 'val_embeddings.pt'))
    torch.save(train_labels, os.path.join(embeddings_dir, 'train_labels.pt'))
    torch.save(val_labels, os.path.join(embeddings_dir, 'val_labels.pt'))

# Create TensorDatasets
train_dataset = TensorDataset(torch.stack(train_embeddings), train_labels)
val_dataset = TensorDataset(torch.stack(val_embeddings), val_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [24]:
class FFN(nn.Module):
    def __init__(self, input_dim = 768, hidden_dim = 128):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.act = nn.LeakyReLU()
        self.drop = nn.Dropout(0.5)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        return x 

In [25]:
model = FFN(hidden_dim = 512).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
num_epochs = 100  # Set number of epochs
model.train()  # Set the model to training mode

# Training Loop with tqdm progress bar
for epoch in range(num_epochs):
    epoch_loss = 0
    # Wrap the train_loader with tqdm for a progress bar
    for embeddings, labels in tqdm.tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False):
        embeddings, labels = embeddings.to(device), labels.float().to(device)  # Move to GPU if available
        
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(embeddings)  # Forward pass
        loss = criterion(outputs.squeeze(), labels)  # Calculate loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
        epoch_loss += loss.item()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}')

                                                                                                                        

Epoch [1/100], Loss: 0.2768


                                                                                                                        

Epoch [2/100], Loss: 0.2465


                                                                                                                        

Epoch [3/100], Loss: 0.2423


                                                                                                                        

Epoch [4/100], Loss: 0.2388


                                                                                                                        

Epoch [5/100], Loss: 0.2334


                                                                                                                        

Epoch [6/100], Loss: 0.2330


                                                                                                                        

Epoch [7/100], Loss: 0.2290


                                                                                                                        

Epoch [8/100], Loss: 0.2255


                                                                                                                        

Epoch [9/100], Loss: 0.2247


                                                                                                                        

Epoch [10/100], Loss: 0.2238


                                                                                                                        

Epoch [11/100], Loss: 0.2182


                                                                                                                        

Epoch [12/100], Loss: 0.2159


                                                                                                                        

Epoch [13/100], Loss: 0.2151


                                                                                                                        

Epoch [14/100], Loss: 0.2121


                                                                                                                        

Epoch [15/100], Loss: 0.2123


                                                                                                                        

Epoch [16/100], Loss: 0.2066


                                                                                                                        

Epoch [17/100], Loss: 0.2074


                                                                                                                        

Epoch [18/100], Loss: 0.2037


                                                                                                                        

Epoch [19/100], Loss: 0.1997


                                                                                                                        

Epoch [20/100], Loss: 0.1976


                                                                                                                        

Epoch [21/100], Loss: 0.1966


                                                                                                                        

Epoch [22/100], Loss: 0.1942


                                                                                                                        

Epoch [23/100], Loss: 0.1884


                                                                                                                        

Epoch [24/100], Loss: 0.1857


                                                                                                                        

Epoch [25/100], Loss: 0.1872


                                                                                                                        

Epoch [26/100], Loss: 0.1815


                                                                                                                        

Epoch [27/100], Loss: 0.1807


                                                                                                                        

Epoch [28/100], Loss: 0.1816


                                                                                                                        

Epoch [29/100], Loss: 0.1762


                                                                                                                        

Epoch [30/100], Loss: 0.1716


                                                                                                                        

Epoch [31/100], Loss: 0.1676


                                                                                                                        

Epoch [32/100], Loss: 0.1667


                                                                                                                        

Epoch [33/100], Loss: 0.1618


                                                                                                                        

Epoch [34/100], Loss: 0.1615


                                                                                                                        

Epoch [35/100], Loss: 0.1530


                                                                                                                        

Epoch [36/100], Loss: 0.1543


                                                                                                                        

Epoch [37/100], Loss: 0.1514


                                                                                                                        

Epoch [38/100], Loss: 0.1436


                                                                                                                        

Epoch [39/100], Loss: 0.1439


                                                                                                                        

Epoch [40/100], Loss: 0.1430


                                                                                                                        

Epoch [41/100], Loss: 0.1352


                                                                                                                        

Epoch [42/100], Loss: 0.1358


                                                                                                                        

Epoch [43/100], Loss: 0.1294


                                                                                                                        

Epoch [44/100], Loss: 0.1285


                                                                                                                        

Epoch [45/100], Loss: 0.1255


                                                                                                                        

Epoch [46/100], Loss: 0.1259


                                                                                                                        

Epoch [47/100], Loss: 0.1241


                                                                                                                        

Epoch [48/100], Loss: 0.1172


                                                                                                                        

Epoch [49/100], Loss: 0.1133


                                                                                                                        

Epoch [50/100], Loss: 0.1118


                                                                                                                        

Epoch [51/100], Loss: 0.1084


                                                                                                                        

Epoch [52/100], Loss: 0.1065


                                                                                                                        

Epoch [53/100], Loss: 0.0975


                                                                                                                        

Epoch [54/100], Loss: 0.0934


                                                                                                                        

Epoch [55/100], Loss: 0.0961


                                                                                                                        

Epoch [56/100], Loss: 0.0942


                                                                                                                        

Epoch [57/100], Loss: 0.0930


                                                                                                                        

Epoch [58/100], Loss: 0.0961


                                                                                                                        

Epoch [59/100], Loss: 0.0913


                                                                                                                        

Epoch [60/100], Loss: 0.0858


                                                                                                                        

Epoch [61/100], Loss: 0.0851


                                                                                                                        

Epoch [62/100], Loss: 0.0855


                                                                                                                        

Epoch [63/100], Loss: 0.0817


                                                                                                                        

Epoch [64/100], Loss: 0.0811


                                                                                                                        

Epoch [65/100], Loss: 0.0749


                                                                                                                        

Epoch [66/100], Loss: 0.0760


                                                                                                                        

Epoch [67/100], Loss: 0.0716


                                                                                                                        

Epoch [68/100], Loss: 0.0688


                                                                                                                        

Epoch [69/100], Loss: 0.0693


                                                                                                                        

Epoch [70/100], Loss: 0.0624


                                                                                                                        

Epoch [71/100], Loss: 0.0674


                                                                                                                        

Epoch [72/100], Loss: 0.0634


                                                                                                                        

Epoch [73/100], Loss: 0.0679


                                                                                                                        

Epoch [74/100], Loss: 0.0648


                                                                                                                        

Epoch [75/100], Loss: 0.0633


                                                                                                                        

Epoch [76/100], Loss: 0.0605


                                                                                                                        

Epoch [77/100], Loss: 0.0611


                                                                                                                        

Epoch [78/100], Loss: 0.0560


                                                                                                                        

Epoch [79/100], Loss: 0.0586


                                                                                                                        

Epoch [80/100], Loss: 0.0556


                                                                                                                        

Epoch [81/100], Loss: 0.0530


                                                                                                                        

Epoch [82/100], Loss: 0.0539


                                                                                                                        

Epoch [83/100], Loss: 0.0537


                                                                                                                        

Epoch [84/100], Loss: 0.0521


                                                                                                                        

Epoch [85/100], Loss: 0.0525


                                                                                                                        

Epoch [86/100], Loss: 0.0521


                                                                                                                        

Epoch [87/100], Loss: 0.0531


                                                                                                                        

Epoch [88/100], Loss: 0.0404


                                                                                                                        

Epoch [89/100], Loss: 0.0479


                                                                                                                        

Epoch [90/100], Loss: 0.0508


                                                                                                                        

Epoch [91/100], Loss: 0.0451


                                                                                                                        

Epoch [92/100], Loss: 0.0440


                                                                                                                        

Epoch [93/100], Loss: 0.0451


                                                                                                                        

Epoch [94/100], Loss: 0.0453


                                                                                                                        

Epoch [95/100], Loss: 0.0489


                                                                                                                        

Epoch [96/100], Loss: 0.0477


                                                                                                                        

Epoch [97/100], Loss: 0.0441


                                                                                                                        

Epoch [98/100], Loss: 0.0375


                                                                                                                        

Epoch [99/100], Loss: 0.0421


                                                                                                                        

Epoch [100/100], Loss: 0.0388




In [27]:
model.eval()  # Set the model to evaluation mode
all_preds = []
all_labels = []

# Evaluation Loop with tqdm progress bar
with torch.no_grad():
    for embeddings, labels in tqdm.tqdm(val_loader, desc='Evaluating', leave=False):
        embeddings, labels = embeddings.to(device), labels.float().to(device)
        outputs = model(embeddings)
        predicted = (outputs.squeeze() > 0.5).float()  # Apply threshold for binary classification
        
        all_preds.extend(predicted.cpu().numpy())  # Store predictions
        all_labels.extend(labels.cpu().numpy())  # Store true labels

# Calculate accuracy
accuracy = (torch.tensor(all_preds) == torch.tensor(all_labels)).float().mean().item()
print(f'Validation Accuracy: {accuracy:.4f}')

# Calculate F1 score
f1 = f1_score(all_labels, all_preds)
print(f'Validation F1 Score: {f1:.4f}')

                                                                                                                        

Validation Accuracy: 0.9069
Validation F1 Score: 0.3564


