In [20]:
#!/usr/bin/env python
# coding: utf-8

"""
BoW + MLP Example with pos_weight Adjustment for Imbalanced Data
----------------------------------------------------------------
1) Reads the 'dontpatronizeme_pcl.tsv' dataset using DontPatronizeMe class.
2) Splits the data into train/validation/test sets (the dev set is treated as a test set).
3) Uses CountVectorizer to convert paragraphs into Bag-of-Words (BoW) feature vectors.
4) Defines a simple MLP (two-layer feed-forward network) in PyTorch.
5) Computes pos_weight for BCEWithLogitsLoss to account for class imbalance.
6) Trains the MLP, evaluates it on validation and finally on test (dev) set.

Please ensure you have installed:
- pandas, numpy, scikit-learn
- PyTorch
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class DontPatronizeMe:
    """
    A simple data-loading class that reads 'dontpatronizeme_pcl.tsv' and
    converts original labels (0,1,2,3,4) into binary (0/1).

    Attributes:
        train_path: Path of the TSV file parsed by load_task1().
        test_path: Path of the test TSV; stored but not read by this class.
        train_task1_df: DataFrame produced by load_task1(), or None before
            load_task1() has been called.
    """

    # Number of leading description lines in the TSV that carry no data.
    _HEADER_LINES = 4
    # Original labels mapped to the negative class; any other value maps to 1.
    _NEGATIVE_LABELS = ('0', '1')
    # Column order of the resulting DataFrame.
    _COLUMNS = ['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']

    def __init__(self, _train_path, _test_path):
        self.train_path = _train_path
        self.test_path = _test_path
        self.train_task1_df = None

    def load_task1(self):
        """
        Reads the TSV file starting from the 5th line,
        splits each line by tab, and assigns a binary label:
        - '0' or '1' => label=0
        - anything else (expected: '2', '3', or '4') => label=1
        Stores the result in self.train_task1_df.

        Blank lines (e.g. a trailing empty line at the end of the file) are
        skipped instead of failing on a missing column.

        Raises:
            OSError: if the file at self.train_path cannot be opened.
            IndexError: if a non-blank data line has fewer than 5 tab-separated fields.
        """
        rows = []
        # Explicit encoding so parsing does not depend on the platform's locale
        # default (assumes the dataset file is UTF-8 encoded).
        with open(self.train_path, encoding='utf-8') as f:
            # Stream the file line by line rather than loading it all at once.
            for line_no, line in enumerate(f):
                # Skip the first 4 lines, which are just descriptions
                if line_no < self._HEADER_LINES:
                    continue
                stripped = line.strip()
                if not stripped:
                    continue
                fields = stripped.split('\t')
                par_id = fields[0]
                art_id = fields[1]
                keyword = fields[2]
                country = fields[3]
                text_ = fields[4]
                # The label is always the last column of the row.
                orig_label = fields[-1]

                # Convert original label to binary
                lbin = 0 if orig_label in self._NEGATIVE_LABELS else 1
                rows.append({
                    'par_id': par_id,
                    'art_id': art_id,
                    'keyword': keyword,
                    'country': country,
                    'text': text_,
                    'label': lbin,
                    'orig_label': orig_label
                })
        self.train_task1_df = pd.DataFrame(rows, columns=self._COLUMNS)

def get_test(user):
    """
    Return the dev split of the dataset, which this script uses as its test set.

    The full 'dontpatronizeme_pcl.tsv' file under `{user}/cw/` is parsed with
    DontPatronizeMe, and only the rows whose 'par_id' is listed in
    'dev_semeval_parids-labels.csv' (read relative to the current working
    directory) are kept. Both id columns are compared as strings.
    """
    loader = DontPatronizeMe(
        f'{user}/cw/dontpatronizeme_pcl.tsv',
        f'{user}/cw/task4_test.tsv',
    )
    loader.load_task1()

    full_df = loader.train_task1_df
    full_df["par_id"] = full_df["par_id"].astype(str)

    # Paragraph ids that define the dev split, normalised to strings.
    split_ids = (
        pd.read_csv("dev_semeval_parids-labels.csv")["par_id"]
        .astype(str)
        .unique()
    )

    in_dev_split = full_df["par_id"].isin(split_ids)
    return full_df[in_dev_split]

def get_train(user):
    """
    Return the train split of the dataset.

    Parses 'dontpatronizeme_pcl.tsv' under `{user}/cw/` via DontPatronizeMe and
    keeps only the rows whose 'par_id' appears in
    'train_semeval_parids-labels.csv' (read relative to the current working
    directory). Ids on both sides are cast to strings before matching.
    """
    data_file = f'{user}/cw/dontpatronizeme_pcl.tsv'
    test_file = f'{user}/cw/task4_test.tsv'

    reader = DontPatronizeMe(data_file, test_file)
    reader.load_task1()
    all_rows = reader.train_task1_df
    all_rows["par_id"] = all_rows["par_id"].astype(str)

    # Ids belonging to the official train split.
    id_table = pd.read_csv("train_semeval_parids-labels.csv")
    wanted_ids = id_table["par_id"].astype(str).unique()

    keep_mask = all_rows["par_id"].isin(wanted_ids)
    return all_rows[keep_mask]

class MLP(nn.Module):
    """
    Feed-forward binary classifier with one hidden layer:
    [Linear -> ReLU -> Linear], producing a single raw logit per input row.

    No sigmoid is applied here; the caller is expected to apply it (or use a
    logit-based loss).
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Hidden projection, then a one-unit output head.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

def main(user="/vol/bitbucket/cx720/cw/nlp/70016-Natural-Language-Processing/",
         epochs=25, seed=42):
    """
    Main workflow:
    1) Load the train and dev splits via get_train() / get_test(); the dev
       split is used as the final test set.
    2) Perform a stratified 80-20 train-validation split on the train set.
    3) Transform text to BoW features (CountVectorizer fitted on the training
       portion only).
    4) Define the MLP model in PyTorch and compute pos_weight for
       BCEWithLogitsLoss to handle class imbalance.
    5) Train full-batch for a fixed number of epochs (25 by default).
    6) Report loss/accuracy on train and validation every epoch, then
       Accuracy and F1 on the test (dev) set.

    Args:
        user: Root directory passed to get_train() / get_test() to locate the
            dataset files.
        epochs: Number of full-batch optimisation steps.
        seed: Seed used for both the train/validation split and the PyTorch
            RNG, so that weight initialisation and the printed metrics are
            repeatable from run to run.
    """
    # Without this, nn.Linear initialisation differs on every run even though
    # the data split itself is fixed.
    torch.manual_seed(seed)

    # 1) Load train and test data
    train_data = get_train(user)
    test_data = get_test(user)

    # 2) Split train_data into train and validation sets
    train_train_data, train_val_data = train_test_split(
        train_data,
        test_size=0.2,
        random_state=seed,
        stratify=train_data['label']
    )

    # 3) Create BoW features with CountVectorizer
    vectorizer = CountVectorizer()
    vectorizer.fit(train_train_data['text'])

    def bow_transform(df):
        # Cast while the matrix is still sparse so the dense array is allocated
        # directly as float32, instead of a dense int64 array that is then
        # copied to float32.
        return vectorizer.transform(df['text']).astype(np.float32).toarray()

    def to_tensors(df):
        # from_numpy shares memory with the float32 array (no extra copy).
        features = torch.from_numpy(bow_transform(df))
        labels = df['label'].values
        targets = torch.tensor(labels, dtype=torch.float32).view(-1, 1)
        return features, labels, targets

    X_train_t, y_train, y_train_t = to_tensors(train_train_data)
    X_val_t, _, y_val_t = to_tensors(train_val_data)
    X_test_t, y_test, _ = to_tensors(test_data)

    # 4) Define MLP model
    input_dim = X_train_t.shape[1]
    model = MLP(input_dim=input_dim, hidden_dim=64)

    # Calculate pos_weight to address class imbalance
    pos_count = (y_train == 1).sum()
    neg_count = (y_train == 0).sum()
    if pos_count == 0:
        # If no positive samples at all, fallback
        pos_weight_value = 1.0
    else:
        pos_weight_value = neg_count / pos_count
    print(f"Training samples: {len(y_train)} (pos={pos_count}, neg={neg_count}), "
          f"pos_weight={pos_weight_value:.2f}")

    # Use BCEWithLogitsLoss with pos_weight
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([pos_weight_value], dtype=torch.float32)
    )
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # 5) Training loop (one full-batch update per epoch)
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()

        # Forward pass
        logits = model(X_train_t)
        loss = criterion(logits, y_train_t)

        # Backprop
        loss.backward()
        optimizer.step()

        # Training accuracy uses the logits computed before this epoch's update
        with torch.no_grad():
            train_preds = (torch.sigmoid(logits) >= 0.5).float()
            train_acc = (train_preds == y_train_t).float().mean().item()

        # Evaluate on validation set
        model.eval()
        with torch.no_grad():
            val_logits = model(X_val_t)
            val_loss = criterion(val_logits, y_val_t)
            val_preds = (torch.sigmoid(val_logits) >= 0.5).float()
            val_acc = (val_preds == y_val_t).float().mean().item()

        print(f"Epoch {epoch}/{epochs} | "
              f"Train Loss: {loss.item():.4f}, Train Acc: {train_acc*100:.2f}% | "
              f"Val Loss: {val_loss.item():.4f}, Val Acc: {val_acc*100:.2f}%")

    # 6) Final evaluation on the test (dev) set
    model.eval()
    with torch.no_grad():
        test_logits = model(X_test_t)
        test_probs = torch.sigmoid(test_logits).view(-1).cpu().numpy()
        test_preds = (test_probs >= 0.5).astype(int)

    test_acc = accuracy_score(y_test, test_preds)
    test_f1 = f1_score(y_test, test_preds)
    print("\n=== Test Set Evaluation ===")
    print(f"Test Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Training samples: 6700 (pos=635, neg=6065), pos_weight=9.55
Epoch 1/25 | Train Loss: 1.2553, Train Acc: 90.48% | Val Loss: 1.2485, Val Acc: 81.37%
Epoch 2/25 | Train Loss: 1.2372, Train Acc: 89.16% | Val Loss: 1.2385, Val Acc: 77.25%
Epoch 3/25 | Train Loss: 1.2140, Train Acc: 86.13% | Val Loss: 1.2274, Val Acc: 81.25%
Epoch 4/25 | Train Loss: 1.1856, Train Acc: 87.96% | Val Loss: 1.2166, Val Acc: 82.81%
Epoch 5/25 | Train Loss: 1.1558, Train Acc: 89.54% | Val Loss: 1.2058, Val Acc: 82.99%
Epoch 6/25 | Train Loss: 1.1252, Train Acc: 89.85% | Val Loss: 1.1943, Val Acc: 82.03%
Epoch 7/25 | Train Loss: 1.0936, Train Acc: 89.22% | Val Loss: 1.1827, Val Acc: 80.96%
Epoch 8/25 | Train Loss: 1.0615, Train Acc: 88.64% | Val Loss: 1.1713, Val Acc: 80.42%
Epoch 9/25 | Train Loss: 1.0293, Train Acc: 88.45% | Val Loss: 1.1607, Val Acc: 80.54%
Epoch 10/25 | Train Loss: 0.9973, Train Acc: 88.45% | Val Loss: 1.1511, Val Acc: 80.90%
Epoch 11/25 | Train Loss: 0.9656, Train Acc: 88.99% | Val Loss: 1.142