# In [3]:  (Jupyter notebook cell marker)
#!/usr/bin/env python
# coding: utf-8

"""
Logistic Regression + Bag-of-Words (BoW) Example
-----------------------------------------------
This script demonstrates:
1) How to load the 'dontpatronizeme_pcl.tsv' data using the same methods (get_train, get_test)
   as in the DeBERTa code.
2) How to use scikit-learn's CountVectorizer to convert text into BoW features.
3) How to train and evaluate a Logistic Regression classifier on the task of classifying paragraphs
   into two classes: label=0 (non-patronizing) vs. label=1 (patronizing).
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import torch

class DontPatronizeMe:
    """
    Reader for the 'dontpatronizeme_pcl.tsv' file.

    Extracts paragraphs and converts the original labels {0,1,2,3,4}
    into a binary label (0 = non-patronizing, 1 = patronizing).

    Attributes set by the constructor:
        train_path: path of the TSV file parsed by ``load_task1``.
        test_path: stored as given; not read by any method of this class.
        train_task1_df: ``None`` until ``load_task1`` has been called, then
            a DataFrame with the columns listed in ``_COLUMNS``.
    """

    # Number of leading description lines in the TSV that carry no data.
    _HEADER_LINES = 4

    # Column order of the DataFrame built by ``load_task1``.
    _COLUMNS = ['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']

    def __init__(self, _train_path, _test_path):
        self.train_path = _train_path
        self.test_path = _test_path
        self.train_task1_df = None

    def load_task1(self):
        """
        Parse ``self.train_path`` and store the result in ``self.train_task1_df``.

        The first four lines of the file (description only) are skipped, as
        are blank lines. Every remaining line is split on tabs: fields 0-4
        are par_id, art_id, keyword, country and text; the last field is the
        original label. If orig_label is '0' or '1' the binary label is 0;
        any other value (2, 3, 4) gives label 1.

        All values except ``label`` are kept as strings.

        Raises:
            IndexError: if a non-blank data line has fewer than five
                tab-separated fields.
        """
        rows = []
        # Explicit UTF-8 so parsing does not depend on the platform's default
        # locale encoding.
        with open(self.train_path, encoding='utf-8') as f:
            # Iterate the file lazily instead of materialising every line.
            for line_no, line in enumerate(f):
                if line_no < self._HEADER_LINES:
                    continue

                stripped = line.strip()
                if not stripped:
                    # Blank line (e.g. a trailing newline at EOF): nothing to parse.
                    continue

                fields = stripped.split('\t')
                orig_label = fields[-1]

                rows.append({
                    'par_id': fields[0],
                    'art_id': fields[1],
                    'keyword': fields[2],
                    'country': fields[3],
                    'text': fields[4],
                    # Convert original label to binary.
                    'label': 0 if orig_label in ('0', '1') else 1,
                    'orig_label': orig_label,
                })

        # Passing columns explicitly keeps the schema stable even when no
        # data rows were read.
        self.train_task1_df = pd.DataFrame(rows, columns=self._COLUMNS)

def get_test(user):
    """
    Build the dev (evaluation) split.

    Parses '{user}/cw/dontpatronizeme_pcl.tsv' with DontPatronizeMe and
    returns only the rows whose par_id is listed in
    'dev_semeval_parids-labels.csv' (read relative to the current working
    directory). par_ids are compared as strings on both sides.
    """
    loader = DontPatronizeMe(
        f'{user}/cw/dontpatronizeme_pcl.tsv',
        f'{user}/cw/task4_test.tsv',
    )
    loader.load_task1()

    all_paragraphs = loader.train_task1_df
    all_paragraphs["par_id"] = all_paragraphs["par_id"].astype(str)

    # IDs that define the dev split, normalised to str before matching.
    split_ids = (
        pd.read_csv("dev_semeval_parids-labels.csv")["par_id"]
        .astype(str)
        .unique()
    )

    in_dev_split = all_paragraphs["par_id"].isin(split_ids)
    return all_paragraphs[in_dev_split]

def get_train(user):
    """
    Build the training split.

    Parses '{user}/cw/dontpatronizeme_pcl.tsv' with DontPatronizeMe and
    returns only the rows whose par_id is listed in
    'train_semeval_parids-labels.csv' (read relative to the current working
    directory). par_ids are compared as strings on both sides.
    """
    loader = DontPatronizeMe(
        f'{user}/cw/dontpatronizeme_pcl.tsv',
        f'{user}/cw/task4_test.tsv',
    )
    loader.load_task1()

    all_paragraphs = loader.train_task1_df
    all_paragraphs["par_id"] = all_paragraphs["par_id"].astype(str)

    # IDs that define the train split, normalised to str before matching.
    split_ids = (
        pd.read_csv("train_semeval_parids-labels.csv")["par_id"]
        .astype(str)
        .unique()
    )

    in_train_split = all_paragraphs["par_id"].isin(split_ids)
    return all_paragraphs[in_train_split]

def main(user="/vol/bitbucket/cx720/cw/nlp/70016-Natural-Language-Processing/"):
    """
    Train and evaluate a bag-of-words Logistic Regression baseline.

    Workflow:
    1) Load the train and dev data via get_train / get_test.
    2) Split the train data 80/20 (stratified on 'label', random_state=42)
       into a fitting part and a validation part.
    3) Fit a CountVectorizer on the fitting part only and turn every split
       into bag-of-words (BoW) features.
    4) Train a Logistic Regression classifier on the fitting part.
    5) Print accuracy and F1 for the validation set and the dev (test) set.

    Args:
        user: Base directory handed to get_train / get_test. The default is
            the path originally hard-coded here; adjust it for your
            environment.
    """
    # 1) Load train and test (dev) data
    train_data = get_train(user)   # entire train set
    test_data = get_test(user)     # dev set

    # 2) Split train_data into train/val
    train_train_data, train_val_data = train_test_split(
        train_data,
        test_size=0.2,
        random_state=42,
        stratify=train_data['label']
    )

    # 3) Fit CountVectorizer on the training part only, so the vocabulary
    #    is not influenced by validation or dev text.
    vectorizer = CountVectorizer()
    vectorizer.fit(train_train_data["text"])

    def transform_text(df):
        # Keep the sparse matrix returned by transform(): LogisticRegression
        # accepts sparse input, and a dense (rows x vocabulary) array would
        # cost far more memory for no benefit.
        return vectorizer.transform(df['text'])

    def report(title, df):
        # Predict on one split and print its accuracy / F1 under `title`.
        y_true = df["label"].values
        preds = clf.predict(transform_text(df))
        acc = accuracy_score(y_true, preds)
        f1 = f1_score(y_true, preds)
        print(f"=== {title} ===")
        print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")

    X_train = transform_text(train_train_data)
    y_train = train_train_data["label"].values

    # 4) Train a Logistic Regression model on the training set
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    # Evaluate on the validation set
    report("Validation Set", train_val_data)

    # 5) Evaluate on the test (dev) set
    report("Test Set", test_data)

# Run the full train/evaluate workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output captured from a previous run of this script:
# === Validation Set ===
# Accuracy: 0.9110, F1: 0.3134
# === Test Set ===
# Accuracy: 0.8968, F1: 0.2286
