In [28]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [2]:
# Load data to panda data frame
rows=[]
with open(os.path.join('dontpatronizeme_pcl.tsv')) as f:
    for line in f.readlines()[4:]:
        par_id=line.strip().split('\t')[0]
        art_id = line.strip().split('\t')[1]
        keyword=line.strip().split('\t')[2]
        country=line.strip().split('\t')[3]
        t=line.strip().split('\t')[4]#.lower()
        l=line.strip().split('\t')[-1]
        if l=='0' or l=='1':
            lbin=0
        else:
            lbin=1
        rows.append(
            {'par_id':par_id,
            'art_id':art_id,
            'keyword':keyword,
            'country':country,
            'text':t, 
            'label':lbin, 
            'orig_label':l
            }
            )
df=pd.DataFrame(rows, columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']) 
print(df.shape[0])
print(df[df['label'] == 0].shape[0])
print(df[df['label'] == 1].shape[0])


10469
9476
993


In [3]:

# Filter rows with a specific label (e.g., 'negative')
noPCL = df[df['label'] == 0]
PCL = df[df['label'] == 1]
average = df

# Get sentence length (in words)
noPCL['text_Length'] = noPCL['text'].apply(lambda x: len(x.split()))
PCL['text_Length'] = PCL['text'].apply(lambda x: len(x.split()))
average['text_length']  = df['text'].apply(lambda x: len(x.split()))

# Calculate average length
average_length_noPCL = noPCL['text_Length'].mean()
average_length_PCL = PCL['text_Length'].mean()
average_l = average['text_length'].mean()

print(average_length_noPCL)
print(average_length_PCL)
print(average_l)


47.87779653862389
53.620342396777446
48.42248543318369


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noPCL['text_Length'] = noPCL['text'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PCL['text_Length'] = PCL['text'].apply(lambda x: len(x.split()))


In [4]:
# Get unique values from a column
total = df[df['keyword']=='vulnerable'].shape[0]
unique_keywords_noPCL = noPCL[noPCL['keyword']=='vulnerable'].shape[0]
unique_keywords_PCL = PCL[PCL['keyword']=='vulnerable'].shape[0]

print((unique_keywords_noPCL/total)*(993))
print((unique_keywords_PCL/total)*(9476))


919.4444444444445
701.9259259259259


In [5]:
# Split in to dev set and test set
train_ids = pd.read_csv('train_semeval_parids-labels.csv')
dev_ids = pd.read_csv('dev_semeval_parids-labels.csv')
train_ids = train_ids.iloc[:, 0].astype(str).tolist()
dev_ids = dev_ids.iloc[:, 0].astype(str).tolist()

train_df = df[df['par_id'].isin(train_ids)]
dev_df = df[df['par_id'].isin(dev_ids)]
print(len(train_df)+len(dev_df))


10469


In [None]:
class PCLDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        # Get tokenized text
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Get label
        item['labels'] = torch.tensor(self.labels[idx])
        
        # Suppose extra_features is a precomputed list of feature vectors:
        extra_feature = torch.tensor(self.extra_features[idx])
        item['extra_features'] = extra_feature
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
# Load pretrained tokenizer and model
model_name = "roberta-base"  # You can switch to "roberta-large" if desired
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#train_texts = train_df["text"].tolist()
#train_labels = train_df["label"].tolist()

# Split train set into train and valiadation (90/10)
#train_texts, val_texts, train_labels, val_labels = train_test_split(
#    train_texts, train_labels, test_size=0.1, random_state=42
#)

# Create dataset objects for training and validation
#train_dataset = PCLDataset(train_texts, train_labels, tokenizer, max_length=128)
#val_dataset = PCLDataset(val_texts, val_labels, tokenizer, max_length=128)

In [17]:
import nlpaug.augmenter.word as naw
from sklearn.model_selection import train_test_split

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()

# Separate majority and minority classes
minority_texts = [train_texts[i] for i in range(len(train_labels)) if train_labels[i] == 1]
minority_labels = [1] * len(minority_texts)

# Data augmentation with Contextual BERT substitution
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",  # You can also use "insert"
    device='cuda'  # or 'cpu'
)

augmented_texts = []
for text in minority_texts:
    augmented_texts.extend([aug.augment(text) for _ in range(8)])  # 8 augmentations per sample

#augmented_texts = [aug.augment(text) for text in minority_texts]
augmented_labels = [1] * len(augmented_texts)

# Combine augmented minority samples with the original data
train_texts_balanced = train_texts + augmented_texts
train_labels_balanced = train_labels + augmented_labels

# Ensure train_texts_balanced is a flat list, not nested
train_texts_balanced = [text if isinstance(text, str) else text[0] for text in train_texts_balanced]


# Split into training and validation sets (90/10)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts_balanced, train_labels_balanced, test_size=0.1, random_state=42
)



# Create dataset objects for training and validation
train_dataset = PCLDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = PCLDataset(val_texts, val_labels, tokenizer, max_length=128)

print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

Train samples: 13254, Validation samples: 1473


In [18]:
print("Class distribution after augmentation:")
print(f"Class 0: {train_labels_balanced.count(0)} instances")
print(f"Class 1: {train_labels_balanced.count(1)} instances")

Class distribution after augmentation:
Class 0: 7581 instances
Class 1: 7146 instances


In [19]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',            # Output directory for model checkpoints
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=8,     # Batch size per device during training
    per_device_eval_batch_size=16,     # Batch size for evaluation
    warmup_steps=500,                  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # Strength of weight decay
    logging_dir='./logs',              # Directory for storing logs
    logging_steps=10,                  # Log every 10 steps
    evaluation_strategy="steps",       # Evaluate every 'eval_steps'
    eval_steps=100,                    # Evaluate every 100 steps
    save_steps=100,                    # Save checkpoint every 100 steps
    load_best_model_at_end=True,       # Load the best model at the end
    metric_for_best_model="eval_loss",
    disable_tqdm=False,
    report_to=[]
)



In [20]:
# Initialize and start the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
100,0.3574,0.225592
200,0.1312,0.333111
300,0.2868,0.190876
400,0.343,0.203376
500,0.3538,0.213682
600,0.0995,0.241924
700,0.1522,0.245704
800,0.2583,0.189348
900,0.321,0.277011
1000,0.1419,0.219028


TrainOutput(global_step=4971, training_loss=0.21338168493530305, metrics={'train_runtime': 2650.9833, 'train_samples_per_second': 14.999, 'train_steps_per_second': 1.875, 'total_flos': 2615455445806080.0, 'train_loss': 0.21338168493530305, 'epoch': 3.0})

In [22]:
# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 0.18934808671474457, 'eval_runtime': 15.1699, 'eval_samples_per_second': 97.1, 'eval_steps_per_second': 6.131, 'epoch': 3.0}


In [23]:
# Save the fine-tuned model and tokenizer
save_directory = "./fine_tuned_pcl_model"
trainer.save_model(save_directory)  # This saves the model to the specified directory
tokenizer.save_pretrained(save_directory)
print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./fine_tuned_pcl_model


In [24]:
# Load the saved model and tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [25]:
# Load the final test set
rows=[]
with open(os.path.join('task4_test.tsv')) as f:
    for line in f.readlines():
        par_id=line.strip().split('\t')[0]
        art_id = line.strip().split('\t')[1]
        keyword=line.strip().split('\t')[2]
        country=line.strip().split('\t')[3]
        rows.append(
            {'par_id':par_id,
            'art_id':art_id,
            'keyword':keyword,
            'country':country,
            'text':t, 
            }
            )
test_df=pd.DataFrame(rows, columns=['par_id', 'art_id', 'keyword', 'country', 'text']) 
print(test_df.shape[0])

3832


In [26]:
test_texts = test_df["text"].tolist()
#test_dataset = PCLDataset(test_texts, tokenizer, max_length=128)

dev_texts = dev_df["text"].tolist()
dev_labels = dev_df["label"].tolist()

dev_dataset = PCLDataset(dev_texts, dev_labels, tokenizer, max_length=128)

In [27]:
# Use the loaded model to make predictions
from sklearn.metrics import f1_score

loaded_model.eval()
correct = 0
predictions = []
for i in range(len(dev_texts)):
    text = dev_texts[i]
    encodings = loaded_tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")
    loaded_model.eval()
    with torch.no_grad():
        outputs = loaded_model(**encodings)
        pred_label = torch.argmax(outputs.logits, dim=1)
        if pred_label.tolist()[0] == dev_labels[i]:
            correct += 1
        #print("Predictions:", predictions.tolist()[0], "Actual label:", dev_labels[i])
        predictions.append(pred_label.tolist()[0])
print(correct)
accuracy = correct/len(dev_texts)
f1 = f1_score(dev_labels, predictions, average='binary')
print("Accuracy:", accuracy, "F1:", f1)

1897
Accuracy: 0.9059216809933143 F1: 0.029556650246305417
