In [None]:
import torch 
# Environment check: CUDA availability on the first line, installed torch
# version on the second.
print(torch.cuda.is_available(), torch.__version__, sep="\n")

In [None]:
# !git clone https://github.com/BernardMoy/NLP-PCL-Classification.git

In [None]:
# %cd NLP-PCL-Classification/

In [None]:
# Shell escape: run the NVIDIA command-line tool to report the GPU(s) visible
# to this runtime.
!nvidia-smi

# Load train and validation data set

In [None]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
from collections import Counter

# Read the tab-separated corpus, discard every row that has a missing value in
# any column, and derive the binary target: a paragraph counts as PCL when its
# label is greater than 1.
df = (
    pd.read_csv('data/dontpatronizeme_pcl.tsv', sep='\t')
    .dropna()
    .assign(bool_labels=lambda frame: frame["label"] > 1)
)

# The split files list the paragraph ids that belong to each partition.
train_labels = pd.read_csv('data/train_semeval_parids-labels.csv')["par_id"]
val_labels = pd.read_csv('data/dev_semeval_parids-labels.csv')["par_id"]

# Keep only the corpus rows whose par_id appears in the respective split.
df_train = df.loc[df["par_id"].isin(train_labels)]
df_val = df.loc[df["par_id"].isin(val_labels)]


# Perform coreference resolution

In [None]:
from fastcoref import FCoref 

# Instantiate the coreference model once, on device 'cuda:0', so the calls
# below can share it.
# NOTE(review): the name `model` is rebound to the RoBERTa classifier in the
# training cell further down; re-run this cell before calling
# coreference_resolution(model, ...) again after that point.
model = FCoref(device='cuda:0') 

def coreference_resolution(model, text): 
    """Rewrite `text` so later mentions of an entity repeat its first mention.

    The text is passed to ``model.predict`` and the character-offset clusters
    are read with ``get_clusters(as_strings=False)``. Within every cluster the
    first span is treated as the entity, and each remaining span is replaced
    by the entity's wording as it appears in the *original* text.

    Args:
        model: Object exposing ``predict(texts=[...])`` whose first result
            provides ``get_clusters(as_strings=False)`` returning a list of
            clusters, each a sequence of ``(start, end)`` character spans.
        text: The string to resolve.

    Returns:
        The text with co-referring mentions substituted. The input is returned
        unchanged when no cluster contains at least two mentions.
    """
    preds = model.predict(texts=[text])
    clusters = preds[0].get_clusters(as_strings=False)

    # Collect one (start, end, replacement) entry per non-first mention.
    # A cluster can hold any number of mentions, so never unpack it as a pair.
    replacements = []
    for cluster in clusters:
        if len(cluster) < 2:
            continue
        entity_start, entity_end = cluster[0]
        entity_text = text[entity_start:entity_end]
        for start, end in cluster[1:]:
            replacements.append((start, end, entity_text))

    # Splice from right to left, ordered by the span being REPLACED (not by the
    # entity position), so every offset still refers to untouched text.
    replacements.sort(key=lambda item: (item[0], item[1]), reverse=True)

    boundary = len(text)  # start of the left-most span rewritten so far
    for start, end, entity_text in replacements:
        if end > boundary:
            # Overlaps a span that was already rewritten further right;
            # splicing it as well would corrupt the text, so leave it alone.
            continue
        text = text[:start] + entity_text + text[end:]
        boundary = start

    return text

        
    

# Smoke-test the resolver on a short example and on a longer quoted passage.
for demo_text in (
    'We are so happy to see you using our coref package. This package is very fast!',
    "Dr. Lester Keith , doctor and professor of business administration , and others are checking with local transportation groups to see if they can bring those in need of a meal to the college for the 4 p.m. dinner . We will also be contacting local soup kitchens as a pickup location and will work with them to transport any leftovers to them so there is no wasted food , Dr. Keith said .",
):
    print(coreference_resolution(model, demo_text))


# Tokenization

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, Trainer, TrainingArguments

# Load the pretrained "roberta-base" tokenizer once; tokenize() below reads
# this module-level object.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 

def tokenize(df, max_length=256): 
    """Tokenize the ``text`` column of `df` with the module-level ``tokenizer``.

    Only the paragraph text is encoded. Prefixing it with the ``keyword`` and
    ``country_code`` columns (joined by "</s><s>") is currently disabled.

    Args:
        df: DataFrame with a ``text`` column (presumably all strings; rows
            with missing values are dropped when the data is loaded).
        max_length: Every sequence is padded or truncated to this many
            tokens. Defaults to 256, the value that used to be hard-coded.

    Returns:
        The tokenizer's encoding for all rows, which includes ``input_ids``
        and ``attention_mask``.
    """
    text_with_context = df["text"]

    return tokenizer(
        text_with_context.tolist(),
        padding="max_length",    # pad shorter sentences up to max_length
        max_length=max_length,
        truncation=True,         # cut longer sentences down to max_length
        return_attention_mask=True,
    )

# Convert to PyTorch dataset

In [None]:
import torch 
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset

def to_dataset(df): 
    """Build a Hugging Face ``Dataset`` of model inputs and labels from `df`.

    Args:
        df: DataFrame with the ``text`` column consumed by ``tokenize`` and a
            boolean ``bool_labels`` column (True means PCL).

    Returns:
        A ``Dataset`` with the columns ``input_ids``, ``attention_mask`` and
        ``label``, where ``label`` is 0 for False and 1 for True.
    """
    # Obtain tokens (input_ids, attention_mask) for every row.
    encoding = tokenize(df)

    # Sequence-classification heads take integer class ids as labels, so store
    # 0/1 explicitly instead of leaving the bool -> int conversion to whatever
    # code later collates the batches.
    labels = df["bool_labels"].astype("int64").values

    return Dataset.from_dict({
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "label": labels,
    })

In [None]:
# Tokenize each split, then make both datasets return torch tensors for the
# three model columns.
train_dataset, val_dataset = to_dataset(df_train), to_dataset(df_val)

for split_dataset in (train_dataset, val_dataset):
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training 

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 from evaluation output.

    Args:
        pred: A ``(logits, labels)`` pair. The predicted class of each example
            is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 keeps the score at 0 when a denominator is empty (for
    # example when no positive is predicted) without emitting an
    # UndefinedMetricWarning on every evaluation.
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, zero_division=0),
        "recall": recall_score(labels, predictions, zero_division=0),
        "f1": f1_score(labels, predictions, zero_division=0),
    }


In [None]:
# Binary classifier: pretrained roberta-base with a 2-label classification head.
# Note that this rebinds `model`, which previously held the FCoref instance.
config = AutoConfig.from_pretrained("roberta-base", num_labels=2)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config)

BATCH_SIZE = 32

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output location
    output_dir="./predictions",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,
    # Batch sizes (same for training and evaluation)
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    logging_steps=50,
)

# Wire the model, data and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the settings held in training_args.
trainer.train() 

In [None]:
from fastcoref import FCoref 

# Inspect the raw coreference prediction for a single sentence.
# NOTE(review): this rebinds `model` from the RoBERTa classifier to a fresh
# FCoref instance; `trainer` keeps its own reference to the classifier.
sent = "Poor children might find more obstacles in their race to a worthy future ."

model = FCoref(device='cuda:0')
preds = model.predict(texts=[sent])

# Last expression of the cell: display the prediction objects.
preds


In [None]:
# Evaluate on the trainer's eval_dataset (the validation split); as the last
# expression of the cell, the returned metrics are shown as its output.
trainer.evaluate()