In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%cd ./gdrive/MyDrive/DOD\ Dataset

In [None]:
!pip install transformers
!pip install sentence-transformers
!pip install datasets
!pip install evaluate

In [None]:
from torch.utils.data import Dataset, random_split
from pathlib import Path
from pandas import read_csv, concat
import numpy as np



class ScopeDataset(Dataset):
    """Map-style dataset over the rows of ``entailment_scope_negative_sampling.csv``.

    The CSV is read once at construction time from ``data_dir``. Each item is a
    dict holding the ``p1``, ``p2`` and ``label`` columns of one row.
    """

    def __init__(self, data_dir: str):
        """Read the CSV located inside ``data_dir`` into memory."""
        csv_path = Path(data_dir) / "entailment_scope_negative_sampling.csv"

        self.examples = read_csv(csv_path)
        # Row count captured at load time; __len__ reads the frame itself.
        self.len = len(self.examples)

    def __getitem__(self, i):
        """Return row ``i`` (positional) as a ``{"p1", "p2", "label"}`` dict."""
        row = self.examples.iloc[i]
        return {"p1": row.p1, "p2": row.p2, "label": row.label}

    def __len__(self):
        """Number of rows currently held in ``self.examples``."""
        return self.examples.shape[0]

In [None]:
dataset = ScopeDataset('./')

In [None]:
positives = [x for x in dataset if x['label'] == 1]
negatives = [x for x in dataset if x['label'] == 0]

In [None]:
len(dataset), len(positives), len(negatives)

In [None]:
import random
def getNegativeExamples(data, rng=None, max_retries=10000):
    """Build one synthetic negative (label 0) pair per item of ``data``.

    A candidate combines the ``p1`` of one randomly drawn item with the ``p2``
    of a second randomly drawn item. It is rejected when the two items share
    the same ``p1``, when ``p1`` equals the borrowed ``p2``, or when the
    resulting ``(p1, p2)`` pair already occurs as an item of ``data``
    (labelling an existing pair 0 would contradict the data).

    Args:
        data: Collection supporting ``len()`` and integer indexing whose items
            expose ``'p1'`` and ``'p2'`` keys (values must be hashable).
        rng: Optional ``random.Random`` instance for reproducible sampling.
            Defaults to the module-level ``random`` generator.
        max_retries: Number of consecutive rejected draws tolerated before
            giving up.

    Returns:
        A list of ``len(data)`` dicts with keys ``'p1'``, ``'p2'`` and
        ``'label'`` (always 0). Empty when ``data`` is empty.

    Raises:
        ValueError: If ``max_retries`` consecutive draws are rejected, which
            means ``data`` (almost certainly) admits no further valid pair.
            Previously this situation looped forever.
    """
    chooser = random if rng is None else rng
    count = len(data)

    # Every pair already present in the data; never emit one of these as a negative.
    known_pairs = set()
    for idx in range(count):
        item = data[idx]
        known_pairs.add((item['p1'], item['p2']))

    neg = []
    misses = 0
    while len(neg) < count:
        element1 = chooser.choice(data)
        element2 = chooser.choice(data)
        p11 = element1['p1']
        p12 = element2['p1']
        p22 = element2['p2']

        if p11 != p12 and p11 != p22 and (p11, p22) not in known_pairs:
            neg.append({'p1': p11, 'p2': p22, 'label': 0})
            misses = 0
            continue

        misses += 1
        if misses >= max_retries:
            raise ValueError(
                "could not draw a valid negative pair after %d consecutive attempts"
                % max_retries
            )

    return neg

In [None]:
# Same label-based partition as the earlier cell; both names are rebound again
# by the cells below.
positives = [x for x in dataset if x['label'] == 1]
negatives = [x for x in dataset if x['label'] == 0]

In [None]:
len(dataset)

In [None]:
# Rebinds `positives` to the whole CSV-backed dataset; no label filter is
# applied here, so it holds every row of the file.
positives = ScopeDataset('./')

In [None]:
len(positives)

In [None]:
# Spot-check a single row (needs at least 5001 rows in the file).
positives[5000]

In [None]:
# One randomly paired negative example is generated per row of `positives`.
negatives = getNegativeExamples(positives)
len(negatives)

In [None]:
len(negatives)

In [None]:
# `+` between the ScopeDataset and the list of negatives -- presumably resolved
# by torch's Dataset.__add__ into a concatenated dataset; TODO confirm.
dataset = positives + negatives

In [None]:
len(dataset)

In [None]:
# Materialise every item of the combined dataset as a plain list.
data = [x for x in dataset]

In [None]:
len(data)

In [None]:
import pandas as pd

# One DataFrame row per example.
df = pd.DataFrame(data)


In [None]:
# index=False: otherwise pandas also writes the row index as an extra unnamed
# column, which read_csv loads back as a spurious column on the next run.
# NOTE(review): this is the same file name that ScopeDataset reads, so the
# input file is overwritten with the combined data -- confirm this is intended.
df.to_csv('entailment_scope_negative_sampling.csv', index=False)

In [None]:
train_dataset = []
val_dataset = []
test_dataset = []

# Integer cut points: the first 60% of the pairs go to train, the last 15% to
# validation and everything in between to test. The previous `i <= train_num`
# comparison also sent index `train_num` to train, i.e. one pair more than 60%.
num_pairs = len(positives)
train_num = 60 * num_pairs // 100
val_num = 15 * num_pairs // 100

for i in range(num_pairs):
    # Row i of `positives` and the i-th generated negative always land in the
    # same split, one after the other.
    pair = (positives[i], negatives[i])

    if i < train_num:
        train_dataset.extend(pair)
    elif i >= num_pairs - val_num:
        val_dataset.extend(pair)
    else:
        test_dataset.extend(pair)




In [None]:
print(len(train_dataset), len(val_dataset), len(test_dataset))

In [None]:
pos = [x for x in test_dataset if x['label'] == 1]
neg = [x for x in test_dataset if x['label'] == 0]

print(len(pos), len(neg))

In [None]:
# NOTE: this import rebinds the name `Dataset`, which the first import cell
# bound to torch.utils.data.Dataset; re-running the ScopeDataset cell after
# this one would subclass the `datasets` class instead.
from datasets import load_dataset, Dataset, DatasetDict

# mydataset = Dataset.from_generator(dataset)

# Convert each list of example dicts into a `datasets.Dataset`.
myTraindataset, myValdataset, myTestdataset = (
    Dataset.from_list(split_examples)
    for split_examples in (train_dataset, val_dataset, test_dataset)
)

# Bundle the three splits under the names the later cells index by.
mydataset = DatasetDict(
    {
        "train": myTraindataset,
        "validation": myValdataset,
        "test": myTestdataset,
    }
)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("glue", "mrpc")

# Checkpoint names kept here as alternatives to try:
#   facebook/bart-base
#   bert-base-uncased
#   roberta-base
#   nlpaueb/legal-bert-base-uncased
#   mukund/privbert

# Identifier of the pretrained checkpoint; the model-loading cell further down
# passes the same variable to from_pretrained.
checkpoint = "mukund/privbert"
# Tokenizer loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``p1``/``p2`` fields of ``example`` as a sentence pair.

    Uses the module-level ``tokenizer`` with truncation enabled and returns
    whatever the tokenizer returns.
    """
    first_texts = example["p1"]
    second_texts = example["p2"]
    return tokenizer(first_texts, second_texts, truncation=True)


# Apply tokenize_function to every split. With batched=True the function is
# expected to receive a batch of rows per call (see the `datasets` map docs).
tokenized_datasets = mydataset.map(tokenize_function, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer cells below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Display the (untokenized) DatasetDict.
mydataset

In [None]:
# Inspect the first training example.
mydataset['train'][0]

In [None]:
from transformers import TrainingArguments, Trainer, logging

# training_args = TrainingArguments("test-trainer")

# Base settings, unpacked into TrainingArguments below.
default_args = dict(
    output_dir="./",
    evaluation_strategy="epoch",
    num_train_epochs=6,
    # log_level="error",
    logging_steps=1,
    log_level="info",
    report_to="none",
)

# Run-specific settings layered on top of the base settings.
training_args = TrainingArguments(
    **default_args,
    per_device_train_batch_size=64,
    # gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    # optim="adafactor",
)

In [None]:
# Display the resolved training configuration.
training_args

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the selected checkpoint for sequence classification with two output
# labels, matching the 0/1 labels used throughout this notebook.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from datasets import load_metric
from sklearn.metrics import mean_squared_error
import evaluate
import numpy as np


# Accuracy metric; in the visible cells it is referenced only by the
# commented-out compute_metrics variant at the end of this cell.
accuracy = evaluate.load("accuracy")

# imdb = load_dataset("imdb")

# F1 metric used by compute_metrics below.
f1 = evaluate.load("f1")

# imdb = load_dataset("imdb")


#F1
def compute_metrics(eval_pred):
    """Compute micro, macro and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (argmax is taken over axis 1); when it is a tuple
            of outputs, only its first element is used.

    Returns:
        Dict mapping ``"micro-F1"``, ``"macroF1"`` and ``"weightF1"`` to scalar
        scores. ``f1.compute`` returns a ``{"f1": value}`` dict, so the value
        is unwrapped here; previously the nested dicts were returned as-is.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple of outputs with the logits first
    # (NOTE(review): e.g. the BART option listed in the checkpoint cell -- confirm).
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)

    def _f1_score(average):
        # Unwrap the scalar from the {"f1": value} result.
        result = f1.compute(predictions=predictions, references=labels, average=average)
        return result["f1"]

    return {
        "micro-F1": _f1_score("micro"),
        "macroF1": _f1_score("macro"),
        "weightF1": _f1_score("weighted"),
    }


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)

In [None]:
from transformers import Trainer, TrainerCallback
import torch


# Trainer that fits on the train split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # callbacks=[GPUCallback()]
)

In [None]:
# Show the GPU status of the runtime.
!nvidia-smi

In [None]:
import torch
# Release cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# No path is given -- presumably the model is written to the configured
# output_dir ("./"); TODO confirm.
trainer.save_model()

In [None]:
from transformers import Trainer, TrainerCallback
import torch


trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    # callbacks=[GPUCallback()]
)

In [None]:
trainer.evaluate()