In [None]:
!pip install -U datasets
!pip install transformers
!pip install torch
!pip install sklearn

import random
import torch
import datasets
import re
import json
import os
from transformers import Trainer, TrainingArguments, GPT2DoubleHeadsModel,GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

In [None]:
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())

In [None]:
def load_model():
    """Return a ``GPT2DoubleHeadsModel`` loaded from the pretrained "gpt2" checkpoint."""
    return GPT2DoubleHeadsModel.from_pretrained("gpt2")

# Bind the notebook-wide `model` to the pretrained "gpt2" checkpoint.
model = load_model()

In [None]:
def load_local_model(model_dir="../input/model3/", device=None):
    """Load a previously saved ``GPT2DoubleHeadsModel`` from local files.

    Args:
        model_dir: Directory containing the saved model. Defaults to the
            path this notebook originally hard-coded.
        device: Device to move the model to. When ``None`` (the default),
            "cuda" is used if available, otherwise "cpu", so the cell no
            longer crashes on a CPU-only runtime.

    Returns:
        The loaded model, already moved to the selected device.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # local_files_only=True: never try to reach the Hub for this checkpoint.
    model = GPT2DoubleHeadsModel.from_pretrained(model_dir, local_files_only=True)
    model.to(device)
    return model

# Rebinds the notebook-wide `model` to the locally saved checkpoint.
# NOTE(review): any `model` bound by an earlier cell is discarded here — run
# only the loader cell you actually need.
model = load_local_model()



In [None]:
def load_tokenizer():
    """Build the GPT-2 tokenizer and register this notebook's special tokens.

    Starts from the pretrained "gpt2" tokenizer and adds the BOS, EOS and
    PAD tokens plus the ``<TITLE>`` separator token.

    Returns:
        The tokenizer with the special tokens added.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    gpt2_tokenizer.add_special_tokens(
        {
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'pad_token': '<pad>',
            'additional_special_tokens': ['<TITLE>'],
        }
    )
    return gpt2_tokenizer

# Create the tokenizer, then resize the model's token embeddings to the
# tokenizer's length, since load_tokenizer() registers extra special tokens.
tokenizer = load_tokenizer()
model.resize_token_embeddings(len(tokenizer))

In [None]:
import csv
import os, glob
import pandas as pd

def edit_dataset(source_path="../input/new-title-abstract-pairs/new_title_abstract_pairs.csv",
                 target_path="Advaced_combined.csv",
                 max_chars=1024):
    """Convert the raw CSV into a single-column training CSV.

    Every data row becomes one string of the form
    ``row[2] + " <TITLE> " + row[1] + ' <|endoftext|> '``. The first row of
    the source file is treated as a header and replaced by the column name
    ``inputs`` in the output.

    Args:
        source_path: CSV to read. Columns at index 1 and 2 are used.
        target_path: CSV to write (one ``inputs`` column).
        max_chars: Samples longer than this many *characters* are dropped.
            Note this is a character count, not a token count.

    Side effects:
        Prints the maximum and average sample length (in characters, over
        data rows only) and writes ``target_path``.
    """
    with open(source_path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        next(reader, None)  # drop the header row; tolerate an empty file
        samples = [
            row[2] + " <TITLE> " + row[1] + ' <|endoftext|> '
            for row in reader
            if len(row) > 2  # skip blank or truncated rows instead of raising IndexError
        ]

    # Guard against an empty dataset (max() of [] and division by zero).
    lengths = [len(sample) for sample in samples]
    print("Max length:", max(lengths) if lengths else 0)
    print("Average length:", sum(lengths) / len(lengths) if lengths else 0)

    with open(target_path, 'w', newline='') as newfile:
        writer = csv.writer(newfile, delimiter=',')
        writer.writerow(["inputs"])  # header expected by the CSV loader downstream
        for sample in samples:
            if len(sample) <= max_chars:
                writer.writerow([sample])
    
# Build the single-column training CSV (Advaced_combined.csv) from the raw input CSV.
edit_dataset()


In [None]:
def load_dataset(tokenizer, path, size=-1):
    """Load the training CSV, split it 70/30 and build language-modelling datasets.

    The previous version ``json.dump``-ed the tokenized split (token ids and
    attention masks included) into the files that ``TextDataset`` reads as
    *raw text*, so the model was trained on JSON and number strings. This
    version writes only the ``inputs`` text, one sample per line.

    Args:
        tokenizer: Tokenizer used both for the per-row tokenization of the
            returned ``test`` split and for ``TextDataset``.
        path: Path to a CSV file with an ``inputs`` column.
        size: Currently unused; kept so existing callers keep working.

    Returns:
        A 4-tuple ``(train_dataset, test_dataset, data_collator, test)``:
        two ``TextDataset`` objects (block size 128), a causal-LM
        ``DataCollatorForLanguageModeling`` and the held-out 30% split with
        its tokenized columns.
    """
    dataset = datasets.load_dataset("csv", data_files=path)
    print(dataset)

    def tokenize_input(examples):
        return tokenizer(examples["inputs"], max_length=1024, padding="max_length", truncation=True)

    tokenized = dataset.map(tokenize_input, batched=True)

    # Use the dataset's own splitter: it returns proper `Dataset` objects,
    # unlike sklearn's splitter applied to a `Dataset`.
    split = tokenized["train"].train_test_split(test_size=0.3)
    train, test = split["train"], split["test"]

    train_path = "train_dataset.txt"
    test_path = "test_dataset.txt"
    for split_rows, split_path in ((train, train_path), (test, test_path)):
        # Plain text only; utf-8 matches what TextDataset uses when reading.
        with open(split_path, "w", encoding="utf-8") as outfile:
            outfile.write("\n".join(text for text in split_rows["inputs"] if text))

    # overwrite_cache=True: the split is random and the text files are
    # rewritten on every call, so a cached feature file would be stale.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128,
        overwrite_cache=True)

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128,
        overwrite_cache=True)

    # mlm=False -> causal language modelling (labels are the input ids).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator, test


# Build the train/eval datasets and the collator from the generated CSV.
# NOTE(review): `size=100` is accepted by load_dataset() but never used, so the
# full CSV is loaded.
train_dataset, eval_dataset,data_collator,test = load_dataset(tokenizer, path="./Advaced_combined.csv", size=100)
print(train_dataset)
print(eval_dataset)
print(data_collator)

In [None]:
def train_model(model, train_dataset, eval_dataset, data_collator):
    """Fine-tune ``model`` with the Hugging Face ``Trainer``.

    Trains for 4 epochs with a per-device batch size of 32 and 8 gradient
    accumulation steps, evaluating on ``eval_dataset`` after every epoch.
    Checkpoints and logs go to ``./output``.

    Args:
        model: The model to fine-tune (updated in place).
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for the per-epoch evaluation.
        data_collator: Collator that turns dataset items into batches.
    """
    training_args = TrainingArguments(
        output_dir="./output",
        do_train=True,
        evaluation_strategy="epoch",
        num_train_epochs=4,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        fp16=False,
        learning_rate=3e-5,
        gradient_accumulation_steps=8,
        # The string "none" disables logging integrations; the Python value
        # None does not (it falls back to the library default).
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    trainer.train()


# Fine-tune the notebook-wide `model` on the datasets built above.
train_model(model, train_dataset, eval_dataset, data_collator)



In [None]:
# Save the model to the current working directory.
# NOTE(review): only the model is saved here; the tokenizer is not written
# alongside it.
model.save_pretrained("./")

In [None]:
def compute_bertScore(model, train_dataset, eval_dataset, data_collator):
    """Evaluate ``model`` on ``eval_dataset`` and report BERTScore.

    Greedy (argmax) next-token predictions are decoded back to text and
    compared with the decoded reference tokens. BERTScore works on strings,
    so token-id arrays cannot be passed to it directly.

    Args:
        model: The model to evaluate.
        train_dataset: Passed through to ``Trainer`` (not used for evaluation).
        eval_dataset: Dataset to evaluate on.
        data_collator: Batch collator; its ``tokenizer`` attribute is used to
            decode token ids back to text.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()``.

    Raises:
        ValueError: If no tokenizer can be found for decoding.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    model.eval()
    # NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
    # releases — confirm the installed version still provides it.
    metric = datasets.load_metric("bertscore")

    # `gpu_usage` is not defined in the visible notebook; call it only if some
    # other cell provides it, instead of failing with NameError.
    report_gpu = globals().get("gpu_usage")
    if callable(report_gpu):
        report_gpu()
    torch.cuda.empty_cache()
    if callable(report_gpu):
        report_gpu()

    tokenizer = getattr(data_collator, "tokenizer", None)
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        raise ValueError("compute_bertScore needs a tokenizer: data_collator has no `tokenizer` attribute")

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # Multi-head models return a tuple of outputs; the LM logits come first.
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        predictions = np.argmax(logits, axis=-1)
        labels = np.asarray(labels)

        # Causal LM: the prediction at position i targets the token at i + 1.
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]

        # -100 marks positions ignored by the loss; it is not a valid token id.
        ignored = labels == -100
        predictions = np.where(ignored, pad_id, predictions)
        labels = np.where(ignored, pad_id, labels)

        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_references = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # assumes the corpus is English — TODO confirm before trusting scores
        scores = metric.compute(
            predictions=decoded_predictions,
            references=decoded_references,
            lang="en",
        )
        # Trainer expects scalar metrics, so average the per-sample scores.
        return {
            "bertscore_precision": float(np.mean(scores["precision"])),
            "bertscore_recall": float(np.mean(scores["recall"])),
            "bertscore_f1": float(np.mean(scores["f1"])),
        }

    training_args = TrainingArguments("test_trainer")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return trainer.evaluate()
# Run the BERTScore evaluation of the notebook-wide `model` on the eval split.
compute_bertScore(model, train_dataset, eval_dataset, data_collator)