In [None]:
!pip install pytesseract peft evaluate tqdm transformers datasets rouge-score accelerate nltk tensorboard jupyter-black py7zr --upgrade
!apt-get install git --yes
!apt-get install git-lfs --yes

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Collecting peft
  Using cached peft-0.5.0-py3-none-any.whl (85 kB)
Collecting transformers
  Using cached transformers-4.34.0-py3-none-any.whl (7.7 MB)
Collecting datasets
  Using cached datasets-2.14.5-py3-none-any.whl (519 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Installing collected packages: tokenizers, transformers, datasets, peft
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.27.2
    Uninstalling transformers-4.27.2:
      Successfully uninstalled transformers-4.27.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.11.0
    Uninstalling datasets-2.11.0:
      Successfully uninstalled datasets-2.11.0
  Attempting uninstall: peft
  

In [None]:
import json
import pandas
import jupyter_black
from datetime import timedelta
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Human-readable name of your model.
MODEL_NAME = "C++ teacher AI"

# Id of the base model to train. Other sizes (small, base, large, xl, ...)
# can be used; the bigger the model, the more GPU memory is needed.
MODEL_ID = "google/flan-t5-base"

# Number of epochs to train.
NUM_TRAIN_EPOCHS = 4

In [None]:
# Load the tokenizer and the base seq2seq model identified by MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
original_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

In [None]:
# Seed for the train/test split, fixed so that every run of the notebook
# trains and evaluates on the same examples.
SPLIT_SEED = 42

# Read the raw examples. The encoding is given explicitly because the default
# used by `open` depends on the platform locale.
with open("data.json", "r", encoding="utf-8") as f:
    raw_records = json.load(f)

# create a dataframe
df = pandas.DataFrame(raw_records)

# Hold out 10% of the rows as the test split.
data = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [None]:
def _format_scoring_guide(guide):
    """Render a scoring guide as one '<point> points - <criteria>' line per entry."""
    return '\n'.join(str(entry['point']) + ' points - ' + entry['criteria'] for entry in guide)


def _build_prompt(example, i):
    """Assemble the grading prompt for row ``i`` of the batch ``example``.

    The layout, including the leading whitespace of every line, is part of the
    text fed to the model and is intentionally left exactly as it was.
    """
    scoring_guide = _format_scoring_guide(example['scoring_guide'][i])
    return f"""
        {example['task'][i]}

        Question:
        {example['question'][i]}

        Maximum score:
        {example['maximum_score'][i]}

        Scoring guide:
        {scoring_guide}

        Answer:
        {example['answer'][i]}

        Score:

      """


def tokenize_function(example):
    """Tokenize one batch of grading examples for seq2seq fine-tuning.

    Args:
        example: A batch (mapping of column name -> list of values) with the
            columns 'id', 'task', 'question', 'maximum_score', 'scoring_guide'
            (a list of {'point', 'criteria'} mappings per row), 'answer' and
            'score'.

    Returns:
        The same batch object with these columns set:
          * 'input_ids' / 'attention_mask': the tokenized prompt, padded to
            the tokenizer's maximum length and truncated if longer.
          * 'labels': the tokenized score, where every padding position is
            replaced by -100.
          * 'score': the scores converted to strings.

    Relies on the module-level ``tokenizer``.
    """
    # Label value that Hugging Face seq2seq models exclude from the loss.
    ignore_index = -100

    size = len(example['id'])
    prompts = [_build_prompt(example, i) for i in range(size)]

    # Build a new list instead of rewriting the caller's list element by element.
    score_texts = [str(score) for score in example['score']]

    # No `return_tensors`: plain Python lists are what `Dataset.map` stores,
    # and they let the padding be masked without importing a tensor library.
    encoded_prompts = tokenizer(prompts, padding="max_length", truncation=True)
    encoded_scores = tokenizer(score_texts, padding="max_length", truncation=True)

    pad_id = tokenizer.pad_token_id

    example['score'] = score_texts
    example['input_ids'] = encoded_prompts.input_ids
    # Keep the mask so the model does not attend to the padding tokens.
    example['attention_mask'] = encoded_prompts.attention_mask
    # Without this masking the loss is dominated by the padding positions.
    example['labels'] = [
        [ignore_index if token == pad_id else token for token in row]
        for row in encoded_scores.input_ids
    ]

    return example

# `data` holds two splits (train and test). `map` runs tokenize_function over
# every split in batches; afterwards the raw columns are dropped so that only
# the tokenized features remain.
tokenized_datasets = data.map(tokenize_function, batched=True).remove_columns(
    ["id", "task", "question", "maximum_score", "scoring_guide", "answer", "score"]
)

print(tokenized_datasets)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

5
<class 'datasets.formatting.formatting.LazyBatch'>


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

1
<class 'datasets.formatting.formatting.LazyBatch'>
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
})


### PEFT

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings used to wrap the base model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    r=8,  # rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],
    bias="none",
)

In [None]:
# Wrap the base model with the adapters described by `lora_config`.
peft_model = get_peft_model(original_model, lora_config)

# Disabled device placement and parameter-count reporting, kept for reference:
#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#device = 0 if torch.cuda.is_available() else torch.device("cpu")  # same as the line above
#peft_model.to(device)
#print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Output directory handed to TrainingArguments.
output_dir = "./peft-dialogue-summary-trainin"

# NOTE(review): per the TrainingArguments documentation a positive `max_steps`
# overrides `num_train_epochs`, so this configuration performs a single
# training step. Confirm that this is meant as a smoke test.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

# Trainer for the LoRA-wrapped model; only a training split is supplied.
peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run fine-tuning with `peft_trainer`. The call is left as a bare expression
# so that the notebook displays its return value.
peft_trainer.train()

# Disabled: save the trained adapter and the tokenizer to a local directory.
#peft_model_path="./peft-dialogue-summary-checkpoint-local"
#peft_trainer.model.save_pretrained(peft_model_path)
#tokenizer.save_pretrained(peft_model_path)

