In [None]:
!pip install pytesseract peft evaluate tqdm transformers datasets rouge-score accelerate nltk tensorboard jupyter-black py7zr --upgrade
!apt-get install git --yes
!apt-get install git-lfs --yes

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Collecting peft
  Using cached peft-0.5.0-py3-none-any.whl (85 kB)
Collecting evaluate
  Using cached evaluate-0.4.1-py3-none-any.whl (84 kB)
Collecting transformers
  Using cached transformers-4.34.0-py3-none-any.whl (7.7 MB)
Collecting datasets
  Using cached datasets-2.14.5-py3-none-any.whl (519 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Installing collected packages: tokenizers, transformers, datasets, evaluate, peft
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.27.2
    Uninstalling transformers-4.27.2:
      Successfully uninstalled transformers-4.27.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.11.0
    Uninstalling datasets-2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.10).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [None]:
import json
import pandas
import torch
import jupyter_black
from datetime import timedelta
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

# Notebook-wide configuration.
# Name of the fine-tuned model (not referenced elsewhere in the visible notebook).
MODEL_NAME = "C++ teacher AI"
# Hugging Face id of the base checkpoint to fine-tune. Other FLAN-T5 sizes
# (small, base, large, xl, ...) can be substituted; larger ones need more GPU memory.
MODEL_ID = "google/flan-t5-large"
# Number of epochs to train.
# NOTE(review): the TrainingArguments cell does not read this constant and sets
# its own schedule — confirm which value is intended.
NUM_TRAIN_EPOCHS = 4

In [None]:
# Load the base seq2seq checkpoint and its matching tokenizer
# (both are downloaded on first use).
original_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Read the raw grading examples. The encoding is explicit so the result does
# not depend on the platform's default locale.
with open("data.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Lay the records out as columns so Dataset.from_pandas can ingest them.
df = pandas.DataFrame(records)

# A fixed seed keeps the train/test split identical across runs; without it
# every restart draws a different split, so training results and any example
# picked by index later change from run to run.
data = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

In [None]:
def tokenize_function (example):
  """Tokenize a batch of grading examples into model inputs and labels.

  Meant to be used with ``map(..., batched=True)``: ``example`` is a batch in
  which every column ('id', 'task', 'question', 'maximum_score',
  'scoring_guide', 'answer', 'score') is a list with one entry per row.

  Adds three columns to the batch and returns it:
    * ``input_ids`` / ``attention_mask`` - the tokenized grading prompt, padded
      or truncated to the tokenizer's maximum length. The mask lets the model
      ignore the padding positions.
    * ``labels`` - the tokenized score (as text), with padding positions set to
      -100 so they do not contribute to the training loss.
  """
  line_break = '\n'
  size = len(example['id'])

  prompt = [
      f"""
        {example['task'][i]}

        Question:
        {example['question'][i]}

        Maximum score:
        {example['maximum_score'][i]}

        Scoring guide:
        {line_break.join(str(str(z['point']) + ' points - ' + z['criteria']) for z in example['scoring_guide'][i])}

        Answer:
        {example['answer'][i]}

        Score:

      """
      for i in range(size)]

  # The tokenizer needs text targets; build a new list instead of rewriting
  # the 'score' column of the incoming batch in place.
  targets = [str(score) for score in example['score']]

  model_inputs = tokenizer (prompt, padding="max_length", truncation=True, return_tensors="pt")
  labels = tokenizer (targets, padding="max_length", truncation=True, return_tensors="pt").input_ids
  # -100 is the index the loss function ignores; without this the loss is
  # dominated by predicting padding tokens after the short score string.
  labels[labels == tokenizer.pad_token_id] = -100

  example['input_ids'] = model_inputs.input_ids
  example['attention_mask'] = model_inputs.attention_mask
  example['labels'] = labels

  return example

# `data` holds the 'train' and 'test' splits produced by train_test_split;
# map() runs tokenize_function over each split in batches.
tokenized_datasets = data.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score'])

print(tokenized_datasets)


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

21
<class 'datasets.formatting.formatting.LazyBatch'>


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

6
<class 'datasets.formatting.formatting.LazyBatch'>
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 21
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 6
    })
})


In [None]:
def tokenize_function_v2 (example):
  """Tokenize a batch for the free-text assessment task.

  Meant to be used with ``map(..., batched=True)``. Reads the columns 'id',
  'task', 'scoring_guide', 'answer' and 'assessment'.
  NOTE(review): this is a different schema from the one tokenize_function
  reads ('question', 'maximum_score', 'score') — confirm which data.json
  layout is in use before running this cell.

  The prompt shows the model the task, the scoring guide and the student's
  *answer*; the reference *assessment* is used only as the label. (The earlier
  version put the assessment itself into the prompt, leaking the target.)

  Adds ``input_ids``, ``attention_mask`` and ``labels`` to the batch and
  returns it. Label padding positions are set to -100 so the loss skips them.
  """
  size = len(example['id'])

  prompt = [
      f"""Given the following task: '{example['task'][i]}' and the scoring guide: "{example['scoring_guide'][i]}" how would you assess the answer: "{example['answer'][i]}" Assessment: """ for i in range(size)]

  model_inputs = tokenizer (prompt, padding="max_length", truncation=True, return_tensors="pt")
  labels = tokenizer (example['assessment'], padding="max_length", truncation=True, return_tensors="pt").input_ids
  # -100 is the index the loss function ignores; padding must not be a target.
  labels[labels == tokenizer.pad_token_id] = -100

  example['input_ids'] = model_inputs.input_ids
  example['attention_mask'] = model_inputs.attention_mask
  example['labels'] = labels

  return example

# `data` holds the 'train' and 'test' splits produced by train_test_split;
# map() runs tokenize_function_v2 over each split in batches.
# NOTE: this rebinds `tokenized_datasets`, replacing any earlier tokenization.
tokenized_datasets = data.map(tokenize_function_v2, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'task', 'scoring_guide', 'answer', 'assessment'])

print(tokenized_datasets['train'])
# To inspect a label by decoding it, drop the -100 positions first: they are
# not valid token ids.


### PEFT

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings used to fine-tune the seq2seq base model.
lora_config = LoraConfig(
    r=8,  # rank of the LoRA update matrices
    lora_alpha = 32,
    # Names of the modules that receive adapters.
    # presumably T5's attention query/value projections — TODO confirm
    target_modules = ["q", "v"],
    lora_dropout = 0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # FLAN-T5 is a sequence-to-sequence model
)

In [None]:
MODEL_ID

'google/flan-t5-large'

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config.
peft_model = get_peft_model(original_model,
                            lora_config)
# Optional manual device placement, left disabled:
#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#device = 0 if torch.cuda.is_available() else torch.device("cpu") # Same as the line above
#peft_model.to(device)
# NOTE(review): print_number_of_trainable_model_parameters is not defined in
# the visible notebook, so the line below would fail if re-enabled.
#print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Directory the Trainer writes its checkpoints to.
output_dir = './peft-dialogue-summary-trainin'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    logging_steps=1,
    # max_steps alone bounds the run. The num_train_epochs=100 that used to be
    # passed here had no effect, because a positive max_steps overrides it.
    max_steps=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Run fine-tuning with the trainer configured above.
peft_trainer.train()

# Save the trained PEFT model and the tokenizer into the same directory so the
# checkpoint can be reloaded later from that one path.
peft_model_path="./peft-dialogue-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



In [None]:
from peft import PeftModel
# Reload a fresh copy of the base checkpoint (requested in bfloat16) and its tokenizer.
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Attach the adapter saved by the training cell (same directory as
# peft_model_path there); is_trainable=False loads it for inference only.
# NOTE: this rebinds `peft_model`, replacing the in-memory model used for training.
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './peft-dialogue-summary-checkpoint-local',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)


In [None]:
# Build the grading prompt for one training example and show it next to the
# reference score.
ind = 3
line_break = '\n'

# Fetch the chosen row once, then unpack the fields the prompt needs.
sample = data['train'][ind]
task = sample['task']
question = sample['question']
max_score = sample['maximum_score']
scoring_guide = sample['scoring_guide']
answer = sample['answer']
score = sample['score']

# One "<points> points - <criteria>" line per scoring-guide entry.
guide_lines = [str(entry['point']) + " points - " + entry['criteria'] for entry in scoring_guide]
guide_text = line_break.join(guide_lines)

prompt = f"""
{task}

Question:
{question}

Maximum score:
{max_score}

Scoring guide:
{guide_text}

Answer:
{answer}

Score:

"""

print(prompt)
print(score)


Rate the answer to the question.

Question:
What is the purpose of the 'const' keyword in C++? Provide examples.

Maximum score:
3

Scoring guide:
2 points - The 'const' keyword in C++ is used to indicate that a variable's value cannot be modified after it is initialized.
1 points - Example

Answer:
The 'const' keyword in C++ is used to indicate that a variable's value cannot be modified after it is initialized.

Score:


2


In [None]:
# Hand-written grading prompt used for a manual inference check. It has no
# placeholders, so it is a plain string literal rather than an f-string.
prompt = """
Rate the answer to the question.

Question:
Explain the difference between malloc() and new in C++.

Maximum score:
6

Scoring guide:
2 points – malloc() is a C function used for dynamic memory allocation, while new is a C++ operator that also allocates memory dynamically.

2 points – Memory allocated with malloc() must be explicitly deallocated with free(), whereas memory allocated with new should be deallocated with delete.

2 points – new is type-safe, whereas malloc() returns a void pointer, requiring explicit type casting.

Answer:
malloc() is a C function used for dynamic memory allocation, while new is a C++ operator that also allocates memory dynamically. new not only allocates memory but also calls the constructor for an object. Memory allocated with malloc() must be explicitly deallocated with free(), whereas memory allocated with new should be deallocated with delete. Additionally, new is type-safe, whereas malloc() returns a void pointer, requiring explicit type casting.

Score:

      """

In [None]:
# Tokenize the prompt once and reuse the encoding (it used to be tokenized
# twice, with the first result discarded).
inputs = tokenizer(prompt, return_tensors='pt')
input_ids = inputs.input_ids

# Generate with a single beam, capped at 200 new tokens, then decode the first
# (only) returned sequence without special tokens.
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(peft_model_text_output)

new


In [None]:
# Generate from `original_model` for comparison.
# The earlier version read `prompt2`, which is never defined in the visible
# notebook and fails on a fresh Restart & Run All; it now uses `prompt`.
# The inputs follow the model's own device instead of a hard-coded "cuda", so
# the cell also runs on CPU-only machines.
# NOTE(review): get_peft_model was called on this same model object earlier —
# confirm it still represents the un-adapted baseline.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(original_model.device)

outputs = original_model.generate(input_ids)