In [2]:
# !pip install torch
# !pip install tensorflow
# !pip install huggingface_hub
# !pip install transformers
# !pip install datasets
# !pip install accelerate
# !pip install bitsandbytes
# !pip install trl
# !pip install peft

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Select CUDA device index
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_name = "google/flan-t5-large"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
tokenizer = AutoTokenizer.from_pretrained(model_name)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [5]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [6]:
from peft import LoraConfig, get_peft_model, TaskType


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


lora_config = LoraConfig(
    r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
)


model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 787868672 || trainable%: 0.5989059049678777


In [7]:
def getFullAnswer(options,answer):
  lst = options.split(" , ")
  for i in lst:
    if str(i[0]) == answer:
      return i
  return "answer"

In [9]:
from datasets import load_from_disk
from datasets import concatenate_datasets
from datasets import Dataset, DatasetDict
import pandas as pd

def cleanData(data):
    df = data.to_pandas()
    for id in range(len(df.index)):
        if len(df["answer"][id]) > 256 or (df["correct"][id] == "answer"):
            df = df.drop(id)
    dataset = Dataset.from_pandas(df)
    return dataset

# Load the saved dataset
dataset = load_from_disk('my_dataset')
df = pd.read_csv('dataset.csv')
part1 = df.sample(frac=0.8, random_state=1)
remaining = df.drop(part1.index)
part2 = remaining.sample(frac=0.5, random_state=1)
part3 = remaining.drop(part2.index)

dataset = DatasetDict({'train': concatenate_datasets([dataset["train"],Dataset.from_pandas(part1)]),
                       'validation': concatenate_datasets([dataset["validation"],Dataset.from_pandas(part2)]), 
                       'test': concatenate_datasets([dataset["test"],Dataset.from_pandas(part3)])})
#data = load_dataset("math_qa")
#data = data.map(lambda samples: tokenizer(samples["Problem"]), batched=True)

#data_train = Dataset.from_dict({'answer': data["train"]["Rationale"]
                                # , 'correct' : [getFullAnswer(options,answer) for options,answer in zip(data["train"]["options"],data["train"]["correct"])]
                                # , 'label':["Correct" for _ in range(len(data["train"]))]})
#dataset_train = concatenate_datasets([dataset["train"], data_train])
# dataset = DatasetDict({'train': cleanData(dataset["train"]),'validation': cleanData(dataset["validation"]), 'test': cleanData(dataset["test"])})
dataset = dataset.map(
    lambda x: {"sentences": ["Context: " + correct + "\nAnswer: " + answer + "\nIs the answer Correct or Wrong?" 
                             for correct,answer in zip(x["correct"],x["answer"])]},
    batched=True,
    num_proc=1,
)
# loading dataset
# dataset = load_dataset("financial_phrasebank", "sentences_allagree")
# dataset = dataset["train"].train_test_split(test_size=0.1)
# dataset["validation"] = dataset["test"]
# del dataset["test"]

# classes = dataset["train"].features["label"].names
# dataset = dataset.map(
#     lambda x: {"text_label": [classes[label] for label in x["label"]]},
#     batched=True,
#     num_proc=1,
# )


Map:   0%|          | 0/680 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [10]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['answer', 'correct', 'label', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', '__index_level_0__', 'sentences'],
        num_rows: 680
    })
    validation: Dataset({
        features: ['answer', 'correct', 'label', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', '__index_level_0__', 'sentences'],
        num_rows: 110
    })
    test: Dataset({
        features: ['answer', 'correct', 'label', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', '__index_level_0__', 'sentences'],
        num_rows: 110
    })
})


In [11]:
print(dataset['train']['answer'][679])
print(dataset['train']['correct'][679])
print(dataset['train']['label'][679])
print(dataset['train']['sentences'][679])

2IZ>=(g>cw:q8v(R?:0NI
b) 40
Wrong
Context: b) 40
Answer: 2IZ>=(g>cw:q8v(R?:0NI
Is the answer Correct or Wrong?


In [12]:
# data preprocessing
text_column = "sentences"
label_column = "label"
max_length = 128


def preprocess_function(examples):
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

Running tokenizer on dataset:   0%|          | 0/680 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/110 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/110 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    "temp",
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    num_train_epochs=1,
    save_steps=100,
    save_total_limit=8,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

2024-04-20 23:56:14.398313: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 23:56:14.408206: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-20 23:56:14.445324: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment va

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.043684


TrainOutput(global_step=85, training_loss=0.4204018985523897, metrics={'train_runtime': 220.9283, 'train_samples_per_second': 3.078, 'train_steps_per_second': 0.385, 'total_flos': 394274789130240.0, 'train_loss': 0.4204018985523897, 'epoch': 1.0})

In [15]:
import random
id = 3
input_text = "Context: " + str(dataset["test"]["correct"][id]) + "\nAnswer: " + str(dataset["test"]["answer"][id]) + "\nIs the answer Correct or Wrong?"
#inputs = tokenizer(input_text, return_tensors="pt")
inputs = tokenizer(input_text, return_tensors="pt").to("cuda:0")
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)

if response.find("Correct") != -1 and dataset["test"]["label"][id] == "Correct":
    print("True")
elif response.find("Wr") != -1 and dataset["test"]["label"][id] == "Wrong":
    print("True")
else:
    print("False")

print("input sentence: ", input_text)
print(" output prediction: ", response)
#print(" output prediction: ", tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

True
input sentence:  Context: d ) 292
Answer: so 12.2 * 24 = 292 . . imo option d is correct answer . .
Is the answer Correct or Wrong?
 output prediction:  Correct


In [16]:
accuracy = 0
for id in range(len(dataset["test"]["answer"])):
    fullAnswer = str(dataset["test"]["correct"][id])
    studentAnswer = str(dataset["test"]["answer"][id])
    input_text = "Context: " + fullAnswer + "\nAnswer: " + studentAnswer + "\nIs the answer Correct or Wrong?"
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda:0")
    outputs = model.generate(**inputs)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if response.find("Correct") != -1 and dataset["test"]["label"][id] == "Correct":
        accuracy += 1
    elif response.find("Wr") != -1 and dataset["test"]["label"][id] == "Wrong":
        accuracy += 1
    
print("Accuracy: "+ str(100*accuracy/len(dataset["test"]["answer"])) + "%")

Accuracy: 99.0909090909091%


In [17]:
correct = 0
wrong = 0
for id in range(len(dataset["test"]["answer"])):
    if dataset["test"]["label"][id] == "Wrong":
        wrong+=1
    if dataset["test"]["label"][id] == "Correct":
        correct+=1
print("Correct: "+ str(100*correct/len(dataset["test"]["answer"])) + "%")
print("Wrong: "+ str(100*wrong/len(dataset["test"]["answer"])) + "%")

Correct: 51.81818181818182%
Wrong: 48.18181818181818%


In [18]:
model.save_pretrained("teacherModelV2",safe_serialization=False)

In [19]:
import shutil
import zipfile

# Create a zip file of the folder
shutil.make_archive('teacherModelV2', 'zip', 'teacherModelV2')

'/home/student/teacherModelV2.zip'