In [None]:
!pip install torch transformers datasets peft accelerate

In [None]:
import os
import shutil

import torch
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import autocast
from torch.optim import AdamW
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, Trainer,
                          TrainingArguments, get_scheduler)
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Fetch both source corpora with `datasets.load_dataset`, then show their
# structure so the split and column names can be checked before preprocessing.
print("Loading datasets...")
fitness_dataset = load_dataset(
    "chibbss/fitness-chat-prompt-completion-dataset"
)
code_feedback_dataset = load_dataset(
    "m-a-p/CodeFeedback-Filtered-Instruction"
)

print(fitness_dataset, code_feedback_dataset, sep="\n")

In [None]:
def preprocess_dataset(dataset, prefix, input_col, output_col):
    """Project a dataset onto a uniform ``instruction`` / ``output`` schema.

    Args:
        dataset: Object exposing ``map`` and ``column_names`` (a ``datasets``
            split, or a dict-like collection of splits).
        prefix: Task tag prepended to every instruction as ``"<prefix>: "``.
        input_col: Name of the column holding the prompt text.
        output_col: Name of the column holding the target text.

    Returns:
        The mapped dataset containing only ``instruction`` and ``output``.
        All original columns are dropped so datasets with different source
        schemas end up with identical features and can be concatenated.
    """
    def _to_pair(example):
        return {
            "instruction": f"{prefix}: {example[input_col]}",
            "output": example[output_col],
        }

    columns = dataset.column_names
    if isinstance(columns, dict):
        # Collection of splits: column_names is {split: [cols]}; flatten it
        # into one ordered, de-duplicated list.
        columns = list(dict.fromkeys(c for cols in columns.values() for c in cols))

    # Columns listed in remove_columns are removed before the function output
    # is merged, so "instruction"/"output" produced by _to_pair are kept even
    # when the source columns carry the same names.
    return dataset.map(_to_pair, remove_columns=columns)

# Use new names for the preprocessed splits instead of rebinding
# `fitness_dataset` / `code_feedback_dataset`: those still hold the loaded
# objects indexed by 'train', so this cell can be re-run without failing.
fitness_prepared = preprocess_dataset(fitness_dataset['train'], "[fitness]", "instruction", "output")
code_feedback_prepared = preprocess_dataset(code_feedback_dataset['train'], "[code_feedback]", "query", "answer")

# Merge both tasks and shuffle with a fixed seed so the mix is reproducible.
combined_dataset = concatenate_datasets([fitness_prepared, code_feedback_prepared]).shuffle(seed=42)

In [None]:
# NOTE(review): the tokenizer is loaded from a different directory than the
# model ("D:/cuda/final_model") — confirm both share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("D:/testing/text_guru_model")

def tokenize_function(examples):
    """Tokenize a batch of instruction/output pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``'instruction'`` and ``'output'`` lists
            of strings (as passed by ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the instructions, with an added ``'labels'``
        entry holding the target token ids. Inputs and labels are truncated
        and padded to 512 tokens; padded label positions are set to -100.
    """
    model_inputs = tokenizer(
        examples['instruction'], truncation=True, padding='max_length', max_length=512
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples['output'], truncation=True, padding='max_length', max_length=512
    )

    # -100 is the index ignored by the cross-entropy loss; without this the
    # model is also trained to predict the padding that fills most of each
    # 512-token label row.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# Seed the split: an unseeded split differs on every run, so resuming from a
# checkpoint would evaluate on examples the model was already trained on.
tokenized_datasets = tokenized_datasets.remove_columns(['instruction', 'output']).train_test_split(test_size=0.1, seed=42)

In [None]:
# Load the base seq2seq checkpoint, wrap it with a LoRA adapter, and place the
# wrapped model on the GPU when one is available.
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained("D:/cuda/final_model")

lora_config = LoraConfig(
    lora_dropout=0.1,
    lora_alpha=32,
    r=8,
    task_type="SEQ_2_SEQ_LM",
)
model = get_peft_model(model, lora_config)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print(f"Model is on device: {device}")

In [None]:
training_args = TrainingArguments(
    # --- output, evaluation and checkpointing ---
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # --- optimisation ---
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,

    # --- logging ---
    logging_dir='./logs',
    logging_steps=10,
    report_to="tensorboard",
    disable_tqdm=False,
)

In [None]:
# A custom optimizer/scheduler pair is handed to the Trainer, so the related
# fields of `training_args` are not applied automatically. Read every
# hyperparameter from `training_args` so the two cannot drift apart.
optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Optimizer steps = examples // (per-device batch * accumulation) * epochs.
# int() keeps the count integral even if num_train_epochs is given as a float.
num_training_steps = int(
    len(tokenized_datasets['train']) // (
        training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
    ) * training_args.num_train_epochs
)
lr_scheduler = get_scheduler(
    training_args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
class CustomTrainer(Trainer):
    """Trainer that prunes old ``checkpoint-<step>`` directories after a save.

    The base ``Trainer`` neither defines nor calls ``save_checkpoint``; its
    checkpoint hook is ``_save_checkpoint``. Pruning is therefore attached to
    that hook, and ``save_checkpoint`` is kept as a manual entry point that no
    longer relies on a non-existent parent method.
    """

    # Number of most recent checkpoint directories kept on disk.
    max_checkpoints = 5

    def _save_checkpoint(self, *args, **kwargs):
        """Write the checkpoint through the base class, then prune old ones.

        Arguments are forwarded untouched because the signature of this hook
        differs between transformers releases.
        """
        result = super()._save_checkpoint(*args, **kwargs)
        # Only the process that writes checkpoints may delete them.
        if self.args.should_save:
            self._prune_checkpoints()
        return result

    def save_checkpoint(self, output_dir=None):
        """Save the model with the public ``save_model`` API, then prune.

        Args:
            output_dir: Target directory passed to ``save_model``; ``None``
                lets ``save_model`` pick its default location.
        """
        self.save_model(output_dir)
        self._prune_checkpoints()

    def _prune_checkpoints(self):
        """Delete all but the newest ``max_checkpoints`` checkpoint folders.

        Only directories named exactly ``checkpoint-<digits>`` inside
        ``self.args.output_dir`` are considered, ordered by step number. The
        checkpoint recorded as best in the trainer state is never removed.
        """
        root = self.args.output_dir
        if not os.path.isdir(root):
            return

        numbered = []
        for entry in os.listdir(root):
            stem, _, step = entry.rpartition("-")
            # Skip stray files and names such as "checkpoints" or
            # "checkpoint-tmp" that would break an int() conversion.
            if stem == "checkpoint" and step.isdigit() and os.path.isdir(os.path.join(root, entry)):
                numbered.append((int(step), entry))
        numbered.sort()

        best = self.state.best_model_checkpoint
        best_name = os.path.basename(os.path.normpath(best)) if best else None

        for _, ckpt_to_delete in numbered[:-self.max_checkpoints]:
            if ckpt_to_delete == best_name:
                # Needed later when the best model is reloaded.
                continue
            shutil.rmtree(os.path.join(root, ckpt_to_delete))
            print(f"Deleted checkpoint {ckpt_to_delete} to free up space")

In [None]:
!pip install tensorboard tensorboardX

In [None]:
print("Initializing Trainer...")
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    optimizers=(optimizer, lr_scheduler)
)

# Resume from the checkpoint directory with the highest step number.
# `get_last_checkpoint` parses the step from the directory name, whereas
# creation time can be wrong after a copy or restore, and a plain "checkpoint"
# name prefix can also match stray files. It returns None when no checkpoint
# exists; the isdir guard avoids listing a missing output directory.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
else:
    print("No checkpoint found, starting training from scratch")

In [None]:
# Run fine-tuning. `last_checkpoint` is None for a fresh run, otherwise the
# checkpoint directory selected when the Trainer was initialised.
print("Starting training...")
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Call `trainer.evaluate()` with no arguments and print the metrics it returns.
print("Evaluating the model...")
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

In [None]:
# Write the trained model and its tokenizer side by side into `save_path`.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this
# presumably stores only the LoRA adapter rather than full model weights —
# confirm that is the intended artifact.
save_path = "./text_guru_phase2"
print(f"Saving the model to {save_path}...")
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
def generate_with_cache(model, input_ids, past_key_values=None, max_length=100, do_sample=True):
    """Run ``model.generate`` with an optional key/value cache.

    Args:
        model: Object exposing a ``generate`` method.
        input_ids: Token ids forwarded as ``input_ids``.
        past_key_values: Optional cache forwarded unchanged to ``generate``.
        max_length: Forwarded as ``max_length``; defaults to the previously
            hard-coded 100.
        do_sample: Forwarded as ``do_sample``; defaults to the previously
            hard-coded ``True``.

    Returns:
        Whatever ``model.generate`` returns.
    """
    return model.generate(
        input_ids=input_ids,
        past_key_values=past_key_values,
        do_sample=do_sample,
        max_length=max_length,
    )

In [None]:
model_path = save_path
tokenizer_path = save_path

# Reload the saved artifacts from disk so the checks below exercise exactly
# what was written, not the in-memory training objects.
print("Loading saved model and tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # throws away the traceback needed to diagnose the failure.
    raise
else:
    print("Model and tokenizer loaded successfully!")

In [None]:
# Every training input was built as "<prefix>: <text>" with the prefixes
# "[fitness]" and "[code_feedback]", so the evaluation prompts carry the same
# task tag; untagged prompts would not match what the model was trained on.
test_texts = [
    "[fitness]: What are some practical steps I can take to improve my overall health and well-being?",
    "[fitness]: What are some effective strategies for staying hydrated during workouts?",
    "[code_feedback]: Create a nested loop to print every combination of numbers between 0-9, excluding any combination that contains the number 5. Additionally, exclude any combination that contains a repeating digit. Implement the solution without using any built-in functions or libraries to check for repeating digits.",
    "[code_feedback]: Suggest how to make the following code more readable and efficient using list comprehensions. def has_duplicates(arr): return any([arr[i] == arr[j] for i in range(len(arr)) for j in range(i + 1, len(arr))])",
]

# Expected answers, index-aligned with `test_texts`; used as references when
# printing examples and computing BLEU / ROUGE.
reference_texts = [
    "Develop a consistent exercise routine – Exercise is essential for physical and mental health. Aim for at least 30 minutes of physical activity five days a week. 2. Follow a healthy diet – Incorporate more fruits, vegetables, and whole grains into your diet while avoiding processed and fast foods. 3. Get enough sleep – Give your body time to rest and repair by getting the recommended seven to nine hours of sleep every night. 4. Practice relaxation techniques – Take a break to practice mindfulness, deep breathing, and other forms of relaxation to reduce stress and maintain emotional balance. 5. Talk",
    "Hydration during Exercise: Drink water before, during, and after workouts. Monitor urine color for hydration status. Include electrolytes for longer sessions. Dehydration impairs performance and recovery.",
    "Here is an example of a nested loop in Python to print every combination of numbers between 0-9, excluding any combination that contains the number 5 or repeating digits: ```python for i in range(10): # First digit for j in range(10): # Second digit for k in range(10): # Third digit # Checking for the conditions if i != 5 and j != 5 and k != 5 and i != j and i != k and j != k: print(i, j, k) ``` This code will generate and print every combination of three digits between 0-9 that do not contain the number 5 and do not have any repeating digits.",
    "One way to make the code more readable and efficient using list comprehensions is as follows: def has_duplicates(arr): return any(arr[i] == arr[j] for i in range(len(arr)) for j in range(i + 1, len(arr))) In this version, we removed the square brackets around the list comprehension. This is because we only need to check if any element in the list comprehension is True, so there's no need to create a list of booleans. This change also improves efficiency because it avoids creating an unnecessary list in memory. The any() function can directly evaluate the generator expression without having to create a list first.",
]

In [None]:
# Tokenize the prompts as one padded batch and move it to the model's device.
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True).to(model.device)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # The batch is padded to the longest prompt; without the mask the
        # encoder would attend to pad tokens of the shorter prompts.
        attention_mask=inputs["attention_mask"],
        max_length=200,
        num_beams=5,
        early_stopping=True,
    )

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Show each prompt next to its reference answer and the model's output.
for i, (inp, ref, out) in enumerate(zip(test_texts, reference_texts, decoded_outputs)):
    print(f"Example {i + 1}:")
    print(f"Input: {inp}")
    print(f"Reference: {ref}")
    print(f"Output: {out}")
    print()

In [None]:
!pip install evaluate sacrebleu nltk rouge-score

In [None]:
# BLEU via the `evaluate` "sacrebleu" metric. Each prediction is paired with a
# single-element reference list built from `reference_texts`.
import evaluate
print("Calculating BLEU score...")
bleu_metric = evaluate.load("sacrebleu")
bleu_score = bleu_metric.compute(predictions=decoded_outputs, references=[[ref] for ref in reference_texts])
print(f"BLEU Score: {bleu_score['score']}")

In [None]:
# ROUGE via the `evaluate` "rouge" metric, comparing the decoded outputs with
# `reference_texts` one-to-one.
# NOTE(review): the punkt download is presumably there for sentence splitting
# inside the ROUGE implementation — confirm it is actually required.
import nltk
nltk.download('punkt')
print("Calculating ROUGE scores...")
rouge_metric = evaluate.load("rouge")
rouge_scores = rouge_metric.compute(predictions=decoded_outputs, references=reference_texts)
print("ROUGE Scores:", rouge_scores)