<a href="https://colab.research.google.com/github/BettyBopSleeping/jarv/blob/main/Zalo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import json
training_data = [
    {"prompt": "User: What is a resistor?\n", "completion": "Zalo: A resistor is a passive component that resists the flow of electric current. It’s used to control voltage and current in circuits.\n"},
{"prompt": "User: What's the purpose of a capacitor?\n", "completion": "Zalo: A capacitor stores electrical energy temporarily. It’s great for filtering signals or supplying short bursts of power.\n"},
{"prompt": "User: How does a diode work?\n", "completion": "Zalo: A diode only allows current to flow in one direction. It’s like a one-way valve for electricity.\n"},
{"prompt": "User: Can you explain what a transistor does?\n", "completion": "Zalo: A transistor acts like a switch or amplifier. It can control large currents with a small input signal.\n"},
{"prompt": "User: What’s the difference between AC and DC?\n", "completion": "Zalo: AC is alternating current—it changes direction. DC is direct current—it flows in one direction only. Think wall socket vs battery.\n"},
{"prompt": "User: What is voltage?\n", "completion": "Zalo: Voltage is the electric potential difference. It’s like the pressure that pushes electrons through a wire.\n"},
{"prompt": "User: What is current?\n", "completion": "Zalo: Current is the flow of electric charge. Measured in amps, it tells you how much electricity is moving.\n"},
{"prompt": "User: What is resistance?\n", "completion": "Zalo: Resistance opposes current flow. It's like friction in a pipe—more resistance, less current.\n"},
{"prompt": "User: What’s Ohm’s Law?\n", "completion": "Zalo: Ohm’s Law states that V = I × R. Voltage equals current times resistance. It's foundational in circuit analysis.\n"},
{"prompt": "User: What is an LED?\n", "completion": "Zalo: An LED is a Light Emitting Diode. It lights up when current flows through it in the right direction.\n"},
{"prompt": "User: What is a breadboard?\n", "completion": "Zalo: A breadboard is a reusable platform for testing circuits without soldering. It’s every inventor’s playground.\n"},


]

data_path = "/content/training_data.jsonl"
with open(data_path, "a", encoding="utf-8") as outfile:
    for entry in training_data:
        json.dump(entry, outfile, ensure_ascii=False)
        outfile.write("\n")

validation_data = [
    {"prompt": "User: What is a resistor?\n", "completion": "Zalo: A resistor is a passive component that resists the flow of electric current. It’s used to control voltage and current in circuits.\n"},
{"prompt": "User: What's the purpose of a capacitor?\n", "completion": "Zalo: A capacitor stores electrical energy temporarily. It’s great for filtering signals or supplying short bursts of power.\n"},
{"prompt": "User: How does a diode work?\n", "completion": "Zalo: A diode only allows current to flow in one direction. It’s like a one-way valve for electricity.\n"},
{"prompt": "User: Can you explain what a transistor does?\n", "completion": "Zalo: A transistor acts like a switch or amplifier. It can control large currents with a small input signal.\n"},
{"prompt": "User: What’s the difference between AC and DC?\n", "completion": "Zalo: AC is alternating current—it changes direction. DC is direct current—it flows in one direction only. Think wall socket vs battery.\n"},
{"prompt": "User: What is voltage?\n", "completion": "Zalo: Voltage is the electric potential difference. It’s like the pressure that pushes electrons through a wire.\n"},
{"prompt": "User: What is current?\n", "completion": "Zalo: Current is the flow of electric charge. Measured in amps, it tells you how much electricity is moving.\n"},
{"prompt": "User: What is resistance?\n", "completion": "Zalo: Resistance opposes current flow. It's like friction in a pipe—more resistance, less current.\n"},
{"prompt": "User: What’s Ohm’s Law?\n", "completion": "Zalo: Ohm’s Law states that V = I × R. Voltage equals current times resistance. It's foundational in circuit analysis.\n"},
{"prompt": "User: What is an LED?\n", "completion": "Zalo: An LED is a Light Emitting Diode. It lights up when current flows through it in the right direction.\n"},
{"prompt": "User: What is a breadboard?\n", "completion": "Zalo: A breadboard is a reusable platform for testing circuits without soldering. It’s every inventor’s playground.\n"},


]

data_path = "/content/validation_data.jsonl"
with open(data_path, "a", encoding="utf-8") as outfile:
    for entry in validation_data:
        json.dump(entry, outfile, ensure_ascii=False)
        outfile.write("\n")

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
import json
import torch
from torch.utils.data import Dataset
import numpy as np
import nltk
from nltk.corpus import wordnet
import random
import os
from tqdm import tqdm

# Download wordnet for data augmentation
nltk.download('wordnet')

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()  # For language modeling, labels are the same as inputs
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

# Simple word replacement augmentation function
def augment_text(text, prob=0.1):
    words = text.split()
    result = []

    for word in words:
        if random.random() < prob and len(word) > 3:  # Only replace some words
            synonyms = []
            for syn in wordnet.synsets(word):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())

            if synonyms:
                result.append(random.choice(synonyms))
            else:
                result.append(word)
        else:
            result.append(word)

    return ' '.join(result)

# Create output directories
os.makedirs('./results', exist_ok=True)
os.makedirs('./logs', exist_ok=True)

# Load pre-trained GPT-2 model and tokenizer
print("Loading pre-trained model and tokenizer...")
model_name = 'gpt2'  # You can change this to 'gpt2-medium' for better results if you have enough resources
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Ensure the tokenizer has the pad token set
tokenizer.pad_token = tokenizer.eos_token

print("Loading training and validation data...")
# Load the training data
with open('/content/training_data.jsonl', 'r') as file:
    cleaned_data = [json.loads(line) for line in file]

# Load the validation data
with open('/content/validation_data.jsonl', 'r') as file:
    validation_data = [json.loads(line) for line in file]

# Extract and format prompts from training data
train_prompts = [f"<{entry['prompt']}" for entry in cleaned_data if "prompt" in entry]

# Extract and format prompts from validation data
validation_prompts = [f"{entry['prompt']}" for entry in validation_data if "prompt" in entry]

# Data augmentation for training set
print("Performing data augmentation...")
augmented_prompts = []
for prompt in tqdm(train_prompts):
    # Add original prompt
    augmented_prompts.append(prompt)
    # Add augmented version
    try:
        augmented_prompts.append(augment_text(prompt))
    except:
        pass  # Skip if augmentation fails

# Use augmented data for training
train_prompts = augmented_prompts
print(f"Training data size after augmentation: {len(train_prompts)}")

# Tokenize training data
print("Tokenizing data...")
train_encodings = tokenizer(
    train_prompts,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='np',
    return_attention_mask=True
)

# Tokenize validation data
validation_encodings = tokenizer(
    validation_prompts,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='np',
    return_attention_mask=True
)

# Create proper dataset objects
train_dataset = TextDataset(train_encodings)
validation_dataset = TextDataset(validation_encodings)

# Define training arguments with improved parameters
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",      # Evaluate during training
    eval_steps=100,                   # Evaluate every 100 steps
    learning_rate=2e-5,               # Standard learning rate for fine-tuning
    per_device_train_batch_size=4,    # Batch size per device during training
    gradient_accumulation_steps=4,    # Simulate larger batch sizes
    per_device_eval_batch_size=4,     # Batch size for evaluation
    weight_decay=0.01,                # Weight decay for regularization
    num_train_epochs=5,               # Number of training epochs
    logging_dir='./logs',             # Directory for logs
    logging_steps=10,                 # Log every 10 steps
    save_steps=100,                   # Save checkpoint every 100 steps
    save_total_limit=3,               # Keep only the 3 best checkpoints
    load_best_model_at_end=True,      # Load the best model at the end
    metric_for_best_model="eval_loss", # Use loss as the metric
    greater_is_better=False,          # Lower loss is better
    warmup_ratio=0.1,                 # Warm up over 10% of training
    fp16=True,                        # Use mixed precision if available
    report_to="tensorboard",          # Report to TensorBoard
)

# Add early stopping
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Prepare the Trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    callbacks=[early_stopping_callback],
)

# Start training
print("Starting training...")
trainer.train()

# Evaluate the model
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the trained model and tokenizer
print("Saving the fine-tuned model and tokenizer...")
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')
print("Fine-tuning complete! Model and tokenizer saved to ./final_model")

def generate_text(prompt, max_length=100):
    # Get the device the model is on
    device = model.device

    # Encode and move to the same device as the model
    input_ids = tokenizer.encode(f"{prompt}", return_tensors="pt").to(device)

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,  # Prevent repeating 3-grams
        repetition_penalty=1.5
    )

    # Decode the generated text
    generated_text = tokenizer.decode(output[0])
    return generated_text

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.3,
        top_k=49,
        top_p=0.98,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Test the model with a few examples
print("\nTesting the fine-tuned model with example prompts:")
test_prompts = [
    "Who made Ohm's Law?",
    "What is a transistor",
    "What is a LED?"
]

for prompt in test_prompts:
    generated = generate_text(prompt)
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {generated}")
    print("-" * 50)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading pre-trained model and tokenizer...
Loading training and validation data...
Performing data augmentation...


100%|██████████| 22/22 [00:08<00:00,  2.62it/s]


Training data size after augmentation: 44
Tokenizing data...
Setting up training arguments...
Initializing trainer...
Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
