# In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.optimization import AdamW
from tqdm import tqdm

# Pick the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the OPT tokenizer and causal-LM weights, then move the model to the device.
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load QA pairs from file (one pair per line: question[TAB]answer, or "question? answer")
def load_qa_dataset(file_path):
    """Read question/answer pairs from a text file and format them as prompts.

    Each line is stripped and then parsed in one of two ways:

    * if it contains a tab, the text before the first tab is the question and
      everything after it is the answer;
    * otherwise, if it contains "?", it is split at the first "?", which is
      kept at the end of the question.

    Lines with neither separator are skipped. Lines whose question or answer
    is empty after stripping are skipped as well, so no prompt with a blank
    question or a blank answer is ever produced.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of strings of the form "Question: <question> Answer: <answer>",
        in file order.
    """
    qa_pairs = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if "\t" in line:
                # Documented format: the tab is the authoritative separator,
                # so a "?" inside the answer cannot mis-split the line.
                question, _, answer = line.partition("\t")
                question = question.strip()
            elif "?" in line:
                # Fallback format: split only at the first "?".
                stem, _, answer = line.partition("?")
                stem = stem.strip()
                question = stem + "?" if stem else ""
            else:
                continue

            answer = answer.strip()
            if not question or not answer:
                # An incomplete pair would only add a degenerate training example.
                continue
            qa_pairs.append(f"Question: {question} Answer: {answer}")
    return qa_pairs


# Tokenize
def tokenize_function(text, max_length=256):
    """Encode ``text`` with the module-level ``tokenizer``.

    Args:
        text: The text to encode; passed to the tokenizer unchanged.
        max_length: Length that the encoding is truncated and padded to.

    Returns:
        Whatever the tokenizer returns when asked for PyTorch tensors
        (``return_tensors="pt"``) with truncation and ``"max_length"`` padding.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encoding_options)

# Dataset class
class QADataset(Dataset):
    """Pre-tokenized causal-LM training examples built from formatted QA strings.

    Every input string is tokenized once in ``__init__`` with
    ``tokenize_function``. Each stored example is a dict with the keys
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, where ``labels`` is
    a copy of ``input_ids`` with the padding positions set to -100.
    """

    def __init__(self, qa_lines):
        """Tokenize ``qa_lines`` (an iterable of strings) and cache the tensors."""
        self.examples = []
        for line in qa_lines:
            tokenized = tokenize_function(line)
            # The tokenizer is called on one string at a time; squeeze(0)
            # drops the leading batch dimension of size one.
            input_ids = tokenized["input_ids"].squeeze(0)
            attention_mask = tokenized["attention_mask"].squeeze(0)
            labels = input_ids.clone()
            # Mask padding by position (attention_mask == 0), not by token id.
            # When the pad token falls back to the EOS token, an id comparison
            # would also hide every genuine EOS token from the loss.
            labels[attention_mask == 0] = -100

            self.examples.append({
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            })

    def __len__(self):
        """Return the number of cached examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return example ``idx`` with every tensor moved to the global ``device``."""
        # The training loop feeds batches straight into the model, so the
        # tensors are placed on the target device here.
        return {k: v.to(device) for k, v in self.examples[idx].items()}

# Load the QA file and wrap it in a tokenized dataset.
file_path = "/content/sample_data/college_history.txt"  # One QA pair per line; load_qa_dataset defines the accepted line format
qa_lines = load_qa_dataset(file_path)
dataset = QADataset(qa_lines)

# 90/10 train/eval split: the train share is truncated to an int and the eval
# share receives whatever is left over.
train_size = int(len(dataset) * 0.9)
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, lengths=[train_size, eval_size])

# Only the training loader shuffles. NOTE(review): eval_loader is not consumed
# anywhere in the visible code.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=4, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tune for a fixed number of epochs, reporting the mean batch loss per epoch.
epochs = 6
model.train()
for epoch in range(epochs):
    progress_label = f"Epoch {epoch+1}/{epochs}"
    loop = tqdm(train_loader, desc=progress_label)
    total_loss = 0
    for batch in loop:
        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the bar.
        step_loss = loss.item()
        total_loss += step_loss
        loop.set_postfix(loss=step_loss)
    # Mean over the number of batches in the loader.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Avg Loss: {avg_loss:.4f}")

# Write the fine-tuned model and its tokenizer into one directory.
save_path = "/content/opt_collegebot"
os.makedirs(save_path, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f"✅ Model saved to {save_path}")

# Chatbot Function
def chat(question, max_length=100):
    """Sample an answer to ``question`` from the module-level ``model``.

    The question is wrapped in the "Question: ... Answer:" prompt, tokenized,
    moved to ``device`` and passed to ``model.generate`` with sampling enabled
    (top_p=0.9, temperature=0.7).

    Args:
        question: Text inserted into the prompt.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        NOTE(review): presumably this text still starts with the prompt,
        since the whole sequence is decoded -- confirm.

    Side effects:
        Switches the model to eval mode and does not switch it back.
    """
    model.eval()
    encoded_prompt = tokenizer(
        f"Question: {question} Answer:",
        return_tensors="pt",
        truncation=True,
    ).to(device)
    generation_options = {
        "max_length": max_length,
        "pad_token_id": tokenizer.eos_token_id,
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 0.7,
    }
    # No gradients are needed for generation.
    with torch.no_grad():
        generated = model.generate(**encoded_prompt, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Ask the fine-tuned model one sample question and print its reply.
user_question = "When was MANIT established?"
response = chat(user_question)
print("💬 Bot Response:", response, sep="\n")



# Save the model and tokenizer again. This repeats the earlier save step and
# writes to the same directory.
save_path = "/content/opt_collegebot"
os.makedirs(name=save_path, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f"✅ Model saved to {save_path}")

# Zip and download
# The archive is built with the standard library rather than the IPython-only
# "!zip" shell escape, so this step is valid Python outside a notebook too.
import shutil

from google.colab import files

# root_dir="/" plus the relative base_dir stores entries under
# "content/opt_collegebot/...", and the archive is written to
# /content/opt_collegebot.zip. The file is rebuilt from scratch on every run.
archive_path = shutil.make_archive(
    "/content/opt_collegebot",
    "zip",
    root_dir="/",
    base_dir="content/opt_collegebot",
)
files.download(archive_path)
