In [4]:
!pip install torch torchvision torchaudio
!pip install transformers datasets
!pip install gradio
!pip install accelerate sentencepiece

import gradio as gr
import torch
# Importing AdamW from torch.optim instead of transformers
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm  # For progress bar
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)
[0mInstalling collected packages: nvidia-cusparse-cu12
[0mSuccessfully installed nvidia-cusparse-cu12
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=2.0.0->accelerate)
  Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)
[0mInstalling collected packages: nvidia-cusparse-cu12
[0mSuccessfully installed nvidia-cusparse-cu12


In [5]:
# Initialize the model and tokenizer
model_name = "gpt2-large"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token, but ChatDataset pads every example to
# max_length. Reuse EOS as the pad token so that padding does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Dataset class for chat history
class ChatDataset(Dataset):
    """Map-style dataset that tokenizes one conversation string per item.

    Args:
        conversations: Indexable, sized collection of conversation strings.
        max_length: Passed to the tokenizer as both the truncation limit and
            the ``"max_length"`` padding target.
        tokenizer: Optional tokenizer to encode with. When omitted, the
            module-level ``tokenizer`` is looked up the first time an item is
            requested.
    """

    def __init__(self, conversations, max_length=512, tokenizer=None):
        self.conversations = conversations
        self.max_length = max_length
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of conversations."""
        return len(self.conversations)

    def _resolve_tokenizer(self):
        """Return the tokenizer to use, making sure it is able to pad."""
        # Inside this method the bare name refers to the module-level tokenizer.
        active = self._tokenizer if self._tokenizer is not None else tokenizer
        # padding="max_length" requires a pad token; fall back to EOS for
        # tokenizers (such as GPT-2's) that do not define one.
        if active.pad_token is None:
            active.pad_token = active.eos_token
        return active

    def __getitem__(self, idx):
        """Return the encoded ids of conversation ``idx`` with the batch axis removed."""
        input_text = self.conversations[idx]
        encoded = self._resolve_tokenizer().encode(
            input_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # encode(..., return_tensors="pt") adds a leading batch axis; drop it so
        # the DataLoader can stack items into a batch itself.
        return encoded.squeeze(0)

# Train function with gradient clipping, learning rate scheduler, and loss logging
def train_model(conversations, epochs=3, batch_size=2, lr=5e-5, save_path="trained_chatbot"):
    """Fine-tune the module-level ``model`` on ``conversations`` and save it.

    Args:
        conversations: Sized collection of training strings, wrapped in a
            ``ChatDataset``.
        epochs: Number of full passes over the data.
        batch_size: Number of examples per optimizer step.
        lr: Initial AdamW learning rate.
        save_path: Directory that receives the model and tokenizer files.

    Raises:
        ValueError: If ``conversations`` is empty.

    NOTE(review): ``labels=batch`` means padded positions contribute to the
    loss; consider masking them if examples are much shorter than the padding
    length.
    """
    if len(conversations) == 0:
        raise ValueError("conversations must contain at least one training example")

    dataset = ChatDataset(conversations)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Linear learning-rate decay over every optimizer step, without warmup.
    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}"):
                batch = batch.to(device)
                optimizer.zero_grad()

                # Forward pass; the model computes the language-modeling loss
                # itself when labels are supplied.
                outputs = model(batch, labels=batch)
                loss = outputs.loss
                loss.backward()

                # Gradient clipping to avoid exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)
            print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")
    finally:
        # Leave the shared model in inference mode so that later generation
        # does not run with dropout enabled, even if training was interrupted.
        model.eval()

    # Save the model and tokenizer
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Training complete and model saved to {save_path}.")

# Function to generate chatbot responses
def chatbot(input_text):
    """Generate a reply to ``input_text`` with the module-level model.

    Args:
        input_text: The user's message.

    Returns:
        The newly generated text only (the prompt is not echoed back), with
        surrounding whitespace stripped. Returns an empty string when the
        input is ``None``, empty, or only whitespace.
    """
    # An empty prompt would encode to a zero-length tensor; skip generation.
    if input_text is None or not input_text.strip():
        return ""

    max_new_tokens = 150
    # Keep prompt + continuation inside the model's context window by dropping
    # the oldest prompt tokens when the input is too long. 1024 is GPT-2's
    # context size and is only used if the config does not report one.
    context_window = getattr(model.config, "max_position_embeddings", 1024)
    max_prompt_tokens = max(1, context_window - max_new_tokens)

    encoded = tokenizer(input_text, return_tensors="pt")
    input_ids = encoded["input_ids"][:, -max_prompt_tokens:].to(device)
    attention_mask = encoded["attention_mask"][:, -max_prompt_tokens:].to(device)

    output = model.generate(
        input_ids=input_ids,
        # Passed explicitly: pad and EOS share an id here, so generate()
        # cannot infer the mask on its own.
        attention_mask=attention_mask,
        # Budget counts generated tokens only, so long prompts still get a reply.
        max_new_tokens=max_new_tokens,
        temperature=0.1,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,  # Avoids repeated n-grams
        do_sample=True,  # Sample instead of greedy decoding
    )

    # generate() returns prompt + continuation; decode only the continuation.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Build the text-in / text-out Gradio UI around chatbot() and serve it through
# a public share link.
gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    live=False,
).launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://720ec8975109e8bde5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


