In [4]:
print(dataset['train'][0].keys())

dict_keys(['conversation_id', 'turn_id', 'speaker', 'text'])


In [None]:
import os
os.environ['HF_TOKEN'] = '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'

# Install required libraries
!pip install transformers datasets torch accelerate

# Import necessary libraries
import torch
from datasets import load_dataset
from transformers import pipeline, Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator

# Load the dataset from Hugging Face
dataset = load_dataset("DigiRonin/testravenconv")

# Split the dataset into train and validation sets
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset['validation'] = dataset.pop('test')

# Load DialoGPT model and tokenizer
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    # Access the 'text' field instead of 'question' and 'context'
    texts = [t.strip() if t is not None else "" for t in examples['text']]

    # Tokenize the inputs using the DialoGPT tokenizer
    inputs = tokenizer(
        texts,
        truncation=True,
        max_length=128,  # Adjust max_length as needed
        padding='max_length'
    )

    # The labels are the same as the input_ids for causal language modeling
    inputs['labels'] = inputs['input_ids'].copy()

    return inputs

# Preprocess the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Display the first few tokenized examples (Optional: for debugging)
for i in range(5):
    print(tokenized_dataset['train'][i])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

# Accelerate the training process
accelerator = Accelerator()
model, trainer, training_args = accelerator.prepare(model, trainer, training_args)

# Train the model
trainer.train()

# Function to generate a response using DialoGPT
def generate_response(user_input, chat_history_ids=None):
    # Add a persona to the conversation
    persona = "heyhey I'm Raven."
    new_user_input_ids = tokenizer.encode(persona + user_input + tokenizer.eos_token, return_tensors='pt')

    # Append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if chat_history_ids is not None else new_user_input_ids

    # Generate a response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )

    # Decode the generated response
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response, chat_history_ids

# Interactive loop
chat_history_ids = None

print("You can start chatting with Raven. Type 'exit' or 'quit' to end the conversation.")
while True:
    try:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Goodbye!")
            break

        # Generate a response using DialoGPT
        response, chat_history_ids = generate_response(user_input, chat_history_ids)
        print(f"Raven: {response}")
    except Exception as e:
        print(f"An error occurred: {e}. Please try again.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'input_ids': [27991, 1472, 290, 20681, 12, 12463, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdigironin[0m ([33mdigironin-co[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
