In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:



import os
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from huggingface_hub import login
import torch
from torch.utils.data import Dataset, DataLoader
import wandb

# Set your key

wandb.login(key='Replace_Key') # Weights & Biases (wandb) for experiment tracking and uses environment variables for API key management

API_KEY = "Replace_Key"  # Replace with a temporary key for Colab
os.environ["API_KEY"] = API_KEY  # Set environment variable for the session

# Optionally log in to Hugging Face (if not done globally)
# login()

# Load the MultiWOZ v22 dataset and select a sample (e.g., 100 samples)
dataset = load_dataset("multi_woz_v22")

# Select a smaller sample for testing (e.g., 100 samples)
sample_size = 100
train_sample = dataset['train'].select(range(sample_size))
valid_sample = dataset['validation'].select(range(20))  # Select 20 samples for validation

# Print an example dialogue to see the format
print(train_sample[0])  # View the first conversation

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-small')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define the preprocessing function
def preprocess_function(examples):
    """
    Preprocesses and tokenizes the conversation data.
    Turns each dialogue in the multi-woz dataset into a "Customer: ... Support: ..." format.

    Args:
        examples: A dictionary containing dialogue information.

    Returns:
        A dictionary containing tokenized input IDs, attention masks, and labels.
    """
    # We assume that 'turns' contains the conversation turns with the speaker and utterance
    concatenated_examples = []

    for dialogue in examples['turns']:
        dialogue_text = []
        for speaker, utterance in zip(dialogue['speaker'], dialogue['utterance']):
            if speaker == 0:  # User (0 is usually the user in this dataset)
                dialogue_text.append(f"Customer: {utterance}")
            else:  # Agent (1 is usually the agent in this dataset)
                dialogue_text.append(f"Support: {utterance}")

        # Join all the turns for the conversation into one string
        concatenated_examples.append(" ".join(dialogue_text))

    return tokenizer(concatenated_examples, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

# Preprocess the sample dataset
tokenized_inputs = train_sample.map(preprocess_function, batched=True, remove_columns=train_sample.column_names)
valid_tokenized_inputs = valid_sample.map(preprocess_function, batched=True, remove_columns=valid_sample.column_names)

# Creating a validation set from the training dataset (90% for training, 10% for validation)
train_size = int(0.9 * len(tokenized_inputs))  # 90% for training
train_dataset = tokenized_inputs.select(range(train_size))  # First 90%
valid_dataset = tokenized_inputs.select(range(train_size, len(tokenized_inputs)))  # Last 10%

class ConversationDataset(Dataset):
    """
    Custom dataset class to handle tokenized conversation data.
    """
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __getitem__(self, idx):
        """
        Returns a data sample for a given index.
        """
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx]  # Labels are the same as input_ids for language modeling
        }

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.input_ids)

# Create dataset objects for training and validation
train_data = ConversationDataset(train_dataset)
valid_data = ConversationDataset(valid_dataset)

# Load the pre-trained model (DialoGPT-small)
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for checkpoints and models
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size per device during training
    per_device_eval_batch_size=2,    # Batch size per device during evaluation
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
)

# Create the Trainer
trainer = Trainer(
    model=model,                         # The pre-trained model to fine-tune
    args=training_args,                  # Training arguments
    train_dataset=train_data,            # Training dataset
    eval_dataset=valid_data             # Validation dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_pretrained("./customer_support_chatbot")
tokenizer.save_pretrained("./customer_support_chatbot")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manvcse2007[0m ([33manvcse2007-university-of-illinois-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

multi_woz_v22.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'dialogue_id': 'PMUL4398.json', 'services': ['restaurant', 'hotel'], 'turns': {'turn_id': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'speaker': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 'utterance': ['i need a place to dine in the center thats expensive', 'I have several options for you; do you prefer African, Asian, or British food?', 'Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?', 'There is an Afrian place named Bedouin in the centre. How does that sound?', 'Sounds good, could I get that phone number? Also, could you recommend me an expensive hotel?', "Bedouin's phone is 01223367660. As far as hotels go, I recommend the University Arms Hotel in the center of town.", 'Yes. Can you book it for me?', 'Sure, when would you like that reservation?', 'i want to book it for 2 people and 2 nights starting from saturday.', 'Your booking was successful. Your reference number is FRGZWQL2 . May I help you

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,2.1466,1.147588
2,1.854,1.006987
3,1.2573,0.975803


AttributeError: 'Trainer' object has no attribute 'save_pretrained'

In [3]:
model.save_pretrained("./customer_support_chatbot")
tokenizer.save_pretrained("./customer_support_chatbot")

('./customer_support_chatbot/tokenizer_config.json',
 './customer_support_chatbot/special_tokens_map.json',
 './customer_support_chatbot/vocab.json',
 './customer_support_chatbot/merges.txt',
 './customer_support_chatbot/added_tokens.json')

In [5]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
model_path = './customer_support_chatbot'  # Path to your fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Function to generate text based on a prompt
def generate_response(prompt, max_length=50):
    """
    Generates a response based on the given prompt using the fine-tuned chatbot model.

    Args:
        prompt: The input prompt for the chatbot.
        max_length: The maximum length of the generated response.

    Returns:
        The generated response text.
    """
    # Encode the prompt and create attention mask
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Generate output
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask, # Pass the attention mask here
        max_length=max_length + len(input_ids[0]),
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        repetition_penalty=2.5,
        top_p=0.92,
        temperature=0.85,
        do_sample=True,
        top_k=125,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and return the output text
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage for customer support
prompt = "Customer: I'm having trouble logging into my account. Support:"
response = generate_response(prompt)
print(response)

prompt = "Customer: My order hasn't arrived yet. Support:"
response = generate_response(prompt)
print(response)

Customer: I'm having trouble logging into my account. Support: There is no problem with the password and address information? Customer : It has been successful, thank you very much for your help!
Customer: My order hasn't arrived yet. Support: I'd like to know the address of your nearest restaurant please! Customer : Does it have free parking for 3 people and a car? Is there any type or price range you would recommend seeing on that day, in particular? No entry fee. Phone
