## Another approach

In [1]:
!pip install transformers
!pip install torch
!pip install pandas



In [2]:
import pandas as pd
import re
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [3]:
# Load the Dataset
file_path = 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv'
data = pd.read_csv(file_path)

# Inspect the dataset
data.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [4]:
# Clean Text Data
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    return text.strip()

# Apply cleaning to 'instruction' and 'response' columns
data['instruction'] = data['instruction'].apply(clean_text)
data['response'] = data['response'].apply(clean_text)

# Handle missing values
data.dropna(subset=['instruction', 'response'], inplace=True)

In [5]:
# Prepare Data for Fine-Tuning
# Concatenate 'instruction' and 'response' for fine-tuning as input-output pairs
data['input_output'] = data['instruction'] + ' ' + data['response']

# Use a smaller subset of the dataset for quicker training
data_subset = data.sample(frac=0.1, random_state=42)

# Save the processed data to a text file
processed_data_path = 'processed_data.txt'
with open(processed_data_path, 'w') as f:
    for line in data_subset['input_output']:
        f.write(line + '\n')

In [6]:
# Prepare Data for Fine-Tuning
def load_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128)

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False)

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

train_dataset = load_dataset(processed_data_path, tokenizer)
data_collator = create_data_collator(tokenizer)



In [7]:
# Load Dataset and Data Collator
def load_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128)

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False)

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

train_dataset = load_dataset(processed_data_path, tokenizer)
data_collator = create_data_collator(tokenizer)

In [8]:
# Fine-Tune the GPT-2 Model
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    # fp16=True,  # Disabled mixed precision training
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

print("Fine-tuning complete and model saved.")

Step,Training Loss
500,2.2579
1000,1.2728
1500,1.0238
2000,0.8865
2500,0.82
3000,0.7582


Fine-tuning complete and model saved.


In [22]:
# Generate Responses Using the Fine-Tuned Model
def generate_response(query, model, tokenizer, max_length=50):
    inputs = tokenizer.encode(query, return_tensors='pt').to(model.device)  # Move inputs to the same device as the model
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [33]:
# Integrate with a Chat Interface (simple text-based interface for demo)
def chat():
    print("Welcome to the Customer Service Chatbot. Type 'exit' to end the conversation.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break
        response = generate_response(user_input, model, tokenizer)
        print(f"Chatbot: {response}")

# Run the chat interface
chat()

Welcome to the Customer Service Chatbot. Type 'exit' to end the conversation.
You: exit
Chatbot: Goodbye!


In [24]:
# Load the Dataset
eval_file_path = 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv'
eval_data = pd.read_csv(eval_file_path)

# Clean Text Data
eval_data['instruction'] = eval_data['instruction'].apply(clean_text)
eval_data['response'] = eval_data['response'].apply(clean_text)

# Handle missing values
eval_data.dropna(subset=['instruction', 'response'], inplace=True)

# Concatenate 'instruction' and 'response' for fine-tuning as input-output pairs
eval_data['input_output'] = eval_data['instruction'] + ' ' + eval_data['response']

# Use a smaller subset of the dataset for quicker evaluation
eval_data_subset = eval_data.sample(frac=0.1, random_state=42)

# Save the processed data to a text file
eval_processed_data_path = 'eval_processed_data.txt'
with open(eval_processed_data_path, 'w') as f:
    for line in eval_data_subset['input_output']:
        f.write(line + '\n')

In [25]:
# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_gpt2'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Load Dataset and Data Collator
eval_dataset = load_dataset(eval_processed_data_path, tokenizer)
eval_data_collator = create_data_collator(tokenizer)

# Create the Trainer for evaluation
eval_trainer = Trainer(
    model=model,
    data_collator=eval_data_collator,
    eval_dataset=eval_dataset,
)

# Evaluate the model
eval_results = eval_trainer.evaluate()

# Calculate perplexity
eval_perplexity = torch.exp(torch.tensor(eval_results['eval_loss']))

print("Evaluation Results:", eval_results)
print("Evaluation Perplexity:", eval_perplexity.item())



Evaluation Results: {'eval_loss': 4.970292091369629, 'eval_runtime': 44.941, 'eval_samples_per_second': 54.939, 'eval_steps_per_second': 6.876}
Evaluation Perplexity: 144.0689697265625


In [27]:
eval_results

{'eval_loss': 4.970292091369629,
 'eval_runtime': 44.941,
 'eval_samples_per_second': 54.939,
 'eval_steps_per_second': 6.876}

In [28]:
eval_perplexity.item()

144.0689697265625

## A different approach with Tf-idf Vectorization

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
file_path = 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv'
dataset = pd.read_csv(file_path)

# Preprocess the instructions by fitting a TF-IDF vectorizer
vectorizer = TfidfVectorizer().fit(dataset['instruction'])

# Transform the instructions into TF-IDF vectors
instruction_vectors = vectorizer.transform(dataset['instruction'])

# Function to find the best matching response for a given user query
def generate_response(user_query):
    # Transform the user query into a TF-IDF vector
    query_vector = vectorizer.transform([user_query])
    
    # Compute cosine similarity between the user query vector and all instruction vectors
    similarities = cosine_similarity(query_vector, instruction_vectors).flatten()
    
    # Find the index of the most similar instruction
    best_match_index = similarities.argmax()
    
    # Return the corresponding response
    return dataset.iloc[best_match_index]['response']

# Main loop for the chatbot
def chatbot_main():
    print("Hello! How can I assist you today?")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye! Have a great day!")
            break
        response = generate_response(user_query)
        print(f"Chatbot: {response}")

# Start the chatbot
chatbot_main()

Hello! How can I assist you today?
You: I need to cancel purchase
Chatbot: I understood that you need assistance with canceling your purchase with the order number {{Order Number}}. We apologize for any inconvenience this may have caused. To initiate the cancellation process, please follow these steps:

1. Sign in to your account: Visit our {{Online Company Portal Info}} and login using your credentials.
2. Access your order history: Once logged in, navigate to the "{{Online Order Interaction}}" or "{{Online Order Interaction}}" section.
3. Locate the purchase: Look for the order with the specific order number {{Order Number}}.
4. Cancel the purchase: Click on the "{{Online Order Interaction}}" option associated with the purchase.
5. Confirm the cancellation: Follow any further prompts or questions from the system to finalize the cancellation.

If you encounter any difficulties or have further questions, our dedicated team is available to assist you. You can reach us during {{Customer 

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
file_path = 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv'
dataset = pd.read_csv(file_path)

# Preprocess the instructions by fitting a TF-IDF vectorizer
vectorizer = TfidfVectorizer().fit(dataset['instruction'])

# Transform the instructions into TF-IDF vectors
instruction_vectors = vectorizer.transform(dataset['instruction'])

# Function to find the best matching response for a given user query
def generate_response(user_query):
    # Transform the user query into a TF-IDF vector
    query_vector = vectorizer.transform([user_query])
    
    # Compute cosine similarity between the user query vector and all instruction vectors
    similarities = cosine_similarity(query_vector, instruction_vectors).flatten()
    
    # Find the index of the most similar instruction
    best_match_index = similarities.argmax()
    
    # Return the corresponding response
    return dataset.iloc[best_match_index]['response']

# Main loop for the chatbot
def chatbot_main():
    print("Hello! How can I assist you today?")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye! Have a great day!")
            break
        response = generate_response(user_query)
        print(f"Chatbot: {response}")

# Start the chatbot
chatbot_main()

Hello! How can I assist you today?
You: i need to cancel my order
Chatbot: I've taken note that you would like to cancel your order with the number {{Order Number}}. I apologize for any inconvenience this may have caused.

To proceed with the cancellation, please follow these steps:

1. Log into your {{Online Company Portal Info}} using your credentials.
2. Navigate to the '{{Online Order Interaction}}' or '{{Online Order Interaction}}' section.
3. Locate the order with the number {{Order Number}} and click on it for more details.
4. Look for the option labeled '{{Online Order Interaction}}' and select it.
5. Confirm the cancellation if prompted by the system.

If you encounter any issues or have further questions, our dedicated team is ready to assist you. Please reach out during {{Customer Support Hours}} at {{Customer Support Phone Number}} or through the Live Chat feature on our {{Website URL}}.

Your satisfaction is important to us, and we appreciate your understanding.
You: I nee