In [5]:
# @title 1. Setup: Installing Necessary Libraries
# ===============================================
# We need 'sentence-transformers' for the models, 'datasets' to easily
# download data from the Hugging Face Hub, and 'accelerate' to help
# with training efficiency.

!pip install sentence-transformers datasets accelerate -q

print("✅ Installation complete.")

# @title 2. The Dataset: Finding and Loading Free Customer Service Data
# =======================================================================
# A good model needs good data. We will load a local CSV file
# from a specified Kaggle input directory.

from datasets import Dataset
import pandas as pd

# Load the dataset from the local CSV file path
try:
    file_path = '/content/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv'
    df = pd.read_csv(file_path)

    # Convert the pandas DataFrame to a Hugging Face Dataset object for compatibility
    dataset = Dataset.from_pandas(df)

    print("✅ Dataset loaded successfully from CSV!")

    # Let's see the structure of the loaded data
    print("\nDataset structure:")
    print(dataset)

    # Look at a few examples to understand the data
    print("\nSample examples from the dataset:")
    print(df.head())

except Exception as e:
    print(f"❌ Failed to load dataset from CSV. Error: {e}")
    print("Please ensure the file path is correct and the CSV file exists.")


# @title 3. Data Preprocessing: Creating Training Pairs
# =======================================================
# The goal of fine-tuning here is to teach the model that different questions
# ('instruction') with the same 'intent' should be semantically close to each other.
# We will create pairs of instructions that share the same intent. The model will
# learn to map these pairs to similar vector representations.

from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from collections import defaultdict
import random

# This cell will only run if the dataset was loaded successfully
if 'dataset' in locals():
    # Group instructions by their intent
    intent_groups = defaultdict(list)
    # The new dataset from CSV doesn't have a 'train' split, so we iterate directly
    for example in dataset:
        # The CSV uses 'intent' and 'instruction' columns
        intent_groups[example['intent']].append(example['instruction'])

    print(f"Found {len(intent_groups)} unique intents.")

    # Create training examples (pairs of sentences with the same intent)
    train_examples = []
    # We'll limit the number of pairs to make training faster on Colab.
    # You can increase this for better performance.
    NUM_TRAINING_PAIRS = 15000

    for _ in range(NUM_TRAINING_PAIRS):
        # Pick a random intent
        intent = random.choice(list(intent_groups.keys()))

        # Ensure the intent has at least two instructions to form a pair
        if len(intent_groups[intent]) > 1:
            # Pick two different random instructions from that intent
            sent1, sent2 = random.sample(intent_groups[intent], 2)
            train_examples.append(InputExample(texts=[sent1, sent2], label=1.0))

    print(f"\nCreated {len(train_examples)} training pairs.")
    if train_examples:
        print("Example training pair:")
        print(f"  Sentence 1: '{train_examples[0].texts[0]}'")
        print(f"  Sentence 2: '{train_examples[0].texts[1]}'")
        print(f"  Label: {train_examples[0].label} (1.0 means they are similar)")
else:
    print("Skipping preprocessing as dataset failed to load.")


# @title 4. Model and Training Setup
# ===================================
# We'll load a pre-trained sentence-transformer model. 'all-MiniLM-L6-v2' is an
# excellent starting point—it's small, fast, and performs well.
# Then, we define a 'loss function'. This tells the model how to adjust its
# weights during training. CosineSimilarityLoss is perfect for our task, as it
# encourages the model to produce high cosine similarity scores for our sentence pairs.

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# This cell will only run if preprocessing was successful
if 'train_examples' in locals() and train_examples:
    # Load a pre-trained model
    model_name = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name)

    # Define the loss function
    # This loss function works on sentence pairs and aims to make their embeddings
    # have a high cosine similarity.
    train_loss = losses.CosineSimilarityLoss(model)

    # Create a DataLoader to batch our training examples
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    print("✅ Model and Loss Function are ready.")
else:
    print("Skipping model setup as there are no training examples.")


# @title 5. Fine-Tuning the Model
# ================================
# This is where the actual training happens. We'll train for a small number of
# epochs. An epoch is one full pass through the entire training dataset.
# On a free Google Colab T4 GPU, this should take a few minutes.

import os

# This cell will only run if the model and dataloader are ready
if 'model' in locals() and 'train_dataloader' in locals():
    # Disable W&B logging to prevent the API key prompt
    os.environ["WANDB_DISABLED"] = "true"

    # Define training arguments
    # We specify the output directory to save the trained model
    output_dir = "models/customer-service-bot-finetuned"
    # Increased epochs for more thorough training
    num_train_epochs = 15
    warmup_steps = int(len(train_dataloader) * num_train_epochs * 0.1) # 10% of train data for warm-up

    print(f"Training for {num_train_epochs} epoch(s). This will take some time.")
    print(f"Warmup steps: {warmup_steps}")

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_train_epochs,
              warmup_steps=warmup_steps,
              output_path=output_dir,
              show_progress_bar=True)

    print(f"✅ Training complete. Model saved to '{output_dir}'.")
else:
    print("Skipping training as model setup failed.")


# @title 6. Using the Fine-Tuned Model for Inference
# ===================================================
# Now for the fun part! Let's use our newly specialized model.
# We will build a simple knowledge base from our original dataset and then
# use the model to find the best answer for a new user query.

from sentence_transformers import util
import torch

# This cell will only run if training was completed
if 'output_dir' in locals() and os.path.exists(output_dir):
    # Load the fine-tuned model from the directory we saved it in
    finetuned_model = SentenceTransformer(output_dir)
    print("Loaded fine-tuned model.")

    # Create a knowledge base of questions and their corresponding answers
    # We'll use the original dataframe for this, which has the correct columns.
    kb_questions = df['instruction'].tolist()
    kb_answers = df['response'].tolist()

    # Encode all knowledge base questions. This is done once.
    # The model will convert each question into a numerical vector (embedding).
    print("\nEncoding knowledge base... (This may take a moment)")
    kb_embeddings = finetuned_model.encode(kb_questions, convert_to_tensor=True, show_progress_bar=True)
    print("✅ Knowledge base encoded.")

    def get_best_answer(query):
        """
        This function takes a user query, finds the most relevant question
        in our knowledge base using our fine-tuned model, and returns the answer.
        """
        # 1. Encode the user's query
        query_embedding = finetuned_model.encode(query, convert_to_tensor=True)

        # 2. Use semantic_search to find the top N most similar questions
        # It performs a cosine similarity search between the query and the knowledge base
        hits = util.semantic_search(query_embedding, kb_embeddings, top_k=1)

        # The result is a list of lists, since we can search for multiple queries
        hit = hits[0][0]

        best_match_score = hit['score']
        best_match_index = hit['corpus_id']

        # 3. Return the best matching question and its answer
        return {
            'best_match': kb_questions[best_match_index],
            'answer': kb_answers[best_match_index],
            'score': best_match_score
        }

    # --- Let's test it! ---
    print("\n--- Testing the fine-tuned bot ---")
    user_query = "how long does it take to get my stuff?"

    result = get_best_answer(user_query)

    print(f"\nYour Query: '{user_query}'")
    print(f"Most Similar Question in KB: '{result['best_match']}' (Score: {result['score']:.4f})")
    print(f"Bot's Answer: '{result['answer']}'")

    print("\n--- Another Test ---")
    user_query = "i want to send my item back for a refund"
    result = get_best_answer(user_query)

    print(f"\nYour Query: '{user_query}'")
    print(f"Most Similar Question in KB: '{result['best_match']}' (Score: {result['score']:.4f})")
    print(f"Bot's Answer: '{result['answer']}'")
else:
    print("\nSkipping inference step because the model was not trained.")


✅ Installation complete.
✅ Dataset loaded successfully from CSV!

Dataset structure:
Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 26872
})

Sample examples from the dataset:
   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order, cancel purchase {{...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can.

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0778
1000,0.0006
1500,0.0002
2000,0.0001
2500,0.0001
3000,0.0001
3500,0.0001
4000,0.0
4500,0.0
5000,0.0


✅ Training complete. Model saved to 'models/customer-service-bot-finetuned'.
Loaded fine-tuned model.

Encoding knowledge base... (This may take a moment)


Batches:   0%|          | 0/840 [00:00<?, ?it/s]

✅ Knowledge base encoded.

--- Testing the fine-tuned bot ---

Your Query: 'how long does it take to get my stuff?'
Most Similar Question in KB: 'how long does it take for a delivery to arrive?' (Score: 0.9999)
Bot's Answer: 'We understand that you're eager to know how long it takes for a delivery to arrive. The estimated delivery time can vary depending on various factors such as the shipping method, destination, and any unforeseen circumstances during transit. To provide you with an accurate estimate, may I please have the {{Tracking Number}} or {{Order Number}}? With this information, we can look up the details and provide you with the most up-to-date information regarding your delivery. Thank you for your understanding.'

--- Another Test ---

Your Query: 'i want to send my item back for a refund'
Most Similar Question in KB: 'want assistance to demand a refund of my money' (Score: 1.0000)
Bot's Answer: 'I see what you mean your frustration and the urgency in seeking a refund for y

In [6]:
# This command creates a zip file of your trained model folder.
!zip -r customer_service_bot_model.zip /content/models/customer-service-bot-finetuned

  adding: content/models/customer-service-bot-finetuned/ (stored 0%)
  adding: content/models/customer-service-bot-finetuned/model.safetensors (deflated 8%)
  adding: content/models/customer-service-bot-finetuned/1_Pooling/ (stored 0%)
  adding: content/models/customer-service-bot-finetuned/1_Pooling/config.json (deflated 57%)
  adding: content/models/customer-service-bot-finetuned/vocab.txt (deflated 53%)
  adding: content/models/customer-service-bot-finetuned/tokenizer.json (deflated 71%)
  adding: content/models/customer-service-bot-finetuned/README.md (deflated 65%)
  adding: content/models/customer-service-bot-finetuned/modules.json (deflated 62%)
  adding: content/models/customer-service-bot-finetuned/config.json (deflated 48%)
  adding: content/models/customer-service-bot-finetuned/2_Normalize/ (stored 0%)
  adding: content/models/customer-service-bot-finetuned/sentence_bert_config.json (deflated 4%)
  adding: content/models/customer-service-bot-finetuned/special_tokens_map.json