## 1. Dependencies and Imports

First, let's import all the necessary libraries for our chatbot implementation:

# Intent Analysis Model
This notebook fine-tunes DistilBERT on our own dataset for intent classification.

## Overview
- Intent classification using DistilBERT
- Response generation based on intents

# Importing the Libraries

In [26]:
# Load the necessary libraries
import json
import pickle
import re
from datetime import datetime, timedelta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

# Building Custom Dataset

In [27]:
class IntentDataset(Dataset):
    """Torch dataset that tokenizes one utterance per item for intent classification.

    Args:
        texts: Indexable collection of utterances (each is coerced with ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` / ``labels`` tensors for sample ``idx``."""
        utterance = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            utterance,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors; flatten() collapses each to 1-D.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Training an Intent Analysis Model

In [28]:
class BookingChatbot:
    """Intent-driven booking assistant built on a fine-tuned DistilBERT classifier.

    Attributes:
        model: The sequence-classification model, or ``None`` until trained/loaded.
        tokenizer: The matching tokenizer, or ``None`` until trained/loaded.
        label_encoder: Maps intent name -> integer class id.
        reverse_label_encoder: Maps integer class id -> intent name.
        conversation_state: Per-user dict ``{'context': ..., 'awaiting': ...}``
            tracking a pending reschedule/cancel exchange.
    """

    # Month token for absolute dates: three-letter abbreviations and full names.
    _MONTH_PATTERN = (
        r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
        r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
    )

    # Ordered from most to least specific; the first pattern that matches wins.
    # The trailing \b stops "2 amazing" from being read as the time "2 am".
    _DATETIME_PATTERNS = (
        (re.compile(r'(\d{1,2})\s+' + _MONTH_PATTERN + r'\s+(\d{4})\s+(\d{1,2})\s*(am|pm)\b',
                    re.IGNORECASE), 'date_time'),
        (re.compile(r'tomorrow\s+at\s+(\d{1,2})\s*(am|pm)\b', re.IGNORECASE), 'tomorrow'),
        (re.compile(r'next\s+(monday|tuesday|wednesday|thursday|friday|saturday|sunday)'
                    r'\s+(\d{1,2})\s*(am|pm)\b', re.IGNORECASE), 'next_weekday'),
        (re.compile(r'(\d{1,2})\s*(am|pm)\b', re.IGNORECASE), 'time_only'),
    )

    # Replies that do not depend on, or modify, the conversation state.
    _STATIC_RESPONSES = {
        'greeting': "Hello! I'm here to help you with your booking needs. How can I assist you today?",
        'thanks': "You're welcome! Is there anything else I can help you with?",
        'reschedule_booking': "Yes, you can reschedule your booking through the Blys app. Would you like me to assist you?",
        'cancel_booking': "I can help you cancel your booking. Please confirm if you'd like to proceed with the cancellation?",
        'pricing_inquiry': "Our massage services start from $80 for a 60-minute session. Prices may vary based on location and specific treatments. Would you like more detailed pricing information?",
        'book_service': "I'd be happy to help you book a service! What type of massage would you like and when would you prefer?",
        'booking_status': "Let me check your booking status. Could you please provide your booking reference or the phone number associated with your account?",
        'deny': "No problem! Is there anything else I can help you with today?",
    }

    _FALLBACK_RESPONSE = "I'm sorry, I didn't understand that. Could you please rephrase your question?"

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.label_encoder = {}
        self.reverse_label_encoder = {}
        self.conversation_state = {}

    def load_data(self, data_path):
        """Read the training JSON and build the label mappings.

        Args:
            data_path: Path to a UTF-8 JSON file holding a list of objects with
                ``'text'`` and ``'intent'`` keys.

        Returns:
            ``(texts, labels)`` where ``labels`` are the integer ids of the intents.

        Side effects:
            Overwrites ``label_encoder`` and ``reverse_label_encoder``.
        """
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        texts = [item['text'] for item in data]
        intents = [item['intent'] for item in data]

        # Sort so the intent -> id mapping is identical on every run; iterating a
        # bare set of strings gives an order that changes between interpreter runs.
        unique_intents = sorted(set(intents))
        self.label_encoder = {intent: idx for idx, intent in enumerate(unique_intents)}
        self.reverse_label_encoder = {idx: intent for intent, idx in self.label_encoder.items()}

        labels = [self.label_encoder[intent] for intent in intents]

        return texts, labels

    def train_model(self, data_path):
        """Fine-tune ``distilbert-base-uncased`` on the intents found in ``data_path``.

        Holds out 20% of the data for validation, trains for up to 20 epochs with
        early stopping (patience 3, monitored on ``eval_loss``) and prints the
        validation accuracy.

        Returns:
            ``(model, tokenizer)`` — the same objects stored on the instance.
        """
        from transformers import TrainerCallback, EarlyStoppingCallback

        print("Loading training data...")
        texts, labels = self.load_data(data_path)

        print("Initializing DistilBERT...")
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        num_labels = len(self.label_encoder)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels
        )

        print("Splitting data...")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )

        print("Creating datasets...")
        train_dataset = IntentDataset(train_texts, train_labels, self.tokenizer)
        val_dataset = IntentDataset(val_texts, val_labels, self.tokenizer)

        class TqdmCallback(TrainerCallback):
            """Advance an epoch-level tqdm bar and show the latest logged losses."""

            def __init__(self, progress_bar):
                self.pbar = progress_bar

            @staticmethod
            def _latest(log_history, keys):
                """Return the most recent logged value stored under any of ``keys``."""
                for entry in reversed(log_history):
                    for key in keys:
                        if key in entry:
                            return entry[key]
                return None

            def on_epoch_end(self, args, state, control, **kwargs):
                self.pbar.update(1)
                if state.log_history:
                    postfix = {
                        'epoch': f"{state.epoch:.0f}/{args.num_train_epochs}",
                    }
                    # Step logs carry the running loss under 'loss'; 'train_loss'
                    # only appears in the final summary, so checking it alone
                    # never displayed anything during training.
                    train_loss = self._latest(state.log_history, ('loss', 'train_loss'))
                    if train_loss is not None:
                        postfix['loss'] = f"{train_loss:.4f}"
                    # Most recent evaluation result available at this point.
                    eval_loss = self._latest(state.log_history, ('eval_loss',))
                    if eval_loss is not None:
                        postfix['val_loss'] = f"{eval_loss:.4f}"
                    self.pbar.set_postfix(postfix)

        # NOTE(review): with a small dataset, 500 warmup steps may exceed the total
        # number of optimisation steps, so the peak learning rate might never be
        # reached — confirm against the dataset size.
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=20,  # Upper bound; early stopping usually ends sooner
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            save_strategy="epoch",
            eval_strategy="epoch",
            load_best_model_at_end=True,
            logging_steps=10,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stop if no improvement for 3 epochs
        )

        print("Training model with early stopping...")
        with tqdm(total=training_args.num_train_epochs, desc="Training Progress", unit="epoch") as pbar:
            trainer.add_callback(TqdmCallback(pbar))
            trainer.train()

        print(f"\nTraining completed after {trainer.state.epoch} epochs")
        print("Evaluating model...")
        with tqdm(desc="Evaluating", total=1) as eval_pbar:
            predictions = trainer.predict(val_dataset)
            eval_pbar.update(1)

        predicted_labels = np.argmax(predictions.predictions, axis=1)
        accuracy = accuracy_score(val_labels, predicted_labels)
        print(f"Validation Accuracy: {accuracy:.4f}")

        return self.model, self.tokenizer

    def predict_intent(self, text):
        """Classify ``text`` and return ``(intent_name, confidence)``.

        Raises:
            ValueError: If no model/tokenizer has been trained or loaded.
        """
        # Compare against None explicitly; truth-testing a tokenizer or module
        # relies on whatever __len__/__bool__ those objects happen to define.
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not trained or loaded!")

        # Inference must run with dropout disabled.
        self.model.eval()

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )

        # Training may have moved the model to an accelerator; inputs must follow it.
        device = next(self.model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_label = torch.argmax(predictions, dim=-1).item()
            confidence = predictions[0][predicted_label].item()

        intent = self.reverse_label_encoder[predicted_label]
        return intent, confidence

    def extract_datetime(self, text):
        """Find the first date/time expression in ``text``.

        Patterns are tried from most to least specific. Matching is
        case-insensitive and the matched substring is returned exactly as the
        user typed it.

        Returns:
            ``(matched_text, pattern_type)`` where ``pattern_type`` is one of
            ``'date_time'``, ``'tomorrow'``, ``'next_weekday'``, ``'time_only'``;
            ``(None, None)`` when nothing matches or ``text`` is empty.
        """
        if not text:
            return None, None

        for pattern, pattern_type in self._DATETIME_PATTERNS:
            match = pattern.search(text)
            if match:
                return match.group(0), pattern_type

        return None, None

    def generate_response(self, intent, text, user_id='default'):
        """Produce the reply for ``intent``, updating ``user_id``'s conversation state.

        Only the handler for the predicted intent runs. Building every reply up
        front would also execute the confirmation and datetime handlers, which
        mutate state, and would clear a pending cancellation before the user could
        confirm it.

        Returns:
            The reply string; a generic fallback for intents without a reply.
        """
        state = self.conversation_state.setdefault(
            user_id, {'context': None, 'awaiting': None}
        )

        if intent == 'confirm':
            return self.handle_confirmation(state, text)
        if intent == 'provide_datetime':
            return self.handle_datetime(state, text)
        if intent == 'deny':
            # An explicit "no" abandons whatever was pending, so a later
            # unrelated "yes" cannot confirm it.
            state['context'] = None
            state['awaiting'] = None

        return self._STATIC_RESPONSES.get(intent, self._FALLBACK_RESPONSE)

    def handle_confirmation(self, state, text):
        """Reply to a confirmation according to the pending ``state['context']``.

        A pending reschedule moves on to awaiting a datetime; a pending cancel is
        completed and its context cleared; otherwise a generic acknowledgement is
        returned. ``text`` is accepted for signature parity and not inspected.
        """
        if state['context'] == 'reschedule':
            state['awaiting'] = 'datetime'
            return "Great! Please provide the new date and time you'd like to reschedule your booking to."
        elif state['context'] == 'cancel':
            state['context'] = None
            return "Your booking cancellation request has been processed. You'll receive a confirmation email shortly."
        else:
            return "Thank you for confirming!"

    def handle_datetime(self, state, text):
        """Reply to a message that should contain a date/time.

        Completes (and clears) a pending reschedule when a datetime is found,
        acknowledges the time when nothing is pending, or asks the user to
        rephrase when no datetime can be extracted.
        """
        datetime_text, _ = self.extract_datetime(text)

        if datetime_text:
            if state['context'] == 'reschedule' or state['awaiting'] == 'datetime':
                state['context'] = None
                state['awaiting'] = None
                return f"Perfect! I've sent your reschedule request for {datetime_text} to your service provider. You'll receive a notification once it's confirmed."
            else:
                return f"I've noted the time {datetime_text}. How would you like to proceed?"
        else:
            return "I couldn't understand the date and time format. Could you please provide it in a format like '30 Mar 2025 10 am' or 'tomorrow at 2 PM'?"

    def chat(self, user_input, user_id='default'):
        """Run one conversational turn.

        Returns:
            Dict with ``'response'``, ``'intent'`` and ``'confidence'`` keys.
        """
        intent, confidence = self.predict_intent(user_input)

        # Starting a reschedule/cancel flow replaces any previous pending state.
        if intent == 'reschedule_booking':
            self.conversation_state[user_id] = {'context': 'reschedule', 'awaiting': None}
        elif intent == 'cancel_booking':
            self.conversation_state[user_id] = {'context': 'cancel', 'awaiting': None}

        response = self.generate_response(intent, user_input, user_id)

        return {
            'response': response,
            'intent': intent,
            'confidence': confidence
        }

    def save_model(self, filepath):
        """Pickle the model weights, tokenizer and label mappings to ``filepath``."""
        state_dict = None
        if self.model is not None:
            # Store CPU copies so the file can be loaded on a machine without
            # the accelerator the model was trained on.
            state_dict = {
                name: tensor.detach().cpu()
                for name, tensor in self.model.state_dict().items()
            }

        model_data = {
            'model_state_dict': state_dict,
            'tokenizer': self.tokenizer,
            'label_encoder': self.label_encoder,
            'reverse_label_encoder': self.reverse_label_encoder
        }

        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

        print(f"Model saved to {filepath}")

    def load_model(self, filepath):
        """Restore a chatbot previously written by ``save_model``.

        SECURITY: ``pickle.load`` executes arbitrary code embedded in the file.
        Only load files you created yourself or otherwise fully trust.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.tokenizer = model_data['tokenizer']
        self.label_encoder = model_data['label_encoder']
        self.reverse_label_encoder = model_data['reverse_label_encoder']

        if model_data['model_state_dict'] is not None:
            num_labels = len(self.label_encoder)
            # Rebuild the architecture from the base checkpoint, then overwrite
            # its weights with the fine-tuned ones.
            self.model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-uncased',
                num_labels=num_labels
            )
            self.model.load_state_dict(model_data['model_state_dict'])
            self.model.eval()

        print(f"Model loaded from {filepath}")

In [29]:
if __name__ == "__main__":
    # Fine-tune from scratch, then persist the result so later sessions can
    # reload it instead of retraining.
    chatbot = BookingChatbot()
    chatbot.train_model('training_data.json')
    chatbot.save_model('chatbot_model.pkl')

    print("\nChatbot is ready! Testing with example scenarios...")

    # Scripted mini-conversation: a full reschedule flow, a pricing question,
    # and the opening of a cancel flow.
    test_queries = [
        "Can I reschedule my booking?",
        "Yes",
        "30 Mar 2025 10 am",
        "How much does a massage cost?",
        "I want to cancel my appointment"
    ]

    for query in test_queries:
        result = chatbot.chat(query)
        print(
            f"\nUser: {query}",
            f"Bot: {result['response']}",
            f"Intent: {result['intent']} (confidence: {result['confidence']:.3f})",
            sep="\n",
        )

Loading training data...
Initializing DistilBERT...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Splitting data...
Creating datasets...
Training model with early stopping...




Epoch,Training Loss,Validation Loss
1,2.3001,2.298853
2,2.2838,2.271428
3,2.2084,2.17652
4,2.0684,1.951808
5,1.7175,1.657683
6,1.4002,1.312106
7,1.0659,1.024837
8,0.6536,0.719701
9,0.3978,0.475526
10,0.1803,0.312435


Training Progress:  75%|███████▌  | 15/20 [14:12<04:44, 56.85s/epoch, epoch=15/20]



Training completed after 15.0 epochs
Evaluating model...




Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


Validation Accuracy: 0.9375
Model saved to chatbot_model.pkl

Chatbot is ready! Testing with example scenarios...

User: Can I reschedule my booking?
Bot: Yes, you can reschedule your booking through the Blys app. Would you like me to assist you?
Intent: reschedule_booking (confidence: 0.967)

User: Yes
Bot: Great! Please provide the new date and time you'd like to reschedule your booking to.
Intent: confirm (confidence: 0.966)

User: 30 Mar 2025 10 am
Bot: Perfect! I've sent your reschedule request for 30 mar 2025 10 am to your service provider. You'll receive a notification once it's confirmed.
Intent: provide_datetime (confidence: 0.976)

User: How much does a massage cost?
Bot: Our massage services start from $80 for a 60-minute session. Prices may vary based on location and specific treatments. Would you like more detailed pricing information?
Intent: pricing_inquiry (confidence: 0.975)

User: I want to cancel my appointment
Bot: I can help you cancel your booking. Please confirm 

# Running Inference with the Model

In [31]:
# Broader smoke test: several phrasings per intent plus a few out-of-scope
# questions, run through the already-trained `chatbot` in a single conversation.
test_queries = [
    # Greetings
    "Hello",
    "Hi there",
    "Good morning",
    "Hey",

    # Reschedule booking scenarios
    "Can I reschedule my booking?",
    "I need to change my appointment time",
    "Can we move my massage to another day?",
    "I want to reschedule my session",
    "I'm running late, can I change my time?",
    "My plans changed, need to reschedule",

    # Confirmations and responses
    "Yes",
    "Yes please",
    "Sure",
    "Absolutely",
    "No",
    "No thanks",
    "Not really",

    # Date/time inputs
    "30 Mar 2025 10 am",
    "Tomorrow at 2 PM",
    "Next Monday 9 am",
    "15 April 2025 3 pm",
    "This Friday at 4 PM",

    # Pricing inquiries
    "How much does a massage cost?",
    "What are your prices?",
    "How much do you charge?",
    "What's the cost of a 60 minute massage?",
    "Tell me about your pricing",
    "Are there any discounts?",

    # Booking new services
    "I want to book a massage",
    "Can I schedule an appointment?",
    "I'd like to make a reservation",
    "Book me a relaxation massage",
    "I need to schedule a deep tissue massage",
    "I'd like to book for next weekend",
    "What time slots are available?",
    "Do you have availability this week?",

    # Cancellation requests
    "I want to cancel my appointment",
    "Cancel my booking please",
    "I need to cancel my massage",
    "Please cancel my reservation",

    # Booking status inquiries
    "What's the status of my booking?",
    "Is my appointment confirmed?",
    "When is my massage scheduled?",
    "Can you check my appointment?",

    # Thanks
    "Thank you",
    "Much appreciated",

    # Additional variations and edge cases
    "What types of massage do you offer?",
    "I have a gift voucher to use",
    "Can I book multiple sessions?",
    "What's your cancellation policy?",
    "Do you offer home service?",
    "Can I change the therapist?",
    "I need to update my contact details",
    "How long before my appointment should I arrive?",
]

print(
    f"Testing chatbot with {len(test_queries)} comprehensive test queries:",
    "=" * 80,
    sep="\n",
)

# All queries share the default user id, so conversation state carries over
# from one query to the next.
for i, query in enumerate(test_queries, start=1):
    result = chatbot.chat(query)
    print(
        f"{i:2d}. User: {query}",
        f"    Bot: {result['response']}",
        f"    Intent: {result['intent']} (confidence: {result['confidence']:.3f})",
        "-" * 80,
        sep="\n",
    )

Testing chatbot with 56 comprehensive test queries:
 1. User: Hello
    Bot: Hello! I'm here to help you with your booking needs. How can I assist you today?
    Intent: greeting (confidence: 0.974)
--------------------------------------------------------------------------------
 2. User: Hi there
    Bot: Hello! I'm here to help you with your booking needs. How can I assist you today?
    Intent: greeting (confidence: 0.974)
--------------------------------------------------------------------------------
 3. User: Good morning
    Bot: Hello! I'm here to help you with your booking needs. How can I assist you today?
    Intent: greeting (confidence: 0.974)
--------------------------------------------------------------------------------
 4. User: Hey
    Bot: Hello! I'm here to help you with your booking needs. How can I assist you today?
    Intent: greeting (confidence: 0.973)
--------------------------------------------------------------------------------
 5. User: Can I reschedule m