In [13]:
%pip install datasets transformers torch pandas numpy scikit-learn




In [14]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer, 
    TrainingArguments
)
from sklearn.model_selection import train_test_split
import json
import re

In [15]:
dataset = load_dataset("bitext/Bitext-insurance-llm-chatbot-training-dataset")
print(f"Dataset structure: {dataset}")
print(f"Number of examples: {len(dataset['train'])}")
print(f"First example: {dataset['train'][0]}")

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'category', 'tags', 'response'],
        num_rows: 39000
    })
})
Number of examples: 39000
First example: {'instruction': "I'd like to see my fucking auto insurance , I need assistance", 'intent': 'information_auto_insurance', 'category': 'AUTO_INSURANCE', 'tags': 'BCLPWZ', 'response': 'To retrieve the details of your auto insurance, please adhere to the following instructions:\n\n1. Access your account by navigating to {{WEBSITE_URL}}.\n2. Proceed to the {{MY_POLICIES_SECTION}} section within your profile.\n3. Select the auto insurance policy that you want to review.\n\nThis process will grant you comprehensive access to your auto insurance information. Should you require any additional support, do not hesitate to contact me.'}


In [16]:
df = pd.DataFrame(dataset['train'])
print(f"Columns: {df.columns.tolist()}")
print(f"Dataset shape: {df.shape}")
print(f"\nFirst 5 rows:")
print(df.head())

Columns: ['instruction', 'intent', 'category', 'tags', 'response']
Dataset shape: (39000, 5)

First 5 rows:
                                         instruction  \
0  I'd like to see my fucking auto insurance , I ...   
1  wanna know more about my auto insurance giev m...   
2  I'd like to sde my fucking auto insurance coul...   
3  wanna see my fucking auto insurance where coul...   
4  I need information about my fucking auto insur...   

                       intent        category      tags  \
0  information_auto_insurance  AUTO_INSURANCE    BCLPWZ   
1  information_auto_insurance  AUTO_INSURANCE      BCQZ   
2  information_auto_insurance  AUTO_INSURANCE  BCILPQWZ   
3  information_auto_insurance  AUTO_INSURANCE   BCILPQW   
4  information_auto_insurance  AUTO_INSURANCE      BCIW   

                                            response  
0  To retrieve the details of your auto insurance...  
1  To retrieve your auto insurance details, pleas...  
2  To obtain your auto insurance in

In [17]:
intent_counts = df['intent'].value_counts()
print(f"Intent distribution:\n{intent_counts}")

Intent distribution:
intent
information_auto_insurance       1000
accept_settlement                1000
file_claim                       1000
negotiate_settlement             1000
receive_payment                  1000
reject_settlement                1000
track_claim                      1000
appeal_denied_insurance_claim    1000
dispute_invoice                  1000
file_complaint                   1000
agent                            1000
customer_service                 1000
human_agent                      1000
insurance_representative         1000
change_coverage                  1000
check_coverage                   1000
downgrade_coverage               1000
upgrade_coverage                 1000
buy_insurance_policy             1000
cancellation_fees                1000
cancel_insurance_policy          1000
compare_insurance_policies       1000
general_information              1000
information_health_insurance     1000
information_home_insurance       1000
report_incident       

In [18]:
faq_intents = [
    'check_coverage', 'policy_details', 'insurance_types', 
    'deductible_info', 'coverage_limits', 'exclusions'
]

claims_intents = [
    'file_claim', 'claim_status', 'claim_documentation', 
    'claim_process', 'claim_timeline'
]

premium_intents = [
    'get_quote', 'premium_factors', 'discount_eligibility', 
    'payment_options', 'premium_calculation'
]

In [19]:
def categorize_intent(intent):
    if intent in faq_intents:
        return 'FAQ'
    elif intent in claims_intents:
        return 'CLAIMS'
    elif intent in premium_intents:
        return 'PREMIUM'
    else:
        return 'OTHER'

df['category'] = df['intent'].apply(categorize_intent)
print(f"Category distribution:\n{df['category'].value_counts()}")


Category distribution:
category
OTHER     37000
CLAIMS     1000
FAQ        1000
Name: count, dtype: int64


In [20]:
training_examples = []
for _, row in df.iterrows():
    # Create instruction-response pairs
    example = {
        'instruction': row['instruction'],
        'response': row['response'],
        'intent': row['intent'],
        'category': row['category']
    }
    training_examples.append(example)

In [21]:
train_data, val_data = train_test_split(
    training_examples, 
    test_size=0.2, 
    random_state=42,
    stratify=[ex['category'] for ex in training_examples]
)

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")


Training examples: 31200
Validation examples: 7800


In [22]:
with open('train_data.json', 'w') as f:
    json.dump(train_data, f)

with open('val_data.json', 'w') as f:
    json.dump(val_data, f)
