In [1]:
import json

# Load the validation split. The encoding is passed explicitly because JSON text is
# UTF-8, whereas open() otherwise falls back to the platform's default encoding.
with open('normalised_intent_validation_slotfixed_set.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

utterances_validate = []  # user utterance text, one entry per kept frame
intents_validate = []     # active intent label aligned with utterances_validate

for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            # Only turns spoken by the user are collected.
            if turn["speaker"] != "USER":
                continue

            for frame in turn.get("frames", []):
                service = frame.get("service")
                active_intent = frame["state"]["active_intent"]

                # Keep hotel/train frames whose active intent is not the lowercase
                # "none" placeholder. A turn with several matching frames adds one
                # (utterance, intent) pair per frame.
                if service in ["hotel", "train"] and active_intent != "none":
                    utterances_validate.append(turn["utterance"])
                    intents_validate.append(active_intent)

print(f"Total utterances: {len(utterances_validate)}")
print(f"Sample utterances: {utterances_validate[:5]}")
print(f"Sample intents: {intents_validate[:5]}")


Total utterances: 117
Sample utterances: ['Can you tell me when my train departs and if there are any luggage storage options?', 'Can you tell me the train schedule and any nearby attractions?', 'Actually, could you also tell me how often the trains run?', 'Can you tell me when the next train to London leaves?', 'Actually, could you tell me how long the journey to London will take?']
Sample intents: ['find_train', 'find_train', 'find_train', 'find_train', 'find_train']


In [2]:
# Show the distinct intent labels collected from the validation split.
print(set(intents_validate))

{'find_train', 'book_hotel', 'book_train', 'find_hotel'}


In [3]:
import json

# Load the test split. The encoding is passed explicitly because JSON text is
# UTF-8, whereas open() otherwise falls back to the platform's default encoding.
with open('normalised_intent_test_slotfixed_set.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

utterances_test = []  # user utterance text, one entry per kept frame
intents_test = []     # active intent label aligned with utterances_test

for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            # Only turns spoken by the user are collected.
            if turn["speaker"] != "USER":
                continue

            for frame in turn.get("frames", []):
                service = frame.get("service")
                active_intent = frame["state"]["active_intent"]

                # Keep hotel/train frames whose active intent is not the lowercase
                # "none" placeholder. A turn with several matching frames adds one
                # (utterance, intent) pair per frame.
                if service in ["hotel", "train"] and active_intent != "none":
                    utterances_test.append(turn["utterance"])
                    intents_test.append(active_intent)

print(f"Total utterances: {len(utterances_test)}")
print(f"Sample utterances: {utterances_test[:5]}")
print(f"Sample intents: {intents_test[:5]}")


Total utterances: 310
Sample utterances: ['Can you tell me when the next train arrives and if there are any delays?', 'Thank you! How often do trains run to this destination?', 'Great, can I buy a ticket for the next train at the station?', 'Can you tell me when the next stop is?', 'Yes, I would like to buy a ticket to Green Park.']
Sample intents: ['find_train', 'find_train', 'book_train', 'find_train', 'book_train']


In [4]:
# Show the distinct intent labels collected from the test split.
print(set(intents_test))

{'find_train', 'book_hotel', 'book_train', 'find_hotel'}


In [5]:
import json

# Load the training split. The encoding is passed explicitly because JSON text is
# UTF-8, whereas open() otherwise falls back to the platform's default encoding.
with open('normalised_intent_train_slotfixed_set.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

utterances_train = []  # user utterance text, one entry per kept frame
intents_train = []     # active intent label aligned with utterances_train

for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            # Only turns spoken by the user are collected.
            if turn["speaker"] != "USER":
                continue

            for frame in turn.get("frames", []):
                service = frame.get("service")
                active_intent = frame["state"]["active_intent"]

                # Keep hotel/train frames whose active intent is not the lowercase
                # "none" placeholder. A turn with several matching frames adds one
                # (utterance, intent) pair per frame.
                if service in ["hotel", "train"] and active_intent != "none":
                    utterances_train.append(turn["utterance"])
                    intents_train.append(active_intent)

print(f"Total utterances: {len(utterances_train)}")
print(f"Sample utterances: {utterances_train[:5]}")
print(f"Sample intents: {intents_train[:5]}")


Total utterances: 1762
Sample utterances: ['Can you tell me the train schedule and how much the tickets cost?', 'Yes, I would like to purchase a ticket for the 3 PM train. Can you also tell me about the upcoming stops?', "No, that's all I needed. Thank you for your help!", 'Actually, I just remembered, can I add a return ticket as well?', 'Can you tell me the train schedule and how to get to the nearest station exit?']
Sample intents: ['find_train', 'book_train', 'book_train', 'book_train', 'find_hotel']


In [6]:
# Show the distinct intent labels collected from the training split.
print(set(intents_train))

{'find_train', 'book_hotel', 'book_train', 'find_hotel'}


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import make_scorer, f1_score, precision_score
from sklearn.model_selection import ParameterGrid
import numpy as np

In [9]:
# Micro-averaged F1 wrapped as a scikit-learn scorer object.
f1_scorer = make_scorer(f1_score, average='micro')

# Hand-picked per-class weights offered to the SVC as one class_weight candidate
# (custom weights reducing the "train" impact).
class_weights = {
    'book_hotel': 0.4,
    'book_train': 0.45,
    'find_hotel': 0.45,
    'find_train': 0.3,
}

# TF-IDF features feeding a support-vector classifier. The kernel set here is
# only a starting value; the search grid below overrides it per candidate.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='linear', probability=True, random_state=42)),
])

# Full Cartesian grid of candidate settings, keyed as '<step>__<parameter>'.
parameters = ParameterGrid({
    # --- classifier ---
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__C': [0.0001, 0.001, 0.01, 0.1],
    'svc__degree': [2, 3, 4],                      # only relevant for the 'poly' kernel
    'svc__gamma': ['scale', 'auto', 0.1, 1, 10],   # kernel coefficient
    'svc__class_weight': [None, 'balanced', class_weights],

    # --- vectoriser ---
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__min_df': [1, 2, 3, 5],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__max_features': [None, 5000, 10000, 20000],
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__norm': ['l1', 'l2', None],
})

# Running best of the search; both names are updated by the search loop.
best_score = 0
best_params = {}

In [173]:
N = 5  # Number of epochs
for epoch in range(1, N+1):
    print(f"Epoch {epoch}/{N}")

    # Draw two distinct candidates per epoch by sampling grid *indices*.
    # ParameterGrid supports len() and integer indexing, so the full grid never
    # has to be expanded into a list (and then a NumPy object array) every epoch.
    sampled_indices = np.random.choice(len(parameters), replace=False, size=2)

    for grid_index in sampled_indices:
        # int() turns the NumPy integer into a plain Python int before indexing.
        params = parameters[int(grid_index)]

        # Configure the pipeline with the current candidate and fit on the training split
        pipeline.set_params(**params)
        pipeline.fit(utterances_train, intents_train)

        # Evaluate on the validation set (micro-averaged F1)
        current_score = f1_score(intents_validate, pipeline.predict(utterances_validate), average='micro')

        # Update best score and parameters if current model is better.
        # best_score / best_params are not reset here, so re-running this cell
        # continues the search from the previous best.
        if current_score > best_score:
            best_score = current_score
            best_params = params
            print(f"New best score: {best_score:.4f} with params: {params}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
New best score: 0.5768 with params: {'svc__C': 0.1, 'svc__class_weight': 'balanced', 'svc__degree': 3, 'svc__gamma': 1, 'svc__kernel': 'linear', 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50


In [10]:
# Previously found best SVM configuration, kept for reference; uncomment to reuse it
# instead of the result of a fresh search.
#best_params = {'svc__C': 0.1, 'svc__class_weight': 'balanced', 'svc__degree': 3, 'svc__gamma': 1, 'svc__kernel': 'linear', 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}
# Configure the SVM pipeline with the selected hyper-parameters. If best_params is
# still the empty dict it was initialised to, no parameter is passed.
pipeline.set_params(**best_params)

In [11]:
from sklearn.utils import shuffle

# Merge the training and validation splits into one fitting set.
# Despite the "_bin" suffix, the labels are the intent strings as collected.
X_combined = [*utterances_train, *utterances_validate]
y_combined_bin = np.concatenate([intents_train, intents_validate])

# Shuffle texts and labels together with a fixed seed so the order is repeatable.
X_combined, y_combined_bin = shuffle(
    X_combined,
    y_combined_bin,
    random_state=42,
)

# Refit the vectoriser on the merged texts and keep the resulting feature matrix.
X_combined_tfidf = pipeline.named_steps['tfidf'].fit_transform(X_combined)

# Refit the classifier step directly on those features.
pipeline.named_steps['svc'].fit(X_combined_tfidf, y_combined_bin)

In [27]:
import joblib
#pipeline = joblib.load("saved_models\\finalized_model_intents_SVM.joblib")

In [12]:
# Score the SVM pipeline on the test split: per-class report, then overall accuracy.
y_test_pred = pipeline.predict(utterances_test)
print(
    "Test Set Evaluation:",
    classification_report(intents_test, y_test_pred),
    sep="\n",
)
print("Test Accuracy:", accuracy_score(intents_test, y_test_pred))

Test Set Evaluation:
              precision    recall  f1-score   support

  book_hotel       0.70      0.52      0.60        88
  book_train       0.38      0.83      0.53        12
  find_hotel       0.45      0.56      0.50        64
  find_train       0.97      0.92      0.94       146

    accuracy                           0.73       310
   macro avg       0.63      0.71      0.64       310
weighted avg       0.76      0.73      0.74       310

Test Accuracy: 0.7290322580645161


In [13]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import numpy as np


# Fix the label order once and pass it to confusion_matrix, so the axis ticks are
# guaranteed to line up with the matrix rows/columns even when the predictions
# contain a label that is absent from the test labels (or vice versa).
labels = sorted(set(intents_test) | set(y_test_pred))

conf_matrix = confusion_matrix(intents_test, y_test_pred, labels=labels)

# Normalize each row (true label) to percentages. Rows with no true samples are
# left at 0 instead of dividing by zero.
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_normalized = np.divide(
    conf_matrix.astype('float'),
    row_totals,
    out=np.zeros(conf_matrix.shape, dtype=float),
    where=row_totals != 0,
)
conf_matrix_percentage = conf_matrix_normalized * 100  # Convert fractions to percentages


# Cell annotations: percentage rounded to two decimals with a '%' suffix.
annotations = np.around(conf_matrix_percentage, decimals=2).astype(str)
annotations = np.char.add(annotations, '%')


fig = ff.create_annotated_heatmap(
    z=conf_matrix_percentage,
    x=labels,
    y=labels,
    colorscale='Viridis',
    annotation_text=annotations,
    showscale=True
)


fig.update_layout(
    title='Confusion Matrix (Percentages)',
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='True Label')
)


fig.show()


In [14]:
import joblib
from pathlib import Path

# Save the model as a checkpoint.
# The path is built with pathlib rather than a hard-coded backslash, so it also
# resolves to a file inside the directory on non-Windows systems, and the
# directory is created first so the dump does not fail when it is missing.
model_dir = Path('saved_models_synth')
model_dir.mkdir(parents=True, exist_ok=True)
model_filename = str(model_dir / 'finalized_model_intents_SVM.joblib')
joblib.dump(pipeline, model_filename)


print(f"Model saved as {model_filename}")


Model saved as saved_models_synth\finalized_model_intents_SVM.joblib


In [15]:
from sklearn.naive_bayes import MultinomialNB


# TF-IDF features (English stop words removed) feeding a multinomial Naive Bayes classifier.
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

# Full Cartesian grid of candidate settings, keyed as '<step>__<parameter>'.
# Note: this rebinds `parameters`, replacing the grid defined for the SVM search.
parameters = ParameterGrid({
    # --- classifier ---
    'nb__alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0],
    'nb__fit_prior': [True, False],

    # --- vectoriser ---
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__min_df': [1, 2, 3, 5],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__max_features': [None, 5000, 10000, 20000],
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__norm': ['l1', 'l2', None],
})

In [19]:
# Start the Naive Bayes search from scratch.
best_score = 0
best_params = {}

# NOTE: N (number of epochs) is not defined in this cell; it is taken from the
# SVM search cell and must already exist in the kernel.
for epoch in range(1, N+1):
    print(f"Epoch {epoch}/{N}")

    # Draw two distinct candidates per epoch by sampling grid *indices*.
    # ParameterGrid supports len() and integer indexing, so the full grid never
    # has to be expanded into a list (and then a NumPy object array) every epoch.
    sampled_indices = np.random.choice(len(parameters), replace=False, size=2)

    for grid_index in sampled_indices:
        # int() turns the NumPy integer into a plain Python int before indexing.
        params = parameters[int(grid_index)]

        # Configure the pipeline with the current candidate and fit on the training split.
        pipeline_nb.set_params(**params)
        pipeline_nb.fit(utterances_train, intents_train)

        # Score the candidate on the validation split (micro-averaged F1).
        current_score = f1_score(intents_validate, pipeline_nb.predict(utterances_validate), average='micro')

        # Keep the candidate only if it strictly improves on the best so far.
        if current_score > best_score:
            best_score = current_score
            best_params = params
            print(f"New best score: {best_score:.4f} with params: {params}")

print("Best parameters found:", best_params)
print("Best score achieved:", best_score)

Epoch 1/20
New best score: 0.4199 with params: {'nb__alpha': 100.0, 'nb__fit_prior': False, 'tfidf__max_df': 0.75, 'tfidf__max_features': 20000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 3), 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': False}
New best score: 0.5229 with params: {'nb__alpha': 0.001, 'nb__fit_prior': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 20000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l1', 'tfidf__sublinear_tf': False, 'tfidf__use_idf': False}
Epoch 2/20
Epoch 3/20
New best score: 0.5343 with params: {'nb__alpha': 0.5, 'nb__fit_prior': False, 'tfidf__max_df': 0.5, 'tfidf__max_features': 10000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 3), 'tfidf__norm': None, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': False}
Epoch 4/20
New best score: 0.5425 with params: {'nb__alpha': 10.0, 'nb__fit_prior': True, 'tfidf__max_df': 1.0, 'tfidf__max_features': None, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 3), 'tf

In [21]:
# Configure the Naive Bayes pipeline with the best hyper-parameters found by the
# search. If best_params is still the empty dict it was initialised to, no
# parameter is passed.
pipeline_nb.set_params(**best_params)

In [22]:
from sklearn.utils import shuffle

# Merge the training and validation splits into one fitting set.
# Despite the "_bin" suffix, the labels are the intent strings as collected.
X_combined = [*utterances_train, *utterances_validate]
y_combined_bin = np.concatenate([intents_train, intents_validate])

# Shuffle texts and labels together with a fixed seed so the order is repeatable.
X_combined, y_combined_bin = shuffle(
    X_combined,
    y_combined_bin,
    random_state=42,
)

# Refit the vectoriser on the merged texts and keep the resulting feature matrix.
X_combined_tfidf = pipeline_nb.named_steps['tfidf'].fit_transform(X_combined)

# Refit the classifier step directly on those features.
pipeline_nb.named_steps['nb'].fit(X_combined_tfidf, y_combined_bin)

In [23]:
# Score the Naive Bayes pipeline on the test split: per-class report, then overall accuracy.
y_test_preds = pipeline_nb.predict(utterances_test)
print(
    "Test Set Evaluation:",
    classification_report(intents_test, y_test_preds),
    sep="\n",
)
print("Test Accuracy:", accuracy_score(intents_test, y_test_preds))

Test Set Evaluation:
              precision    recall  f1-score   support

  book_hotel       0.75      0.88      0.81        88
  book_train       0.62      0.67      0.64        12
  find_hotel       0.78      0.55      0.64        64
  find_train       0.94      0.97      0.95       146

    accuracy                           0.84       310
   macro avg       0.77      0.76      0.76       310
weighted avg       0.84      0.84      0.84       310

Test Accuracy: 0.8419354838709677


In [24]:
# Fix the label order once and pass it to confusion_matrix, so the axis ticks are
# guaranteed to line up with the matrix rows/columns even when the predictions
# contain a label that is absent from the test labels (or vice versa).
labels = sorted(set(intents_test) | set(y_test_preds))

conf_matrix = confusion_matrix(intents_test, y_test_preds, labels=labels)

# Normalize each row (true label). Rows with no true samples are left at 0
# instead of dividing by zero.
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_normalized = np.divide(
    conf_matrix.astype('float'),
    row_totals,
    out=np.zeros(conf_matrix.shape, dtype=float),
    where=row_totals != 0,
)


# Percentages rounded to two decimals; used both as heatmap values and annotations.
conf_matrix_percentage = np.round(conf_matrix_normalized * 100, 2)

# Add a '%' suffix so the annotations match the SVM confusion-matrix figure.
annotations = np.char.add(conf_matrix_percentage.astype(str), '%')


fig = ff.create_annotated_heatmap(
    z=conf_matrix_percentage,
    x=labels,
    y=labels,
    colorscale='Viridis',
    annotation_text=annotations,
    showscale=True
)


fig.update_layout(
    title='Confusion Matrix (Percentages)',
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='True Label')
)


fig.show()

In [25]:
from pathlib import Path

# Save the model as a checkpoint.
# The path is built with pathlib rather than a hard-coded backslash, so it also
# resolves to a file inside the directory on non-Windows systems, and the
# directory is created first so the dump does not fail when it is missing.
model_dir = Path('saved_models_synth')
model_dir.mkdir(parents=True, exist_ok=True)
model_filename = str(model_dir / 'finalized_model_intents_NB.joblib')
joblib.dump(pipeline_nb, model_filename)


print(f"Model saved as {model_filename}")

Model saved as saved_models_synth\finalized_model_intents_NB.joblib
