In [1]:
import json

# Load the dataset
with open('normalised_intent_validation_slotfixed_set.json', 'r') as file:
    data = json.load(file)

utterances_validate = []
slot_types_validate = []

desired_classes = {'hotel-name', 'hotel-type', 'hotel-bookday', 'hotel-internet', 'hotel-bookstay', 'hotel-area', 'hotel-parking', 'hotel-bookpeople', 'hotel-pricerange', 'train-departure', 'train-arriveby', 'train-destination', 'train-day', 'train-bookpeople', 'train-leaveat'}

# Navigate through the nested structure
for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            if turn["speaker"] == "USER":
                current_slots = set()
                for frame in turn.get("frames", []):
                    # Filter slot values to include only those in the desired classes
                    current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot in desired_classes)
                if current_slots:  # only if slot_values are present
                    utterances_validate.append(turn["utterance"])
                    slot_types_validate.append(list(current_slots))

print(f"Total utterances: {len(utterances_validate)}")
print(f"Sample utterances: {utterances_validate[:5]}")
print(f"Sample slot_types: {slot_types_validate[:5]}")


Total utterances: 75
Sample utterances: ['Can you tell me when my train departs and if there are any luggage storage options?', 'I am looking to travel from Cambridge to London. What are the schedules and how much do the tickets cost?', 'Yes, I would like to book a ticket for this Thursday. Do I qualify for any discounts?', 'Yes, I am a student. Can I get the student discount for my ticket?', 'Yes, please book the ticket for me for this Thursday with the student discount. Can you also tell me what time the train leaves?']
Sample slot_types: [['train-leaveat'], ['train-destination', 'train-departure'], ['train-day', 'train-departure'], ['train-day', 'train-departure'], ['train-day', 'train-departure']]


In [2]:
import json

# Load the dataset
with open('normalised_intent_test_slotfixed_set.json', 'r') as file:
    data = json.load(file)

utterances_test = []
slot_types_test = []


for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            if turn["speaker"] == "USER":
                current_slots = set()
                for frame in turn.get("frames", []):
                    # Filter slot values to include only those in the desired classes
                    current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot in desired_classes)
                if current_slots:  # only if slot_values are present
                    utterances_test.append(turn["utterance"])
                    slot_types_test.append(list(current_slots))

print(f"Total utterances: {len(utterances_test)}")
print(f"Sample utterances: {utterances_test[:5]}")
print(f"Sample slot_types: {slot_types_test[:5]}")


Total utterances: 259
Sample utterances: ['No, I already have a ticket. Thanks for the information!', "No, that's all I needed. Thank you!", 'Actually, could you tell me how long the journey will take?', 'Can you tell me when the next stop is? And could I get some help with my luggage?', "Yes, I'm trying to get to Brighton. What time do we arrive?"]
Sample slot_types: [['train-leaveat'], ['train-leaveat'], ['train-leaveat'], ['train-departure'], ['train-destination']]


In [3]:
unique_slot_types = set()
for slots_list in slot_types_test:
    unique_slot_types.update(slots_list)

print("All unique slot types:", unique_slot_types)

All unique slot types: {'hotel-name', 'train-destination', 'hotel-internet', 'train-arriveby', 'train-day', 'hotel-area', 'hotel-type', 'hotel-bookpeople', 'train-departure', 'hotel-pricerange', 'hotel-parking', 'hotel-bookstay', 'train-leaveat', 'hotel-bookday'}


In [4]:
import json

# Load the dataset
with open('normalised_intent_train_slotfixed_set.json', 'r') as file:
    data = json.load(file)

utterances_train = []
slot_types_train = []

for item in data:
    for scenario in item['scenarios']:
        for turn in scenario['turns']:
            if turn["speaker"] == "USER":
                current_slots = set()
                for frame in turn.get("frames", []):
                    # Filter slot values to include only those in the desired classes
                    current_slots.update(slot for slot in frame.get("state", {}).get("slot_values", {}).keys() if slot in desired_classes)
                if current_slots:  # only if slot_values are present
                    utterances_train.append(turn["utterance"])
                    slot_types_train.append(list(current_slots))

print(f"Total utterances: {len(utterances_train)}")
print(f"Sample utterances: {utterances_train[:5]}")
print(f"Sample slot_types: {slot_types_train[:5]}")


Total utterances: 1209
Sample utterances: ["I'm looking to travel to Chicago on the 5th of next month.", "I'll take the 1:00 PM train in Economy class, please.", 'Yes, I would like to proceed with the payment.', 'Can you tell me when the next stop is and where I can find the nearest restroom?', 'Can you tell me how to get to the nearest station and help me with the transfer?']
Sample slot_types: [['train-day', 'train-destination'], ['train-day', 'train-leaveat'], ['train-day', 'train-leaveat'], ['train-departure'], ['train-departure']]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import classification_report,jaccard_score,hamming_loss


In [7]:
mlb = MultiLabelBinarizer()
slot_types_train_bin = mlb.fit_transform(slot_types_train)
slot_types_test_bin = mlb.transform(slot_types_test)
slot_types_validate_bin = mlb.transform(slot_types_validate)
f1_scorer = make_scorer(f1_score, average='micro')


In [8]:
from sklearn.model_selection import ParameterGrid
import numpy as np


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=42)))
])
parameters = ParameterGrid({
    # TF-IDF Vectorizer
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],  
    'tfidf__max_df': [0.5, 0.75, 1.0],  
    'tfidf__min_df': [1, 2, 3, 5],  
    'tfidf__use_idf': [True, False],  
    'tfidf__norm': ['l1', 'l2', None],  
    'tfidf__sublinear_tf': [True, False], 
    'tfidf__max_features': [None, 5000, 10000, 20000],

    # SVM Classifier
    'clf__estimator__C': [0.01, 0.1, 1, 10, 100],  
    'clf__estimator__kernel': ['linear', 'rbf', 'poly'],  
    'clf__estimator__degree': [2, 3, 4],  
    'clf__estimator__gamma': ['scale', 'auto', 0.1, 1, 10], 
    'clf__estimator__class_weight': [None, 'balanced'],  
})

best_score = 0
best_params = {}


In [9]:
N = 10  
for epoch in range(1, N+1): 
    print(f"Epoch {epoch}/{N}")
    
    # subset of parameters to try in epoch
    sampled_parameters = np.random.choice(list(parameters), replace=False, size=2) 
    
    for params in sampled_parameters:
       
        pipeline.set_params(**params)
        
        pipeline.fit(utterances_train, slot_types_train_bin)
        
        current_score = f1_score(slot_types_validate_bin, pipeline.predict(utterances_validate), average='micro')
        
        # Update best score and parameters if current model is better
        if current_score > best_score:
            best_score = current_score
            best_params = params
            print(f"New best score: {best_score:.4f} with params: {params}")


Epoch 1/10
New best score: 0.3075 with params: {'clf__estimator__C': 0.01, 'clf__estimator__class_weight': 'balanced', 'clf__estimator__degree': 4, 'clf__estimator__gamma': 0.1, 'clf__estimator__kernel': 'rbf', 'tfidf__max_df': 1.0, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Epoch 2/10
Epoch 3/10
Epoch 4/10
New best score: 0.3948 with params: {'clf__estimator__C': 0.1, 'clf__estimator__class_weight': 'balanced', 'clf__estimator__degree': 4, 'clf__estimator__gamma': 'auto', 'clf__estimator__kernel': 'poly', 'tfidf__max_df': 1.0, 'tfidf__max_features': None, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 3), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:

pipeline.set_params(**best_params)


In [11]:
from sklearn.utils import shuffle

# Combining training and validation sets
X_combined = utterances_train + utterances_validate
y_combined_bin = np.concatenate((slot_types_train_bin, slot_types_validate_bin), axis=0)

# Ensuring data is properly shuffled
X_combined, y_combined_bin = shuffle(X_combined, y_combined_bin, random_state=42)

# Transform combined data
X_combined_tfidf = pipeline.named_steps['tfidf'].fit_transform(X_combined)

# Retrain
pipeline.named_steps['clf'].fit(X_combined_tfidf, y_combined_bin)

In [12]:
# Predict on the test set
y_pred_test_bin = pipeline.predict(utterances_test)

y_pred_test = mlb.inverse_transform(y_pred_test_bin)
slot_types_test_actual = mlb.inverse_transform(slot_types_test_bin)

# Evaluate performance
print("Test Set Results:")
print(classification_report(slot_types_test_bin, y_pred_test_bin, target_names=mlb.classes_))

hamming_loss_value = hamming_loss(slot_types_test_bin, y_pred_test_bin)
print("Hamming Loss:", hamming_loss_value)

jaccard = jaccard_score(slot_types_test_bin, y_pred_test_bin, average='samples')  # For multilabel classification
print("Jaccard Score:", jaccard)

Test Set Results:
                   precision    recall  f1-score   support

       hotel-area       0.08      1.00      0.15        21
    hotel-bookday       0.05      1.00      0.09        12
 hotel-bookpeople       0.00      0.00      0.00        42
   hotel-bookstay       0.00      0.00      0.00       102
   hotel-internet       0.01      1.00      0.02         3
       hotel-name       0.40      1.00      0.57       103
    hotel-parking       0.00      0.00      0.00         9
 hotel-pricerange       0.00      0.00      0.00        49
       hotel-type       0.20      1.00      0.33        51
   train-arriveby       0.03      1.00      0.07         9
 train-bookpeople       0.00      0.00      0.00         0
        train-day       0.02      1.00      0.03         4
  train-departure       0.00      0.00      0.00         8
train-destination       0.00      0.00      0.00         7
    train-leaveat       0.06      1.00      0.11        15

        micro avg       0.11      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import joblib

# Save the model as a checkpoint
model_filename = 'saved_models_synth\\finalized_model_slot_values_SVM.joblib'
joblib.dump(pipeline, model_filename)

# Save the MultiLabelBinarizer
mlb_filename = 'saved_models_synth\\mlb_slot_values_SVM.joblib'
joblib.dump(mlb, mlb_filename)

print(f"Model saved as {model_filename}")
print(f"Label binarizer saved as {mlb_filename}")


Model saved as saved_models_synth\finalized_model_slot_values_SVM.joblib
Label binarizer saved as saved_models_synth\mlb_slot_values_SVM.joblib


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, make_scorer

mlb = MultiLabelBinarizer()
slot_types_train_bin = mlb.fit_transform(slot_types_train)
slot_types_test_bin = mlb.transform(slot_types_test)
slot_types_validate_bin = mlb.transform(slot_types_validate)

# Define the pipeline
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB()))
])

# Define hyperparameters to tune
parameters = ParameterGrid({
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3, 5],
    'tfidf__use_idf': [True, False],
    'tfidf__norm': ['l1', 'l2', None],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__max_features': [None, 5000, 10000, 20000],
    
    'clf__estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
})

f1_scorer = make_scorer(f1_score, average='micro')


In [None]:
best_score = 0
best_params = None


for epoch in range(1, N+1): 
    print(f"Epoch {epoch}/{N}")
    
    # subset of parameters to try in epoch
    sampled_parameters = np.random.choice(list(parameters), replace=False, size=2) 
    
    for params in sampled_parameters:

        pipeline_nb.set_params(**params)
        

        pipeline_nb.fit(utterances_train, slot_types_train_bin)
        

        current_score = f1_score(slot_types_validate_bin, pipeline_nb.predict(utterances_validate), average='micro')
        
        # Update best score and parameters if current model is better
        if current_score > best_score:
            best_score = current_score
            best_params = params
            print(f"New best score: {best_score:.4f} with params: {params}")


Epoch 1/10
New best score: 0.3488 with params: {'clf__estimator__alpha': 100.0, 'tfidf__max_df': 0.75, 'tfidf__max_features': 20000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3), 'tfidf__norm': None, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
New best score: 0.6346 with params: {'clf__estimator__alpha': 0.1, 'tfidf__max_df': 0.75, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': None, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Epoch 2/10
New best score: 0.6503 with params: {'clf__estimator__alpha': 0.001, 'tfidf__max_df': 1.0, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': None, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
New best score: 0.6832 with params: {'clf__estimator__alpha': 0.01, 'tfidf__max_df': 0.5, 'tfidf__max_features': 10000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 3), 'tfidf__n

In [None]:
# Configure pipeline with the best parameters
pipeline_nb.set_params(**best_params)

# Combining training and validation sets
X_combined = utterances_train + utterances_validate
y_combined_bin = np.concatenate((slot_types_train_bin, slot_types_validate_bin), axis=0)

# Ensuring data is properly shuffled
X_combined, y_combined_bin = shuffle(X_combined, y_combined_bin, random_state=42)

# Transform combined data
X_combined_tfidf = pipeline_nb.named_steps['tfidf'].fit_transform(X_combined)

# Retrain
pipeline_nb.named_steps['clf'].fit(X_combined_tfidf, y_combined_bin)




In [None]:
# Predict on the test set
y_pred_test_bin = pipeline_nb.predict(utterances_test)

y_pred_test = mlb.inverse_transform(y_pred_test_bin)
slot_types_test_actual = mlb.inverse_transform(slot_types_test_bin)

# Evaluate performance
print("Final Test Set Results:")
print(classification_report(slot_types_test_bin, y_pred_test_bin, target_names=mlb.classes_))

hamming_loss_value = hamming_loss(slot_types_test_bin, y_pred_test_bin)
print("Hamming Loss:", hamming_loss_value)

jaccard = jaccard_score(slot_types_test_bin, y_pred_test_bin, average='samples')  # For multilabel classification
print("Jaccard Score:", jaccard)


Final Test Set Results:
                   precision    recall  f1-score   support

       hotel-area       0.37      1.00      0.54        21
    hotel-bookday       0.39      1.00      0.56        12
 hotel-bookpeople       0.33      0.57      0.42        42
   hotel-bookstay       0.62      0.82      0.71       102
   hotel-internet       0.06      1.00      0.12         3
       hotel-name       0.69      0.71      0.70       103
    hotel-parking       0.23      0.89      0.36         9
 hotel-pricerange       0.47      0.88      0.61        49
       hotel-type       0.36      0.53      0.43        51
   train-arriveby       0.56      0.62      0.59         8
 train-bookpeople       0.00      0.00      0.00         0
        train-day       0.43      0.75      0.55         4
  train-departure       0.33      0.75      0.46         8
train-destination       0.22      0.50      0.31         4
    train-leaveat       0.52      0.80      0.63        15

        micro avg       0.44  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the model as a checkpoint
model_filename = 'saved_models_synth\\finalized_model_slot_values_NB.joblib'
joblib.dump(pipeline, model_filename)

# Save the MultiLabelBinarizer
mlb_filename = 'saved_models_synth\\mlb_slot_values_NB.joblib'
joblib.dump(mlb, mlb_filename)

print(f"Model saved as {model_filename}")
print(f"Label binarizer saved as {mlb_filename}")

Model saved as saved_models_synth\finalized_model_slot_values_NB.joblib
Label binarizer saved as saved_models_synth\mlb_slot_values_NB.joblib
