In [15]:
import json
import random
import string
import time

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, hinge_loss
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn warnings (e.g. convergence or
# deprecation notices); consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load dataset
# The encoding is given explicitly so that non-ASCII characters in the intents
# file are decoded the same way on every platform instead of depending on the
# locale default. Assumes the file is UTF-8, the standard JSON encoding.
with open('Combined_training.json', encoding='utf-8') as file:
    data = json.load(file)

# Initialize lemmatizer
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus to
# be available locally — confirm it is installed in a fresh environment.
lemmatizer = WordNetLemmatizer()

# Preprocess function
def preprocess(sentence):
    """Normalise a sentence before it is vectorised.

    The text is lower-cased, every character listed in ``string.punctuation``
    is removed, the remainder is tokenised with ``nltk.word_tokenize`` and each
    token is passed through the module-level ``lemmatizer``.

    Args:
        sentence: Raw input string.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # A translation table that maps every punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(strip_punctuation)
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Extract patterns and tags
# Every training pattern is preprocessed and stored next to the tag of the
# intent it came from, so patterns[i] and tags[i] describe the same example.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels
# The tags are sorted so the tag -> id mapping is identical on every run.
# Iterating a plain set of strings yields an order that can change between
# interpreter sessions, which would make label ids non-reproducible.
# Assumes all tags are mutually comparable (e.g. all strings).
unique_tags = sorted(set(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = {num: tag for tag, num in tag_to_num.items()}
y = np.array([tag_to_num[tag] for tag in tags])


In [16]:
# Vectorize patterns using Bag-of-Words
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(patterns)

# Split the dataset into training and validation sets
X_train_count, X_val_count, y_train_count, y_val_count = train_test_split(X_count, y, test_size=0.2, random_state=42)

# Train the model on the training set and measure training time
# perf_counter() is used because it is the clock intended for measuring short
# elapsed intervals.
start_time = time.perf_counter()
svm_model_count = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear', probability=True))
svm_model_count.fit(X_train_count, y_train_count)
training_time_count = time.perf_counter() - start_time

# Predict on the validation set
y_val_pred_count = svm_model_count.predict(X_val_count)

# Calculate validation accuracy
val_accuracy_count = accuracy_score(y_val_count, y_val_pred_count)

# Calculate hinge loss on training and validation sets
y_train_decision_count = svm_model_count.decision_function(X_train_count)
y_val_decision_count = svm_model_count.decision_function(X_val_count)
train_hinge_loss_count = hinge_loss(y_train_count, y_train_decision_count, labels=list(tag_to_num.values()))
val_hinge_loss_count = hinge_loss(y_val_count, y_val_decision_count, labels=list(tag_to_num.values()))

# Perform K-Fold Cross-Validation
# The splitter is defined here so the cell runs on a fresh kernel; previously
# `kf` was only available if a later cell had already been executed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores_count = cross_val_score(svm_model_count, X_count, y, cv=kf, scoring='accuracy')

# Print CountVectorizer model metrics
print("\nCountVectorizer Model Metrics:")
print(f'Training Time: {training_time_count:.2f} seconds')
print(f'Validation Accuracy: {val_accuracy_count:.4f}')
print(f'Train Hinge Loss: {train_hinge_loss_count:.4f}')
print(f'Validation Hinge Loss: {val_hinge_loss_count:.4f}')
print(f'Cross-Validation Scores: {cross_val_scores_count}')
print(f'Mean Cross-Validation Score: {np.mean(cross_val_scores_count):.4f}')



CountVectorizer Model Metrics:
Training Time: 0.01 seconds
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [17]:
def get_response_count(user_input):
    """Pick a canned response for the intent predicted from ``user_input``.

    The text is preprocessed, vectorised with ``count_vectorizer`` and scored
    with ``svm_model_count.predict_proba``; the most probable class is mapped
    back to its tag and one of that intent's responses is chosen at random.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        A response string from the matching intent, or ``None`` when no intent
        in ``data['intents']`` carries the predicted tag.
    """
    user_input = preprocess(user_input)
    X_user = count_vectorizer.transform([user_input])
    tag_prob = svm_model_count.predict_proba(X_user)[0]
    # predict_proba columns are ordered like classes_, not like the label ids
    # themselves. Translating the winning column through classes_ keeps the
    # lookup correct even when the training split did not contain every label.
    tag_index = svm_model_count.classes_[np.argmax(tag_prob)]
    tag = num_to_tag[tag_index]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    # No intent matched the predicted tag.
    return None

print("Chatbot using CountVectorizer is ready! Type 'quit' to exit.")

# Interactive loop: read a line, answer it, and stop on the 'quit' command.
while True:
    # Surrounding whitespace is dropped so that inputs such as "quit " still
    # end the session.
    user_input = input("You: ").strip()
    if not user_input:
        # A blank line carries no tokens to classify; ask again instead of
        # answering with an arbitrary intent.
        continue
    if user_input.lower() == 'quit':
        break
    response = get_response_count(user_input)
    print("Bot:", response)


Chatbot using CountVectorizer is ready! Type 'quit' to exit.


You:  hi


Bot: Hi there! What can I help you with?


You:  what is external examiners


Bot: Hello! How can I assist you today?


You:  quit


In [2]:
import json
import numpy as np
import random
import string
import nltk
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import hinge_loss, accuracy_score
from sklearn.model_selection import cross_val_score, KFold

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn warnings (e.g. convergence or
# deprecation notices); consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load dataset
# The encoding is given explicitly so that non-ASCII characters in the intents
# file are decoded the same way on every platform instead of depending on the
# locale default. Assumes the file is UTF-8, the standard JSON encoding.
with open('Combined_training.json', encoding='utf-8') as file:
    data = json.load(file)

# Initialize lemmatizer
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus to
# be available locally — confirm it is installed in a fresh environment.
lemmatizer = WordNetLemmatizer()

# Preprocess function
def preprocess(sentence):
    """Normalise a sentence before it is vectorised.

    The text is lower-cased, every character listed in ``string.punctuation``
    is removed, the remainder is tokenised with ``nltk.word_tokenize`` and each
    token is passed through the module-level ``lemmatizer``.

    Args:
        sentence: Raw input string.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # A translation table that maps every punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(strip_punctuation)
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Extract patterns and tags
# Every training pattern is preprocessed and stored next to the tag of the
# intent it came from, so patterns[i] and tags[i] describe the same example.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels
# The tags are sorted so the tag -> id mapping is identical on every run.
# Iterating a plain set of strings yields an order that can change between
# interpreter sessions, which would make label ids non-reproducible.
# Assumes all tags are mutually comparable (e.g. all strings).
unique_tags = sorted(set(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = {num: tag for tag, num in tag_to_num.items()}
y = np.array([tag_to_num[tag] for tag in tags])

# Vectorize patterns using Bag-of-Words
# NOTE(review): the vocabulary is learned from all patterns, including those
# that end up in the validation split below.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(patterns)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Define the SVM model
# The scaler is created with with_mean=False, i.e. it rescales features
# without centering them.
svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(probability=True))

# Define the parameter grid for GridSearchCV
# Keys use the '<step name>__<parameter>' form so they reach the SVC step of
# the pipeline.
param_grid = {
    'svc__C': [0.01, 0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params_grid = grid_search.best_params_
print(f"Best parameters (Grid Search): {best_params_grid}")

# Evaluate the Grid Search best model
# With the default refit=True the search has already fitted a copy of the
# pipeline with best_params_ on the whole of X_train, so that estimator is
# reused instead of building and fitting an identical pipeline a second time.
svm_model_grid = grid_search.best_estimator_

y_val_pred_grid = svm_model_grid.predict(X_val)
val_accuracy_grid = accuracy_score(y_val, y_val_pred_grid)

print(f"Validation Accuracy (Grid Search): {val_accuracy_grid:.4f}")


Best parameters (Grid Search): {'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Validation Accuracy (Grid Search): 1.0000


In [4]:
# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'svc__C': np.logspace(-2, 2, 10),  # 10 values between 0.01 and 100
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],  # Degrees for polynomial kernel
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]  # Various gamma values
}

# Perform Random Search with Cross-Validation
# n_iter=100 candidates are drawn from the lists above; random_state fixes
# which ones are sampled.
random_search = RandomizedSearchCV(svm_model, param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters from Random Search
best_params_random = random_search.best_params_
print(f"Best parameters (Random Search): {best_params_random}")

# Evaluate the Random Search best model
# With the default refit=True the search has already fitted a copy of the
# pipeline with best_params_ on the whole of X_train, so that estimator is
# reused instead of building and fitting an identical pipeline a second time.
svm_model_random = random_search.best_estimator_

y_val_pred_random = svm_model_random.predict(X_val)
val_accuracy_random = accuracy_score(y_val, y_val_pred_random)

print(f"Validation Accuracy (Random Search): {val_accuracy_random:.4f}")


Best parameters (Random Search): {'svc__kernel': 'linear', 'svc__gamma': 10, 'svc__degree': 4, 'svc__C': 35.93813663804626}
Validation Accuracy (Random Search): 1.0000


In [5]:


# Select the best model
# Grid Search is chosen only when its validation accuracy is strictly higher;
# a tie goes to the Random Search model.
if not (val_accuracy_grid > val_accuracy_random):
    best_model, best_params = svm_model_random, best_params_random
    print("Best model selected from Random Search.")
else:
    best_model, best_params = svm_model_grid, best_params_grid
    print("Best model selected from Grid Search.")

# Calculate hinge loss on training and validation sets
# The decision scores are computed first, then scored against the full list
# of label ids.
label_ids = list(tag_to_num.values())
y_train_decision = best_model.decision_function(X_train)
y_val_decision = best_model.decision_function(X_val)
train_hinge_loss = hinge_loss(y_train, y_train_decision, labels=label_ids)
val_hinge_loss = hinge_loss(y_val, y_val_decision, labels=label_ids)

# Perform K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_model, X, y, cv=kf, scoring='accuracy')

# Print metrics
# The validation predictions reported are the ones made by the selected model.
best_val_pred = y_val_pred_grid if best_model is svm_model_grid else y_val_pred_random
metric_lines = (
    "\nAverage Metrics:",
    f'Validation Accuracy: {accuracy_score(y_val, best_val_pred):.4f}',
    f'Train Hinge Loss: {train_hinge_loss:.4f}',
    f'Validation Hinge Loss: {val_hinge_loss:.4f}',
    f'Cross-Validation Scores: {cross_val_scores}',
    f'Mean Cross-Validation Score: {np.mean(cross_val_scores):.4f}',
)
for metric_line in metric_lines:
    print(metric_line)


Best model selected from Random Search.

Average Metrics:
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [6]:
def get_response(user_input):
    """Pick a canned response for the intent predicted from ``user_input``.

    The text is preprocessed, vectorised with ``vectorizer`` and scored with
    ``best_model.predict_proba``; the most probable class is mapped back to its
    tag and one of that intent's responses is chosen at random.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        A response string from the matching intent, or ``None`` when no intent
        in ``data['intents']`` carries the predicted tag.
    """
    user_input = preprocess(user_input)
    X_user = vectorizer.transform([user_input])
    tag_prob = best_model.predict_proba(X_user)[0]
    # predict_proba columns are ordered like classes_, not like the label ids
    # themselves. Translating the winning column through classes_ keeps the
    # lookup correct even when the training split did not contain every label.
    tag_index = best_model.classes_[np.argmax(tag_prob)]
    tag = num_to_tag[tag_index]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    # No intent matched the predicted tag.
    return None

print("Chatbot is ready! Type 'quit' to exit.")

# Interactive loop: read a line, answer it, and stop on the 'quit' command.
while True:
    # Surrounding whitespace is dropped so that inputs such as "quit " still
    # end the session.
    user_input = input("You: ").strip()
    if not user_input:
        # A blank line carries no tokens to classify; ask again instead of
        # answering with an arbitrary intent.
        continue
    if user_input.lower() == 'quit':
        break
    response = get_response(user_input)
    print("Bot:", response)


Chatbot is ready! Type 'quit' to exit.


You:  hi


Bot: Hey! How can I help?


You:  how are you 


Bot: Hello! How can I assist you today?


You:  tell me about places to study


Bot: At Herts, we have a variety of places that you can study from. We haveLearning Resources Centres (LRCs) [https://ask.herts.ac.uk/learning-resources-centres]on both campuses which are open 24/7, 365 days a year and have silent study areas, breakout rooms, group areas, computers and much more for you to use. If you’d like to study in one of our food outlets, visit the Art and Design Gallery Café and Café Rore on College Lane campus, and Café Ambition and Café Sport on de Havilland campus so you can work and get your caffeine fix. There are also other places across the University such as the Chapman Lounge and Hutton Hub on College Lane campus and the mezzanine and Grace Ononiwu Law Court building on de Havilland campus which have plenty of seating for you to use. View our TikTok of study spaces around campus. [https://www.tiktok.com/@uniofherts/video/7171372715234823430?_r=1&_t=8XlCqlMWi3R&dm_i=3CZ%2C84G9B%2C7FAFLN%2CX9JEK%2C1&is_from_webapp=v1&item_id=7171372715234823430]


You:  what is teh best place sto study


Bot: Hi there! What can I help you with?


You:  what is best places to study 


Bot: At Herts, we have a variety of places that you can study from. We haveLearning Resources Centres (LRCs) [https://ask.herts.ac.uk/learning-resources-centres]on both campuses which are open 24/7, 365 days a year and have silent study areas, breakout rooms, group areas, computers and much more for you to use. If you’d like to study in one of our food outlets, visit the Art and Design Gallery Café and Café Rore on College Lane campus, and Café Ambition and Café Sport on de Havilland campus so you can work and get your caffeine fix. There are also other places across the University such as the Chapman Lounge and Hutton Hub on College Lane campus and the mezzanine and Grace Ononiwu Law Court building on de Havilland campus which have plenty of seating for you to use. View our TikTok of study spaces around campus. [https://www.tiktok.com/@uniofherts/video/7171372715234823430?_r=1&_t=8XlCqlMWi3R&dm_i=3CZ%2C84G9B%2C7FAFLN%2CX9JEK%2C1&is_from_webapp=v1&item_id=7171372715234823430]


You:  what is ai tools


Bot: Good day! What do you need help with?


You:  what is the use of ai tools


Bot: Good day! What do you need help with?


You:  quit
