In [33]:
import json
import numpy as np
import random
import string
import nltk
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, hinge_loss
from sklearn.model_selection import KFold, cross_val_score

import warnings
warnings.filterwarnings('ignore')

# Load dataset.
# Decode explicitly as UTF-8 (the standard JSON encoding): without `encoding`,
# open() falls back to the platform locale, which can garble non-ASCII
# characters in the intents file on some systems.
with open('Combined_training.json', encoding='utf-8') as file:
    data = json.load(file)

# Single WordNet lemmatizer instance reused by every preprocess() call
lemmatizer = WordNetLemmatizer()

# Text normalisation applied to training patterns and to user input
def preprocess(sentence):
    """Normalise a sentence into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize`` and pass each
    token through ``lemmatizer.lemmatize`` with its default arguments.

    Args:
        sentence: Raw text to normalise.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = sentence.lower()
    # Translation table that maps each punctuation character to "delete".
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(without_punct))
    return ' '.join(lemmas)

# Extract patterns and tags: one preprocessed pattern per row, with the tag of
# the intent it belongs to at the same position in `tags`.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels.
# De-duplicate in order of first appearance instead of via set(): the iteration
# order of a set of strings changes between interpreter runs (hash
# randomisation), which would give every kernel session a different
# tag -> id mapping.
unique_tags = list(dict.fromkeys(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = dict(enumerate(unique_tags))
y = np.array([tag_to_num[tag] for tag in tags])


In [34]:
from scipy.sparse import hstack

# NOTE(review): both vectorizers are fitted on every pattern, including the rows
# that end up in the validation split below, so validation and cross-validation
# scores computed from these features are optimistic.

# Vectorize patterns using TF-IDF (uni- and bi-grams, English stop words removed)
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(patterns)

# Vectorize patterns using Bag-of-Words
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(patterns)

# Combine TF-IDF and CountVectorizer features column-wise.
# Request CSR explicitly: without `format`, SciPy chooses the result format
# itself (documented as subject to change), and CSR is the row-indexable layout
# that splitting and cross-validation slice by row.
X_combined = hstack([X_tfidf, X_count], format='csr')

# Split the dataset into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [35]:
# Train the model on the training set and measure training time.
# perf_counter() is the clock meant for measuring short intervals; time.time()
# follows the wall clock and can be adjusted while the cell runs.
start_time = time.perf_counter()
svm_model_combined = make_pipeline(
    # with_mean=False: the features are sparse, and centering sparse input is
    # not supported by StandardScaler.
    StandardScaler(with_mean=False),
    # random_state seeds the internal shuffling SVC uses to fit its probability
    # estimates, so predict_proba() is repeatable across runs.
    SVC(kernel='linear', probability=True, random_state=42),
)
svm_model_combined.fit(X_train, y_train)
training_time_combined = time.perf_counter() - start_time

# Predict on the validation set
y_val_pred_combined = svm_model_combined.predict(X_val)

# Calculate validation accuracy
val_accuracy_combined = accuracy_score(y_val, y_val_pred_combined)

# Calculate hinge loss on training and validation sets
label_ids = list(tag_to_num.values())
y_train_decision_combined = svm_model_combined.decision_function(X_train)
y_val_decision_combined = svm_model_combined.decision_function(X_val)
train_hinge_loss_combined = hinge_loss(y_train, y_train_decision_combined, labels=label_ids)
val_hinge_loss_combined = hinge_loss(y_val, y_val_decision_combined, labels=label_ids)

# Perform K-Fold Cross-Validation (the pipeline is cloned and refitted per fold)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores_combined = cross_val_score(svm_model_combined, X_combined, y, cv=kf, scoring='accuracy')

# Print combined model metrics
print("\nCombined Model Metrics:")
print(f'Training Time: {training_time_combined:.2f} seconds')
print(f'Validation Accuracy: {val_accuracy_combined:.4f}')
print(f'Train Hinge Loss: {train_hinge_loss_combined:.4f}')
print(f'Validation Hinge Loss: {val_hinge_loss_combined:.4f}')
print(f'Cross-Validation Scores: {cross_val_scores_combined}')
print(f'Mean Cross-Validation Score: {np.mean(cross_val_scores_combined):.4f}')



Combined Model Metrics:
Training Time: 0.02 seconds
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [36]:
def get_response_combined(user_input):
    """Return a randomly chosen response for the intent predicted from ``user_input``.

    The text is normalised with ``preprocess``, vectorised with the fitted
    TF-IDF and count vectorizers, and classified by ``svm_model_combined``.

    Args:
        user_input: Raw message typed by the user.

    Returns:
        One entry of the matching intent's ``responses`` list, or ``None`` when
        no intent in ``data['intents']`` carries the predicted tag.
    """
    cleaned = preprocess(user_input)
    X_user_combined = hstack([
        tfidf_vectorizer.transform([cleaned]),
        count_vectorizer.transform([cleaned]),
    ])

    # Use predict() rather than argmax(predict_proba()): probability columns are
    # ordered by the pipeline's classes_ (the labels seen during fit), so a
    # column position is not necessarily a label id, and scikit-learn documents
    # that SVC probability estimates can be inconsistent with predict(). This
    # also keeps the chatbot on the same decision rule the accuracy metrics used.
    label = svm_model_combined.predict(X_user_combined)[0]
    tag = num_to_tag[int(label)]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    return None

print("Chatbot using Combined Model is ready! Type 'quit' to exit.")

# Interactive loop: read a message, answer it, stop on 'quit'.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat without a traceback.
        break
    # Surrounding whitespace was stripped above, so ' quit ' also exits.
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing typed: prompt again instead of classifying an empty message.
        continue
    response = get_response_combined(user_input)
    print("Bot:", response)


Chatbot using Combined Model is ready! Type 'quit' to exit.


You:  hi


Bot: Good day! What do you need help with?


You:  what is the use of ai tools


Bot: Good day! What do you need help with?


You:  quit


In [1]:
import json
import numpy as np
import random
import string
import nltk
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import hinge_loss, accuracy_score
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack

import warnings
warnings.filterwarnings('ignore')

# Load dataset.
# Decode explicitly as UTF-8 (the standard JSON encoding): without `encoding`,
# open() falls back to the platform locale, which can garble non-ASCII
# characters in the intents file on some systems.
with open('Combined_training.json', encoding='utf-8') as file:
    data = json.load(file)


In [2]:
# Single WordNet lemmatizer instance reused by every preprocess() call
lemmatizer = WordNetLemmatizer()

# Text normalisation applied to training patterns and to user input
def preprocess(sentence):
    """Normalise a sentence into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize`` and pass each
    token through ``lemmatizer.lemmatize`` with its default arguments.

    Args:
        sentence: Raw text to normalise.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = sentence.lower()
    # Translation table that maps each punctuation character to "delete".
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(without_punct))
    return ' '.join(lemmas)

# Extract patterns and tags: one preprocessed pattern per row, with the tag of
# the intent it belongs to at the same position in `tags`.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels.
# De-duplicate in order of first appearance instead of via set(): the iteration
# order of a set of strings changes between interpreter runs (hash
# randomisation), which would give every kernel session a different
# tag -> id mapping.
unique_tags = list(dict.fromkeys(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = dict(enumerate(unique_tags))
y = np.array([tag_to_num[tag] for tag in tags])


In [3]:
# NOTE(review): both vectorizers are fitted on every pattern, including the rows
# that later land in the validation split, so validation and cross-validation
# scores computed from these features are optimistic.

# Vectorize patterns using TF-IDF (uni- and bi-grams, English stop words removed)
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(patterns)

# Vectorize patterns using Bag-of-Words
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(patterns)

# Combine TF-IDF and CountVectorizer features column-wise.
# Request CSR explicitly: without `format`, SciPy chooses the result format
# itself (documented as subject to change), and CSR is the row-indexable layout
# that splitting and cross-validation slice by row.
X_combined = hstack([X_tfidf, X_count], format='csr')


In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Define the SVM model: scale the sparse features, then classify with an SVC.
svm_model = make_pipeline(
    # with_mean=False: the features are sparse, and centering sparse input is
    # not supported by StandardScaler.
    StandardScaler(with_mean=False),
    # random_state seeds the internal shuffling SVC uses to fit its probability
    # estimates, matching the fixed seed (42) used for the split and the searches.
    SVC(probability=True, random_state=42),
)


In [6]:
# Define the parameter grid for GridSearchCV.
# NOTE(review): `degree` only affects the 'poly' kernel and `gamma` is not used
# by 'linear', so many of the 5*4*4*6 = 480 combinations fit identical models.
# A list-of-dicts grid would avoid that but changes the keys in best_params_.
param_grid = {
    'svc__C': [0.01, 0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params_grid = grid_search.best_params_
print(f"Best parameters (Grid Search): {best_params_grid}")

# With the default refit=True, the search has already retrained a clone of
# svm_model with best_params_ on all of (X_train, y_train). Reuse that fitted
# pipeline instead of copying the parameters by hand and training again.
svm_model_grid = grid_search.best_estimator_

# Evaluate the Grid Search best model on the held-out validation split
y_val_pred_grid = svm_model_grid.predict(X_val)
val_accuracy_grid = accuracy_score(y_val, y_val_pred_grid)

print(f"Validation Accuracy (Grid Search): {val_accuracy_grid:.4f}")


Best parameters (Grid Search): {'svc__C': 0.01, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Validation Accuracy (Grid Search): 1.0000


In [7]:
# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'svc__C': np.logspace(-2, 2, 10),  # 10 log-spaced values between 0.01 and 100
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],  # Degrees for polynomial kernel
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]  # Various gamma values
}

# Perform Random Search with Cross-Validation (100 sampled settings, fixed seed)
random_search = RandomizedSearchCV(svm_model, param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters from Random Search
best_params_random = random_search.best_params_
print(f"Best parameters (Random Search): {best_params_random}")

# With the default refit=True, the search has already retrained a clone of
# svm_model with best_params_ on all of (X_train, y_train). Reuse that fitted
# pipeline instead of copying the parameters by hand and training again.
svm_model_random = random_search.best_estimator_

# Evaluate the Random Search best model on the held-out validation split
y_val_pred_random = svm_model_random.predict(X_val)
val_accuracy_random = accuracy_score(y_val, y_val_pred_random)

print(f"Validation Accuracy (Random Search): {val_accuracy_random:.4f}")



Best parameters (Random Search): {'svc__kernel': 'linear', 'svc__gamma': 10, 'svc__degree': 4, 'svc__C': 35.93813663804626}
Validation Accuracy (Random Search): 1.0000


In [8]:
# Pick whichever tuned pipeline scored higher on the validation split.
# The comparison is strict, so a tie goes to the Random Search model.
grid_wins = val_accuracy_grid > val_accuracy_random
best_model = svm_model_grid if grid_wins else svm_model_random
best_params = best_params_grid if grid_wins else best_params_random
print(
    "Best model selected from Grid Search."
    if grid_wins
    else "Best model selected from Random Search."
)


Best model selected from Random Search.


In [9]:
# Calculate hinge loss on training and validation sets
label_ids = list(tag_to_num.values())
y_train_decision = best_model.decision_function(X_train)
y_val_decision = best_model.decision_function(X_val)
train_hinge_loss = hinge_loss(y_train, y_train_decision, labels=label_ids)
val_hinge_loss = hinge_loss(y_val, y_val_decision, labels=label_ids)

# Perform K-Fold Cross-Validation (best_model is cloned and refitted per fold).
# NOTE(review): X_combined was vectorised on the full corpus and contains the
# validation rows, so these folds are not independent of the hold-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_model, X_combined, y, cv=kf, scoring='accuracy')

# Score the selected model directly rather than picking one of the cached
# prediction arrays via an `==` comparison of estimator objects, so the
# reported accuracy always belongs to best_model.
best_val_accuracy = accuracy_score(y_val, best_model.predict(X_val))

# Print metrics
print("\nAverage Metrics:")
print(f'Validation Accuracy: {best_val_accuracy:.4f}')
print(f'Train Hinge Loss: {train_hinge_loss:.4f}')
print(f'Validation Hinge Loss: {val_hinge_loss:.4f}')
print(f'Cross-Validation Scores: {cross_val_scores}')
print(f'Mean Cross-Validation Score: {np.mean(cross_val_scores):.4f}')



Average Metrics:
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [None]:
def get_response(user_input):
    """Return a randomly chosen response for the intent predicted from ``user_input``.

    The text is normalised with ``preprocess``, vectorised with the fitted
    TF-IDF and count vectorizers, and classified by ``best_model``.

    Args:
        user_input: Raw message typed by the user.

    Returns:
        One entry of the matching intent's ``responses`` list, or ``None`` when
        no intent in ``data['intents']`` carries the predicted tag.
    """
    cleaned = preprocess(user_input)
    X_user_combined = hstack([
        tfidf_vectorizer.transform([cleaned]),
        count_vectorizer.transform([cleaned]),
    ])

    # Use predict() rather than argmax(predict_proba()): probability columns are
    # ordered by the pipeline's classes_ (the labels seen during fit), so a
    # column position is not necessarily a label id, and scikit-learn documents
    # that SVC probability estimates can be inconsistent with predict(). This
    # also keeps the chatbot on the same decision rule the accuracy metrics used.
    label = best_model.predict(X_user_combined)[0]
    tag = num_to_tag[int(label)]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    return None

print("Chatbot using Combined Model is ready! Type 'quit' to exit.")

# Interactive loop: read a message, answer it, stop on 'quit'.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat without a traceback.
        break
    # Surrounding whitespace was stripped above, so ' quit ' also exits.
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing typed: prompt again instead of classifying an empty message.
        continue
    response = get_response(user_input)
    print("Bot:", response)


Chatbot using Combined Model is ready! Type 'quit' to exit.


You:  hi


Bot: Good day! What do you need help with?


You:  mai i know more on external examiners 


Bot: Hey! How can I help?


You:  may i know more on external examiners


Bot: Good day! What do you need help with?


You:  what is External Examiners


Bot: The role of the External Examiner In the UK Higher Education system, Universities are responsible for the academic standards of the awards that they offer, and for the quality of the education they provide to enable students to meet those standards. The role of the external examiner is an essential part of the University’s quality assurance processes. They essentially externally ‘audit’ the programmes that they are appointed to, in terms of the attainment of academic standards and the quality of the education. The University of Hertfordshire appoints external examiners mainly from other Universities, but also from industry and/or the professions. They are qualified and experienced in the subject, have an understanding of the academic standards required for the award and are also independent of the University of Hertfordshire. As such, they are able to provide carefully considered advice on the academic standards of the programmes and/or modules to which they have been appointed, a

You:  what is mental healthj


Bot: Hi there! What can I help you with?


You:  what is mental health


Bot: Good day! What do you need help with?


You:  Tell me about wellbeing and mental health support


Bot: If your studies are affecting your mental health and wellbeing then pleasecontact our Student Wellbeing team [https://ask.herts.ac.uk/student-wellbeing]for support. You can alsocall our free 24/7 wellbeing helpline [https://ask.herts.ac.uk/student-wellbeing-24-hour-helpline]ran by Health Assured whenever you need to. You can also talk to staff in your School if you’re struggling.


You:  What is Artificial intelligence AI tools?


Bot: Information about the University's policy on the use of artificial intelligence tools. Artificial intelligence tools (AI) have had lots of coverage in the news recently including how they can be used in workplaces and universities. You may have heard of tools such as Chat GPT, DALLE-2, Co-Pilot, and Google Bard although there are many more available for different purposes. When it comes to your course, inappropriate use of these tools can negatively impact your learning as well as affecting your own confidence in your qualification and ability. While such tools may seem like time-savers, their potential and limitations are still not fully explored. So far, we know that some materials/information may be out of date or incorrect, and some of the information may be fictitious or contain false references and quotes. We're also aware that since AI models are trained on the data that they are exposed to, this can result in biases. So, responses or information you pull out of such tools 

You:  Explain When could I use an AI tool?


Bot: The only occasions where you may use AI tools in your assessment is if you have explicit permission from your tutor in your assessment brief. Your assessment brief will include information on how to declare any use of such tools, and you can speak to your tutor for guidance. If you do not reference your use, then this will constitute academic misconduct. Our current University policy on academic misconduct adequately covers the misuse of such tools, but we are updating them to be clearer on the matter. Unauthorised use of artificially generated material (AI) in researching or presenting material for an assessment is an academic misconduct offence if you use AI tools in producing your assessment unless the use of AI tools is expressly permitted. However, even if expressly permitted, where you do not declare that you have used an artificial intelligence tool(s) in the production of your assessment, or you are dishonest about the extent to which such tools have been used, you will ha