In [45]:
import json
import random
import string
import time
import warnings

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, hinge_loss
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

# Shared WordNet lemmatizer; preprocess() below calls it for every token.
lemmatizer = WordNetLemmatizer()

# Parse the intents definition (the code below reads its 'intents' list,
# where each entry carries 'tag', 'patterns' and 'responses').
with open('Combined_training.json') as intents_file:
    data = json.loads(intents_file.read())

# Preprocess function
def preprocess(sentence):
    """Normalise a sentence before it is vectorised.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the remainder is tokenised with
    ``nltk.word_tokenize`` and each token is passed through the
    module-level ``lemmatizer``.

    Args:
        sentence: Raw text to normalise.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # A translation table with only a "delete" set removes punctuation in one pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(drop_punctuation)
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

# Extract patterns and tags
# Each training example is one preprocessed pattern paired with the tag of the
# intent it belongs to; the two lists stay index-aligned.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels
# The distinct tags are sorted so the tag -> integer mapping is identical on
# every run. Iterating a bare set of strings follows hash order, which changes
# between interpreter sessions, so the label ids (and anything keyed on them)
# would not be reproducible after a kernel restart.
unique_tags = sorted(set(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = {num: tag for tag, num in tag_to_num.items()}
y = np.array([tag_to_num[tag] for tag in tags])


In [46]:
# Vectorize patterns using TF-IDF
# NOTE(review): the vectorizer is fitted on every pattern before the split
# below, so validation rows contribute to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(patterns)

# Split the dataset into training and validation sets
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train the model on the training set and measure training time
# with_mean=False keeps the sparse TF-IDF matrix sparse while scaling.
start_time = time.time()
svm_model_tfidf = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear', probability=True))
svm_model_tfidf.fit(X_train_tfidf, y_train_tfidf)
training_time_tfidf = time.time() - start_time

# Predict on the validation set
y_val_pred_tfidf = svm_model_tfidf.predict(X_val_tfidf)

# Calculate validation accuracy
val_accuracy_tfidf = accuracy_score(y_val_tfidf, y_val_pred_tfidf)

# Calculate hinge loss on training and validation sets
# The columns of decision_function follow the classes the model was fitted on,
# so those classes (not every known tag id) are what hinge_loss must be told
# about; otherwise a tag missing from the training split breaks the call.
y_train_decision_tfidf = svm_model_tfidf.decision_function(X_train_tfidf)
y_val_decision_tfidf = svm_model_tfidf.decision_function(X_val_tfidf)
train_hinge_loss_tfidf = hinge_loss(y_train_tfidf, y_train_decision_tfidf, labels=svm_model_tfidf.classes_)
val_hinge_loss_tfidf = hinge_loss(y_val_tfidf, y_val_decision_tfidf, labels=svm_model_tfidf.classes_)

# Perform K-Fold Cross-Validation
# cross_val_score clones the pipeline for each fold, so the fitted
# svm_model_tfidf above is left untouched.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores_tfidf = cross_val_score(svm_model_tfidf, X_tfidf, y, cv=kf, scoring='accuracy')

# Print TF-IDF model metrics
print("\nTF-IDF Model Metrics:")
print(f'Training Time: {training_time_tfidf:.2f} seconds')
print(f'Validation Accuracy: {val_accuracy_tfidf:.4f}')
print(f'Train Hinge Loss: {train_hinge_loss_tfidf:.4f}')
print(f'Validation Hinge Loss: {val_hinge_loss_tfidf:.4f}')
print(f'Cross-Validation Scores: {cross_val_scores_tfidf}')
print(f'Mean Cross-Validation Score: {np.mean(cross_val_scores_tfidf):.4f}')



TF-IDF Model Metrics:
Training Time: 0.01 seconds
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [47]:
def get_response_tfidf(user_input, fallback="Sorry, I didn't understand that. Could you rephrase?"):
    """Pick a reply for ``user_input`` using the TF-IDF + SVM intent model.

    The text is normalised with ``preprocess``, vectorised with
    ``tfidf_vectorizer`` and classified by ``svm_model_tfidf``; a random entry
    from the predicted intent's ``responses`` list is returned.

    Args:
        user_input: Raw text typed by the user.
        fallback: Reply returned when the input shares no term with the
            fitted vocabulary, or when the predicted tag has no matching
            intent in ``data``.

    Returns:
        A response string.
    """
    user_input = preprocess(user_input)
    X_user = tfidf_vectorizer.transform([user_input])

    # An all-zero row means none of the input's terms are in the vocabulary;
    # the classifier would still return some class, so answer with the
    # fallback instead of an unrelated intent.
    if X_user.nnz == 0:
        return fallback

    tag_prob = svm_model_tfidf.predict_proba(X_user)[0]
    # predict_proba columns are ordered like classes_, so the argmax is a
    # position in classes_ and must be mapped back to the label id.
    label = svm_model_tfidf.classes_[np.argmax(tag_prob)]
    tag = num_to_tag[int(label)]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    return fallback

print("Chatbot using TF-IDF is ready! Type 'quit' to exit.")

# Interactive loop: read a line, answer it, stop on 'quit' (case-insensitive).
while True:
    # Trim surrounding whitespace so inputs such as "quit " still end the session.
    user_input = input("You: ").strip()
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing to classify; prompt again rather than answering a blank line.
        continue
    response = get_response_tfidf(user_input)
    print("Bot:", response)


Chatbot using TF-IDF is ready! Type 'quit' to exit.


You:  hi


Bot: Good day! What do you need help with?


You:  hello


Bot: Hi there! What can I help you with?


You:  what is stance on ai tools


Bot: The University's academic integrity policy (UPR AS14 Appendix III) [https://www.herts.ac.uk/__data/assets/pdf_file/0007/237625/AS14-Apx3-Academic-Misconduct-v17.0.pdf]sets out our stance on plagiarism including fake referencing which can often be the case with AI tools. Therefore, it is crucial thatyou do not use AI toolsto generate an assessment and submit it as your own work; to do so will constitute academic misconduct.


You:  what is external examiners


Bot: The role of the External Examiner In the UK Higher Education system, Universities are responsible for the academic standards of the awards that they offer, and for the quality of the education they provide to enable students to meet those standards. The role of the external examiner is an essential part of the University’s quality assurance processes. They essentially externally ‘audit’ the programmes that they are appointed to, in terms of the attainment of academic standards and the quality of the education. The University of Hertfordshire appoints external examiners mainly from other Universities, but also from industry and/or the professions. They are qualified and experienced in the subject, have an understanding of the academic standards required for the award and are also independent of the University of Hertfordshire. As such, they are able to provide carefully considered advice on the academic standards of the programmes and/or modules to which they have been appointed, a

You:  quit


In [1]:
import json
import numpy as np
import random
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import hinge_loss, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import warnings

warnings.filterwarnings('ignore')

# Shared WordNet lemmatizer; preprocess() below calls it for every token.
lemmatizer = WordNetLemmatizer()

# Parse the intents definition (the code below reads its 'intents' list,
# where each entry carries 'tag', 'patterns' and 'responses').
with open('Combined_training.json') as intents_file:
    data = json.loads(intents_file.read())

# Preprocess function
def preprocess(sentence):
    """Normalise a sentence before it is vectorised.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the remainder is tokenised with
    ``nltk.word_tokenize`` and each token is passed through the
    module-level ``lemmatizer``.

    Args:
        sentence: Raw text to normalise.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # A translation table with only a "delete" set removes punctuation in one pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.lower().translate(drop_punctuation)
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

# Extract patterns and tags
# Each training example is one preprocessed pattern paired with the tag of the
# intent it belongs to; the two lists stay index-aligned.
patterns = []
tags = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(preprocess(pattern))
        tags.append(intent['tag'])

# Convert tags to numerical labels
# The distinct tags are sorted so the tag -> integer mapping is identical on
# every run. Iterating a bare set of strings follows hash order, which changes
# between interpreter sessions, so the label ids (and anything keyed on them)
# would not be reproducible after a kernel restart.
unique_tags = sorted(set(tags))
tag_to_num = {tag: num for num, tag in enumerate(unique_tags)}
num_to_tag = {num: tag for tag, num in tag_to_num.items()}
y = np.array([tag_to_num[tag] for tag in tags])

# Vectorize patterns
# NOTE(review): the vectorizer is fitted on every pattern before the split
# below, so validation rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(patterns)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:


# Define the SVM model
# with_mean=False keeps the sparse TF-IDF matrix sparse while scaling.
svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(probability=True))

# Define the parameter grid for GridSearchCV
# NOTE(review): 'svc__degree' only affects the 'poly' kernel and 'svc__gamma'
# is not used by 'linear', so many of these combinations fit equivalent models.
param_grid = {
    'svc__C': [0.01, 0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params_grid = grid_search.best_params_
print(f"Best parameters (Grid Search): {best_params_grid}")

# Evaluate the Grid Search best model
# With the default refit=True the search has already retrained a pipeline on
# all of X_train using the best parameters, so reuse it instead of rebuilding
# the pipeline by hand and fitting it a second time.
svm_model_grid = grid_search.best_estimator_

y_val_pred_grid = svm_model_grid.predict(X_val)
val_accuracy_grid = accuracy_score(y_val, y_val_pred_grid)

print(f"Validation Accuracy (Grid Search): {val_accuracy_grid:.4f}")


Best parameters (Grid Search): {'svc__C': 0.01, 'svc__degree': 2, 'svc__gamma': 1, 'svc__kernel': 'poly'}
Validation Accuracy (Grid Search): 1.0000


In [3]:


# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'svc__C': np.logspace(-2, 2, 10),  # 10 values between 0.01 and 100
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree': [2, 3, 4, 5],  # Degrees for polynomial kernel
    'svc__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]  # Various gamma values
}

# Perform Random Search with Cross-Validation
# n_iter=100 candidates are sampled from the lists above; random_state makes
# the sampled candidates repeatable.
random_search = RandomizedSearchCV(svm_model, param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters from Random Search
best_params_random = random_search.best_params_
print(f"Best parameters (Random Search): {best_params_random}")

# Evaluate the Random Search best model
# With the default refit=True the search has already retrained a pipeline on
# all of X_train using the best parameters, so reuse it instead of rebuilding
# the pipeline by hand and fitting it a second time.
svm_model_random = random_search.best_estimator_

y_val_pred_random = svm_model_random.predict(X_val)
val_accuracy_random = accuracy_score(y_val, y_val_pred_random)

print(f"Validation Accuracy (Random Search): {val_accuracy_random:.4f}")


Best parameters (Random Search): {'svc__kernel': 'linear', 'svc__gamma': 10, 'svc__degree': 4, 'svc__C': 35.93813663804626}
Validation Accuracy (Random Search): 1.0000


In [4]:

# Select the best model
# Grid Search wins only on a strictly higher validation accuracy; ties go to
# Random Search. The winner's accuracy is carried along so it does not have to
# be recomputed when printing.
if val_accuracy_grid > val_accuracy_random:
    best_model = svm_model_grid
    best_params = best_params_grid
    best_val_accuracy = val_accuracy_grid
    print("Best model selected from Grid Search.")
else:
    best_model = svm_model_random
    best_params = best_params_random
    best_val_accuracy = val_accuracy_random
    print("Best model selected from Random Search.")

# Calculate hinge loss on training and validation sets
# The columns of decision_function follow the classes the model was fitted on,
# so those classes (not every known tag id) are what hinge_loss must be told
# about; otherwise a tag missing from the training split breaks the call.
y_train_decision = best_model.decision_function(X_train)
y_val_decision = best_model.decision_function(X_val)
train_hinge_loss = hinge_loss(y_train, y_train_decision, labels=best_model.classes_)
val_hinge_loss = hinge_loss(y_val, y_val_decision, labels=best_model.classes_)

# Perform K-Fold Cross-Validation
# cross_val_score clones the pipeline for each fold, so the fitted best_model
# is left untouched.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_model, X, y, cv=kf, scoring='accuracy')

# Print metrics
print("\nAverage Metrics:")
print(f'Validation Accuracy: {best_val_accuracy:.4f}')
print(f'Train Hinge Loss: {train_hinge_loss:.4f}')
print(f'Validation Hinge Loss: {val_hinge_loss:.4f}')
print(f'Cross-Validation Scores: {cross_val_scores}')
print(f'Mean Cross-Validation Score: {np.mean(cross_val_scores):.4f}')


Best model selected from Random Search.

Average Metrics:
Validation Accuracy: 1.0000
Train Hinge Loss: 0.0000
Validation Hinge Loss: 0.0000
Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0000


In [None]:
def get_response(user_input, fallback="Sorry, I didn't understand that. Could you rephrase?"):
    """Pick a reply for ``user_input`` using the selected intent model.

    The text is normalised with ``preprocess``, vectorised with ``vectorizer``
    and classified by ``best_model``; a random entry from the predicted
    intent's ``responses`` list is returned.

    Args:
        user_input: Raw text typed by the user.
        fallback: Reply returned when the input shares no term with the
            fitted vocabulary, or when the predicted tag has no matching
            intent in ``data``.

    Returns:
        A response string.
    """
    user_input = preprocess(user_input)
    X_user = vectorizer.transform([user_input])

    # An all-zero row means none of the input's terms are in the vocabulary;
    # the classifier would still return some class, so answer with the
    # fallback instead of an unrelated intent.
    if X_user.nnz == 0:
        return fallback

    tag_prob = best_model.predict_proba(X_user)[0]
    # predict_proba columns are ordered like classes_, so the argmax is a
    # position in classes_ and must be mapped back to the label id.
    label = best_model.classes_[np.argmax(tag_prob)]
    tag = num_to_tag[int(label)]

    for intent in data['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    return fallback

print("Chatbot is ready! Type 'quit' to exit.")

# Interactive loop: read a line, answer it, stop on 'quit' (case-insensitive).
while True:
    # Trim surrounding whitespace so inputs such as "quit " still end the session.
    user_input = input("You: ").strip()
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing to classify; prompt again rather than answering a blank line.
        continue
    response = get_response(user_input)
    print("Bot:", response)


Chatbot is ready! Type 'quit' to exit.


You:  may i know more about appeal


Bot: An academic appeal is a request from a student to the Dean of their School to review a decision made by the Board of Examiners about an assessment decision. If you want tomake an appeal [https://ask.herts.ac.uk/academic-appeals-requests-for-the-review-of-assessment-decisions], you must request a review of your results within 10 working days of their publication. You can also contact your programme leader or cohort leader, or Herts SU’s Advice and Support centre for guidance.


You:  what is mental health


Bot: If your studies are affecting your mental health and wellbeing then pleasecontact our Student Wellbeing team [https://ask.herts.ac.uk/student-wellbeing]for support. You can alsocall our free 24/7 wellbeing helpline [https://ask.herts.ac.uk/student-wellbeing-24-hour-helpline]ran by Health Assured whenever you need to. You can also talk to staff in your School if you’re struggling.


You:  what is lrc


Bot: Hello! How can I assist you today?


You:  explain university external examiners


Bot: The role of the External Examiner In the UK Higher Education system, Universities are responsible for the academic standards of the awards that they offer, and for the quality of the education they provide to enable students to meet those standards. The role of the external examiner is an essential part of the University’s quality assurance processes. They essentially externally ‘audit’ the programmes that they are appointed to, in terms of the attainment of academic standards and the quality of the education. The University of Hertfordshire appoints external examiners mainly from other Universities, but also from industry and/or the professions. They are qualified and experienced in the subject, have an understanding of the academic standards required for the award and are also independent of the University of Hertfordshire. As such, they are able to provide carefully considered advice on the academic standards of the programmes and/or modules to which they have been appointed, a

You:  what is assignment extensions


Bot: If you’re struggling with assignment deadlines and need more time, you could apply for an extension. If you think you need an additional seven days to hand in your assessment, you couldapply for a coursework extension [https://ask.herts.ac.uk/coursework-extensions]. You must apply for an extension before you submit your work. If you think you can’t submit your work by the deadline due to unexpected circumstances outside of your control, you couldsubmit an exceptional circumstances request. [https://ask.herts.ac.uk/exceptional-circumstances]In the first instance, please talk to your personal tutor, programme leader or cohort leader as they will be able to help and advise you. You must submit your exceptional circumstances request before you submit your work. If you are unsure or need more information, please talk to your personal tutor, programme leader or cohort leader.


You:  im struggling with assignments


Bot: Hey! How can I help?


You:  im disable


Bot: Hello! How can I assist you today?
