In [46]:
import math
import pandas as pd

In [47]:
# Load the question/answer table from 'database.csv' (path is relative to the
# notebook's working directory).
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which looks
# like a saved index column - confirm, and consider index_col=0 if so.
csv_data = pd.read_csv('database.csv')
# Pull the 'Question' and 'Answer' columns out of the frame; the later cells
# iterate over `questions` and look the reply up in `answers`.
questions = csv_data['Question']
answers = csv_data['Answer']
# Bare last expression: renders the whole frame as this cell's output.
csv_data

Unnamed: 0,Question,Answer
0,When did the Department of Computer Engineerin...,The Department of Computer Engineering start I...
1,In which Faculty does the Department of Comput...,The Department of Computer Engineering works i...
2,Where is the office of the Department of Compu...,"In the Faculty of Engineering and Technology, ..."
3,When was the Faculty of Engineering and Techno...,The Faculty of Engineering and Technology (FET...
4,What are the courses offered in FET?,"Courses offered in FET are BTech, MTech, MSc a..."
5,What are the branches in BTech?,"Branches in BTech are Computer Engineering, El..."
6,What is the vision of the Department of Comput...,To produce excellent professionals and innovat...


In [48]:
# lowercasing and removing punctuation
def preprocess_text(text):
    """Return ``text`` lower-cased with every character that is neither
    alphanumeric nor whitespace removed.

    Whitespace is kept exactly as it appears, so runs of spaces survive.
    """
    kept_chars = []
    for ch in text.lower():
        # keep letters, digits and any kind of whitespace; drop the rest
        if ch.isalnum() or ch.isspace():
            kept_chars.append(ch)
    return ''.join(kept_chars)

# some list of stop words
stop_words = [
    "a", "an", "and", "the", "in",
    "on", "of", "to", "for", "with",
]

# Remove stop words
def remove_stop_words(text):
    """Return ``text`` with every word listed in ``stop_words`` dropped.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces. The comparison is exact, so callers are expected to
    pass lower-cased text.
    """
    kept_words = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_words.append(token)
    # join them into a sentence
    return ' '.join(kept_words)


['when did department computer engineering start',
 'which faculty does department computer engineering work',
 'where is office department computer engineering',
 'when was faculty engineering technology fet formed',
 'what are courses offered fet',
 'what are branches btech',
 'what is vision department computer engineering']

In [49]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity of two sparse vectors stored as ``{term: weight}`` dicts.

    Returns ``dot(a, b) / (|a| * |b|)``, or ``0`` when either vector has zero
    length (which includes an empty dict).
    """
    # Euclidean length of each vector: square the weights, sum, take the root
    length_a = math.sqrt(sum(weight ** 2 for weight in vector_a.values()))
    length_b = math.sqrt(sum(weight ** 2 for weight in vector_b.values()))

    # A zero-length vector has no direction, so report "no similarity"
    if length_a == 0 or length_b == 0:
        return 0

    # Terms missing from vector_b contribute nothing to the dot product
    dot_product = sum(
        weight * vector_b.get(term, 0) for term, weight in vector_a.items()
    )
    return dot_product / (length_a * length_b)

In [50]:
# Calculate TF (Term Frequency), number of times a word occurs in a text
def calculate_tf(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Returns a plain dict ``{word: count}`` whose keys are in order of first
    appearance; empty text gives an empty dict.
    """
    counts = {}
    for token in text.split():
        # first sighting starts from 0, later ones add to the running count
        counts[token] = counts.get(token, 0) + 1
    return counts

# Calculate IDF (Inverse Document Frequency)
def _document_terms(document):
    """Return the set of normalised terms contained in one document."""
    if isinstance(document, str):
        # Same normalisation the notebook applies to text elsewhere:
        # lower-case, drop everything that is not alphanumeric or whitespace.
        cleaned = ''.join(c for c in document.lower() if c.isalnum() or c.isspace())
        return set(cleaned.split())
    # Already tokenised (list / set / tuple of terms)
    return set(document)


def calculate_idf(corpus, word):
    """Inverse document frequency of ``word`` over ``corpus``.

    Computes ``log(N / (1 + df))`` where ``N`` is the number of documents and
    ``df`` the number of documents containing ``word`` as a whole term.

    Args:
        corpus: iterable of documents; each document is either a string (raw
            or already preprocessed) or an iterable of terms.
        word: the term to look up. String terms are compared case-insensitively.

    Returns:
        The IDF as a float. It is negative when the term occurs in every
        document (``N / (1 + N) < 1``) and ``0.0`` for an empty corpus.
    """
    documents = list(corpus)
    if not documents:
        # log(0) would raise ValueError; an empty corpus carries no information
        return 0.0

    term = word.lower() if isinstance(word, str) else word
    # Match whole terms, not substrings: a plain `word in text` test is
    # case-sensitive and also finds "a" inside "What".
    num_documents_with_word = sum(
        1 for document in documents if term in _document_terms(document)
    )
    return math.log(len(documents) / (1 + num_documents_with_word))

# Calculate TF-IDF
def calculate_tfidf(tf, idf):
    """Weight every term frequency by its inverse document frequency.

    ``tf`` maps term -> count and ``idf`` maps term -> IDF weight. Every term
    of ``tf`` must be present in ``idf``, otherwise ``KeyError`` is raised.
    Returns a new dict ``{term: count * idf[term]}``.
    """
    return {term: count * idf[term] for term, count in tf.items()}

In [51]:
# Take user input
user_input = input("Enter your question: ")
# Normalise it in one step: lower-case and strip punctuation first, then drop
# the stop words from the cleaned text.
preprocessed_user_input = remove_stop_words(preprocess_text(user_input))

In [52]:
# Normalise every stored question exactly like the user input (lower-case,
# punctuation stripped, stop words removed) so both sides of the comparison
# share one vocabulary.
preprocessed_questions = [remove_stop_words(preprocess_text(question)) for question in questions]

# Preprocess and calculate TF-IDF for user input
# IDF is measured against the normalised questions: the input words are
# lower-case, so testing them against the raw, capitalised text misses matches.
preprocessed_user_input_tf = calculate_tf(preprocessed_user_input)
preprocessed_user_input_idf = {word: calculate_idf(preprocessed_questions, word) for word in preprocessed_user_input_tf}
preprocessed_user_input_tfidf = calculate_tfidf(preprocessed_user_input_tf, preprocessed_user_input_idf)

# Calculate cosine similarities and find closest answer
# NOTE: when no question shares a word with the input every similarity is 0,
# and the first answer is returned because 0 > -1.
max_similarity = -1
closest_answer = ""

#  for each of the questions check similarity by cosine similarity
for idx, preprocessed_question in enumerate(preprocessed_questions):
    preprocessed_question_tf = calculate_tf(preprocessed_question)
    preprocessed_question_idf = {word: calculate_idf(preprocessed_questions, word) for word in preprocessed_question_tf}
    preprocessed_question_tfidf = calculate_tfidf(preprocessed_question_tf, preprocessed_question_idf)

    similarity = cosine_similarity(preprocessed_user_input_tfidf, preprocessed_question_tfidf)

    # strict '>' keeps the earliest question on ties
    if similarity > max_similarity:
        max_similarity = similarity
        # idx is a position from enumerate(), so look the answer up by
        # position rather than by index label
        closest_answer = answers.iloc[idx]

print("Question: ", user_input)
print("Closest Answer:", closest_answer)

Question:  Courses in FET?
Closest Answer: Courses offered in FET are BTech, MTech, MSc and PhD
