In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Step 1: Load the dataset
file_path = "/content/history_dataset  - history_qa_dataset.csv"  # Update with your file's path
data = pd.read_csv(file_path)

# Fail fast with a readable message if the CSV lacks the two columns the rest of the
# notebook indexes by name (the other load cells in this notebook run the same check).
assert "Question" in data.columns and "Answer" in data.columns, "CSV must have 'Question' and 'Answer' columns."

In [None]:
# Step 2: Preprocess the data
# Keep only complete question/answer pairs. A blank CSV cell is read as NaN (a float),
# which TfidfVectorizer rejects as a document. Both lists are taken from the same
# filtered frame so questions[i] and answers[i] stay aligned.
qa_pairs = data.dropna(subset=['Question', 'Answer'])
questions = qa_pairs['Question'].tolist()
answers = qa_pairs['Answer'].tolist()

In [None]:
# Step 3: Train the model (TF-IDF Vectorizer)
# Fit the TF-IDF vocabulary on the dataset questions. `tfidf_matrix` is the vectorized
# form of `questions` and is what every incoming query is compared against below.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(questions)

# Helper function to suggest similar questions
def suggest_similar_questions(query, top_n=3):
    """Return up to ``top_n`` dataset questions most similar to ``query``.

    The query is vectorized with the module-level ``vectorizer`` and scored against
    ``tfidf_matrix`` by cosine similarity. Questions with a score of zero are never
    suggested, so fewer than ``top_n`` items (possibly none) may be returned.

    Args:
        query: Free-text user question.
        top_n: Maximum number of suggestions. Values <= 0 yield an empty list.

    Returns:
        A list of question strings from ``questions``, best match first.
    """
    if top_n <= 0:
        # Without this guard ``[-0:]`` would select *every* question, and a negative
        # ``top_n`` would slice from the wrong end of the ranking.
        return []

    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # argsort is ascending: the best matches are the last ``top_n`` entries; reverse
    # them so the strongest suggestion comes first.
    best_first = scores.argsort()[-top_n:][::-1]

    # Only consider positive similarity scores
    return [questions[idx] for idx in best_first if scores[idx] > 0]

In [None]:
# Step 4: Define the chatbot function
def chatbot(threshold=0.3):
    """Run an interactive question-answering session on stdin/stdout.

    Each query is vectorized with the module-level ``vectorizer`` and compared with
    ``tfidf_matrix`` by cosine similarity. When the best score exceeds ``threshold``
    the matching entry of ``answers`` is printed; otherwise similar dataset questions
    from ``suggest_similar_questions`` are offered. Typing ``exit`` (any letter case)
    ends the session.

    Args:
        threshold: Similarity the best match must exceed for the question to be
            treated as in scope. Defaults to 0.3, the value that used to be
            re-assigned inside the loop on every iteration.
    """
    print("Welcome to the History Chatbot!")
    print("You can ask questions from the textbook, and I'll try my best to answer them.")
    print("If you're unsure how to phrase your question, I can suggest similar ones.")
    print("Type 'exit' to end the session.")

    while True:
        user_query = input("\nType your question: ").strip()
        if user_query.lower() == "exit":
            print("Thank you for using the History Chatbot. Goodbye!")
            break
        if not user_query:
            # A blank line has nothing to match against; just prompt again.
            continue

        # Score the query against every dataset question and keep the best match.
        query_vector = vectorizer.transform([user_query])
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
        best_index = int(np.argmax(similarities))
        best_score = similarities[best_index]

        if best_score > threshold:
            print(f"Answer: {answers[best_index]}")
            continue

        # Out of scope: fall back to suggesting dataset questions.
        print("I'm not sure about that. It seems your question isn't directly from the textbook.")
        suggestions = suggest_similar_questions(user_query)
        if not suggestions:
            # The helper returns nothing when no question scores above zero; avoid
            # printing a "here are some suggestions" header followed by an empty list.
            print("I couldn't find any similar questions. Please try rephrasing or ask something else.")
            continue

        print("Here are some similar questions you could try:")
        for i, suggestion in enumerate(suggestions, start=1):
            print(f"{i}. {suggestion}")
        print("Please refine your question based on these suggestions or ask something else.")

In [None]:
# Step 5: Run the chatbot
# Starts the interactive session when this code is executed directly; the guard keeps
# the blocking input() loop from starting if the code is imported as a module.
if __name__ == "__main__":
    chatbot()

Welcome to the History Chatbot!
You can ask questions from the textbook, and I'll try my best to answer them.
If you're unsure how to phrase your question, I can suggest similar ones.
Type 'exit' to end the session.

Type your question: material
Answer: Material sources include objects, monuments, places, coins, and sculptures. Examples are buildings, bridges, and forts from the British period, such as the Cellular Jail in Andaman.

Type your question: exit
Thank you for using the History Chatbot. Goodbye!


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Load the test dataset
# NOTE(review): this path is the same file the vectorizer was fitted on, so the score
# below measures retrieval of already-seen questions, not generalization. Point it at
# a held-out file for a meaningful estimate.
test_file_path = "/content/history_dataset  - history_qa_dataset.csv"  # Update with your test file path
test_data = pd.read_csv(test_file_path)

# Ensure data is in expected format
assert "Question" in test_data.columns and "Answer" in test_data.columns, "CSV must have 'Question' and 'Answer' columns."

# Step 2: Extract test questions and answers
# Blank cells are read as NaN, which can neither be vectorized nor matched; skip those rows.
test_pairs = test_data.dropna(subset=['Question', 'Answer'])
test_questions = test_pairs['Question'].tolist()
test_answers = test_pairs['Answer'].tolist()

total_questions = len(test_questions)
# Guard the division in Step 4 with a readable message instead of a ZeroDivisionError.
assert total_questions > 0, "Test CSV contains no complete question/answer rows."

# Step 3: Evaluate the chatbot model
# Relies on `vectorizer`, `tfidf_matrix` and `answers` produced by the training cells.
threshold = 0.3  # Similarity score threshold to consider a match

# Vectorize and score the whole test set in one call: row i of `similarities` holds
# test question i's cosine similarity to every training question.
test_matrix = vectorizer.transform(test_questions)
similarities = cosine_similarity(test_matrix, tfidf_matrix)
best_indices = similarities.argmax(axis=1)
best_scores = similarities.max(axis=1)

correct_predictions = 0
for best_index, best_score, expected_answer in zip(best_indices, best_scores, test_answers):
    # Below the threshold the chatbot would decline to answer, which counts as a miss.
    predicted_answer = answers[best_index] if best_score > threshold else None
    if predicted_answer == expected_answer:
        correct_predictions += 1

# Step 4: Calculate accuracy
accuracy = (correct_predictions / total_questions) * 100

print(f"Accuracy of the chatbot model: {accuracy:.2f}%")


Accuracy of the chatbot model: 92.98%


In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below, in the same order as before.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared text-normalization tools used by preprocess().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load the question/answer CSV and check it exposes the columns used below.
file_path = "history_dataset.csv"
data = pd.read_csv(file_path)

assert {"Question", "Answer"}.issubset(data.columns), "CSV must have 'Question' and 'Answer' columns."

def preprocess(text):
    """Normalize text into a space-joined string of stemmed content words.

    The text is lower-cased and split into word tokens, English stop words are
    dropped, and each remaining token is passed through the Porter stemmer and then
    the WordNet lemmatizer (same order as before).

    Args:
        text: Raw question text. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as empty input instead of raising AttributeError.

    Returns:
        The processed tokens joined by single spaces; ``""`` when nothing remains.
    """
    if not isinstance(text, str):
        return ""

    # Tokenize on word characters rather than whitespace: with str.split(),
    # punctuation stays glued to the word ("Jones?", "the,"), so the stop-word
    # lookup misses it and the stemmer sees a token it cannot reduce.
    words = re.findall(r"\w+", text.lower())
    kept = [word for word in words if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in kept)

# Store the normalized questions next to the originals, then pull out the two
# parallel lists the chatbot works with (answers are left untouched).
data['ProcessedQuestion'] = data['Question'].map(preprocess)
questions, answers = (data[column].tolist() for column in ('ProcessedQuestion', 'Answer'))

# Fit TF-IDF on the normalized questions; queries must go through preprocess()
# before being transformed with this vectorizer.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(questions)

def chatbot(threshold=0.3):
    """Run an interactive keyword-friendly question-answering session.

    Each query is normalized with ``preprocess``, vectorized with the module-level
    ``vectorizer`` and compared with ``tfidf_matrix`` by cosine similarity. When the
    best score exceeds ``threshold`` the matching entry of ``answers`` is printed;
    otherwise the user is asked to rephrase. Typing ``exit`` (any letter case) ends
    the session.

    Args:
        threshold: Similarity the best match must exceed for an answer to be
            given. Defaults to 0.3, the value that used to be re-assigned inside
            the loop on every iteration.
    """
    print("Welcome to the History Chatbot!")
    print("You can ask questions using keywords like '1784', 'Powada', or names like 'Sir William Jones'.")
    print("Type 'exit' to end the session.")

    while True:
        user_query = input("\nType your question: ").strip()
        if user_query.lower() == "exit":
            print("Thank you for using the History Chatbot. Goodbye!")
            break
        if not user_query:
            # A blank line has nothing to match against; just prompt again.
            continue

        # The vectorizer was fitted on preprocessed questions, so the query must be
        # normalized the same way before it is transformed.
        query_vector = vectorizer.transform([preprocess(user_query)])
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
        best_index = int(np.argmax(similarities))

        if similarities[best_index] > threshold:
            print(f"Answer: {answers[best_index]}")
        else:
            print("I'm not sure about that. Try using keywords or rephrasing your question.")

# Start the session when run directly. The original guard used single underscores
# (`_name_`, "_main_"), which raises NameError because `_name_` is not defined.
if __name__ == "__main__":
    chatbot()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
