# In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load
from sklearn.pipeline import Pipeline

# Load the question/answer pairs from CSV
data = pd.read_csv('rd.csv')

# Minimum cosine similarity between a query and its closest training question
# that predict_answer() requires before it returns an answer.
similarity_threshold = 0.5

# Greeting phrases. Every entry is lowercase because predict_answer()
# lowercases the user's text before the membership test; a mixed-case entry
# such as "Good morning" could never match.
greetings = ["hello", "hi", "hey", "good morning", "good evening", "good after-noon", "namaste"]

# Preprocess the data: rows without a question or an answer cannot be used for
# training, and calling .lower() on a missing (NaN) cell would raise.
data = data.dropna(subset=['question', 'answer'])
data['question'] = data['question'].astype(str).str.lower()  # Convert to lowercase

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data['question'], data['answer'], test_size=0.2, random_state=42
)

# Define a pipeline with both TfidfVectorizer and RandomForestClassifier.
# The forest is seeded like the split above so that retraining is reproducible.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Save the pipeline. Despite the file name, the file holds the whole pipeline
# (the TfidfVectorizer and the classifier), not just the vectorizer.
dump(pipeline, 'tfidf_vectorizer.joblib')

# Load the pipeline back from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_pipeline = load('tfidf_vectorizer.joblib')

# Cache of (vectorizer, training questions, TF-IDF matrix). Keeping the source
# objects alongside the matrix lets the cache notice when either was replaced.
_train_matrix_cache = None


def _training_tfidf_matrix(vectorizer):
    """Return the TF-IDF matrix of X_train, recomputing only when inputs change."""
    global _train_matrix_cache
    cached = _train_matrix_cache
    if cached is None or cached[0] is not vectorizer or cached[1] is not X_train:
        cached = (vectorizer, X_train, vectorizer.transform(X_train))
        _train_matrix_cache = cached
    return cached[2]


def predict_answer(question):
    """Return the chatbot's reply to ``question``.

    The question is lowercased and stripped of surrounding whitespace. If it
    equals one of the configured ``greetings`` (compared case-insensitively),
    a fixed greeting is returned. Otherwise the question is compared with
    every training question by cosine similarity of their TF-IDF vectors.
    When the best similarity reaches ``similarity_threshold`` the reply
    contains the classifier's predicted answer together with the closest
    training question and its answer; otherwise a fallback message is
    returned.

    Args:
        question: The user's question as a string.

    Returns:
        A reply string: the greeting, the formatted answer, or the fallback.
    """
    # Preprocess the question
    question = question.lower().strip()

    # Check if the question is a greeting. The configured phrases are
    # lowercased as well, so an entry like "Good morning" still matches.
    if question in {greeting.lower() for greeting in greetings}:
        return "Hello i am chatbot of infinityBrains ! How can I assist you?"

    # Calculate cosine similarity between input question and training questions.
    # The training matrix is cached instead of being rebuilt on every call.
    vectorizer = loaded_pipeline.named_steps['tfidf']
    question_tfidf = vectorizer.transform([question])
    similarities = cosine_similarity(question_tfidf, _training_tfidf_matrix(vectorizer))[0]

    # Find the maximum similarity and corresponding index
    max_similarity_index = similarities.argmax()
    max_similarity = similarities[max_similarity_index]

    # Check if the maximum similarity meets the threshold
    if max_similarity >= similarity_threshold:
        # Run the classifier only when its prediction is actually reported.
        answer = loaded_pipeline.predict([question])[0]

        # Retrieve the corresponding question and answer with the maximum similarity
        most_similar_question = X_train.iloc[max_similarity_index]
        most_similar_answer = y_train.iloc[max_similarity_index]
        return f"Answer: {answer}, Similarity: {max_similarity}, Most similar question: {most_similar_question}, Most similar answer: {most_similar_answer}"

    return "As of 2024, I'm not familiar enough with that topic to provide an accurate response. Is there anything else I can help you with?"

# Demo: send one sample question through the chatbot and print its reply
input_question = "How does machine learning work?"
print(predict_answer(question=input_question))


# Output:
# As of 2024, I'm not familiar enough with that topic to provide an accurate response. Is there anything else I can help you with?
