In [2]:
import io
import pickle
import random
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer


# Download necessary NLTK data
# nltk.download('punkt')
# nltk.download('wordnet')

# Function to load and preprocess your dataset
def load_and_preprocess_data(file_path):
    with io.open(file_path, 'r', encoding='utf-8') as file:
        corpus = file.read()

    return corpus


# Tokenization function
def tokenize_sentences(corpus):
    sentences = nltk.sent_tokenize(corpus)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    return word_tokens


# Lemmatization function
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [[lemmatizer.lemmatize(word.lower()) for word in sentence] for sentence in tokens]
    
    return lemmatized_tokens, lemmatizer


# Function to create TF-IDF matrix
def create_tfidf_matrix(lemmatized_corpus):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([' '.join(tokens) for tokens in lemmatized_corpus])
    
    return vectorizer, tfidf_matrix, lemmatized_corpus


# Function to save processed data using pickle
def save_processed_data(data, file_path='processed_data.pkl'):
    
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

        
# Function for chatbot response
def chatbot_response(user_tfidf, X_train_tfidf, lemmatized_corpus, vectorizer, user_input, threshold=0.2):
    
    # Calculate cosine similarities
    cosine_similarities = cosine_similarity(user_tfidf, X_train_tfidf).flatten()
    most_similar_index = np.argmax(cosine_similarities)
    highest_similarity = cosine_similarities[most_similar_index]

    # Check if the highest similarity is below the threshold
    if highest_similarity < threshold:
        return "I'm sorry, I didn't understand."

    # Check if the lowercased user input is a substring of the lowercased response tokens
    if user_input.lower() not in ' '.join(lemmatized_corpus[most_similar_index]).lower():
        return "I'm sorry, I didn't understand."

    # If the conditions are not met, proceed to generate the response
    response_tokens = lemmatized_corpus[most_similar_index]

    # Extract relevant information based on user's question
    extracted_info = extract_information(user_input, response_tokens)

    # If no information is extracted, use the whole response
    if not extracted_info:
        extracted_info = response_tokens

    chatbot_response_str = ' '.join(extracted_info).strip()

    # Remove the user's question part from the response
    chatbot_response_str = chatbot_response_str.lstrip(user_input.lower())
    
    # Remove leading punctuation including a comma
    chatbot_response_str = chatbot_response_str.lstrip(string.punctuation).strip()

    # Remove leading comma if present
    if chatbot_response_str.startswith(","):
        chatbot_response_str = chatbot_response_str[1:].strip()

    return chatbot_response_str


# Function to extract information from user input
def extract_information(user_input, response_tokens):
    
    lemmatized_user_input = lemmatize_tokens([nltk.word_tokenize(user_input)])[0]

    extracted_info = []

    # Extract relevant information based on user's question
    if any(word in lemmatized_user_input for word in ['name', 'called']):
        extracted_info = [word for word in response_tokens if word.lower() not in ['my', 'name', 'is', 'called']]
    elif any(word in lemmatized_user_input for word in ['age']):
        extracted_info = [word for word in response_tokens if word.lower() not in ['my', 'age', 'is']]

    return extracted_info

# File path for the dataset
file_path = './data/basic_details.txt'
corpus = load_and_preprocess_data(file_path)

# Tokenize and lemmatize
tokenized_sentences = tokenize_sentences(corpus)
lemmatized_tokens, lemmatizer = lemmatize_tokens(tokenized_sentences)

# Create TF-IDF matrix
vectorizer, tfidf_matrix, lemmatized_corpus = create_tfidf_matrix(lemmatized_tokens)

# Save the processed data and model objects using pickle
save_processed_data(tfidf_matrix, file_path='processed_data.pkl')

# Chat loop
while True:
    user_input = input("You: ")

    # Add a condition to exit the loop if the user wants to end the conversation
    if user_input.lower() == 'exit':
        print("Chatbot: Bye.")
        break

    user_input_tokens = nltk.word_tokenize(user_input)
    user_input_lemmatized = lemmatize_tokens([user_input_tokens])[0]
    user_input_lemmatized_sentence = ' '.join(' '.join(sentence) for sentence in user_input_lemmatized)

    user_tfidf = vectorizer.transform([user_input_lemmatized_sentence])

    # Chatbot response
    response = chatbot_response(user_tfidf, tfidf_matrix, lemmatized_corpus, vectorizer, user_input)

    # Print each sentence in the chatbot's response separately
    for sentence in response.split('. '):
        print("Chatbot:", sentence)


You: hi
Chatbot: hello .
You: how are you
Chatbot: i am fine .
You: what is your name
Chatbot: my name is chatbot .
You: what is your age
Chatbot: i am robo , i donot have age .
You: what is your use
Chatbot: i am robot .
You: what will you do
Chatbot: i will assist you .
You: thank you
Chatbot: have a great day .
You: how is life going on
Chatbot: I'm sorry, I didn't understand.
You: how will you help me
Chatbot: I'm sorry, I didn't understand.
You: exit
Chatbot: Bye.
