In [1]:
import nltk  # Natural Language Toolkit for text processing
import string  # For handling punctuation
import random  # For generating random responses
import numpy as np  # For numerical operations
from nltk.stem import WordNetLemmatizer  # For word lemmatization
from sklearn.feature_extraction.text import TfidfVectorizer  # For text vectorization
from sklearn.metrics.pairwise import cosine_similarity  # For finding similarity
import sys  # For handling system exit in Colab

In [2]:
# Download necessary NLTK data files
# 'punkt' serves older NLTK releases; newer releases load the tokenizer models
# from the separate 'punkt_tab' resource, and nltk.word_tokenize() raises
# LookupError without it. Fetching both keeps the notebook runnable either way.
nltk.download('punkt')  # Tokenizer data (legacy pickle format)
nltk.download('punkt_tab')  # Tokenizer data (format used by current NLTK)
nltk.download('wordnet')  # WordNet lemmatizer data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load and preprocess dataset
# Location of the dialogue corpus on the mounted Google Drive. Kept in one named
# constant so the path only has to be edited in a single place.
DIALOGS_PATH = '/content/drive/MyDrive/ai tryouts/dialogs.txt'

try:
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding; errors='ignore' still drops any undecodable bytes.
    with open(DIALOGS_PATH, 'r', encoding='utf-8', errors='ignore') as f:
        raw = f.read().lower()  # Whole corpus as one lowercase string
except FileNotFoundError:
    print("ERROR: dialogs.txt not found. Check the file path and try again.")
    sys.exit()

# Validate after the file has been closed: a whitespace-only corpus is unusable.
if not raw.strip():
    print("ERROR: dialogs.txt is empty. Please add conversation data.")
    sys.exit()

SyntaxError: unterminated string literal (detected at line 3) (<ipython-input-3-4381196d00be>, line 3)

In [None]:
# Split dataset into question-answer pairs
# Every line containing a tab is read as "<question>\t<answer>". Splitting on the
# FIRST tab only guarantees exactly two fields per line; a plain split('\t')
# yields three or more fields when the answer itself contains a tab, and the
# two-name unpacking below then fails with "too many values to unpack".
qa_pairs = [line.split('\t', 1) for line in raw.split('\n') if '\t' in line]

# Map each stripped question to its stripped answer. When a question occurs more
# than once, the last occurrence in the file wins.
question_answer_dict = {q.strip(): a.strip() for q, a in qa_pairs}
sent_tokens = list(question_answer_dict.keys())  # Store only questions

# The similarity search needs at least two known questions.
if len(sent_tokens) < 2:
    print("ERROR: Not enough data to train the chatbot. Add more conversations to dialogs.txt.")
    sys.exit()

In [None]:
# Initialize lemmatizer
# One shared WordNetLemmatizer instance; LemTokens() calls its lemmatize()
# method on every token it is given.
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the lemma of each token, in input order.

    Each element of ``tokens`` is passed to the module-level ``lemmer``'s
    ``lemmatize`` method; the results are collected into a new list.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmer.lemmatize(word))
    return lemmas

# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of on every LemNormalize() call, since
# the function is used as the TfidfVectorizer tokenizer and therefore runs
# repeatedly with an identical table.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def LemNormalize(text):
    """Normalize text into a list of lemmatized tokens.

    The text is lowercased, stripped of all ``string.punctuation`` characters,
    tokenized with ``nltk.word_tokenize`` and finally passed through
    ``LemTokens``.

    Args:
        text: The string to normalize.

    Returns:
        The list produced by ``LemTokens`` for the cleaned, tokenized text.
    """
    cleaned = text.lower().translate(_PUNCT_TABLE)
    return LemTokens(nltk.word_tokenize(cleaned))

In [None]:
def response(user_response):
    """Generate a chatbot response based on dataset matching.

    The reply is chosen in two stages:

    1. Direct lookup of the stripped input in ``question_answer_dict``, first
       as typed and then lowercased (the keys are built from lowercased text).
    2. Otherwise, TF-IDF vectors are fitted over the known questions plus the
       input, and the answer of the known question with the highest cosine
       similarity to the input is returned.

    Args:
        user_response: The text typed by the user.

    Returns:
        The matched answer string, or an apology string when no known question
        shares any weighted term with the input.
    """
    query = user_response.strip()

    # Stage 1: exact match, tolerant of the caller not lowercasing the input.
    for candidate in (query, query.lower()):
        if candidate in question_answer_dict:
            return question_answer_dict[candidate]

    no_match_reply = "I'm sorry! I don't understand you. Can you rephrase?"
    if not sent_tokens:
        return no_match_reply  # Nothing to compare against

    # Stage 2: TF-IDF similarity. Work on a copy so the shared question list is
    # never modified, even if vectorization raises.
    documents = sent_tokens + [query]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(documents)

    # Compare the query (last row) with the known questions only. Excluding the
    # query's own row means the best index always refers to a real question;
    # picking "second from the end" of a full sort can land on the query itself
    # when scores tie, which then indexes past the end of sent_tokens.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).ravel()
    best = int(scores.argmax())

    if scores[best] == 0:
        return no_match_reply
    return question_answer_dict.get(sent_tokens[best], "I'm sorry! I don't understand you.")


In [None]:
# Chatbot loop
# No greet() function is defined in the cells visible in this notebook, so a
# direct call fails with NameError on a fresh kernel. Look it up at runtime:
# when a greeting helper exists it is used as before, otherwise every message
# goes straight to response().
greet_fn = globals().get('greet')

print("BOT: My name is Stark. Let's chat! Type 'bye' to exit.")
while True:
    try:
        user_response = input("You: ").strip().lower()  # Get user input

        if not user_response:
            print("BOT: Please enter some text to continue the conversation.")
            continue

        if user_response == 'bye':
            print("BOT: Goodbye! Take care.")
            print("BOT: Chat session ended.")
            break

        # Thanking the bot also ends the session.
        if user_response in ('thanks', 'thank you'):
            print("BOT: You're welcome!")
            print("BOT: Chat session ended.")
            break

        # Prefer a greeting reply when the helper is available and returns one;
        # otherwise answer from the dataset (direct match or TF-IDF).
        bot_response = greet_fn(user_response) if callable(greet_fn) else None
        print("BOT:", bot_response or response(user_response))
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / kernel interrupt, or input() hitting end-of-input.
        print("\nBOT: Chat session interrupted. Goodbye!")
        break


BOT: My name is Stark. Let's chat! Type 'bye' to exit.
You: HI
BOT: I'm glad you're talking to me!
You: HOW ARE YOU
BOT: I'm just a bot, but I'm feeling awesome!
You: BYE
BOT: Goodbye! Take care.
BOT: Chat session ended.
