In [25]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast  # For safely converting string lists to Python lists

# Load the dataset
print("Processing data...")
data = pd.read_csv("./data/processed/cornell_movie_dialogs_lines.csv")

# Load the conversation structure to get replies
conv_data = pd.read_csv("./data/processed/cornell_movie_dialogs_conversations.csv")

# Create a dictionary mapping line IDs to actual text
lines_dict = dict(zip(data["lineID"], data["text"]))


def _is_usable_text(text):
    """Return True when `text` is a non-blank string.

    Rejects None, non-string values (e.g. the float NaN pandas produces for
    empty CSV cells) and strings that contain only whitespace.
    """
    return isinstance(text, str) and text.strip() != ''


# Create a list of (input, response) pairs from conversations:
# every utterance is paired with the utterance that directly follows it.
conversations = []
for conv in conv_data["utteranceIDs"]:
    conv = ast.literal_eval(conv)  # Convert string list to actual Python list
    for i in range(len(conv) - 1):
        if conv[i] in lines_dict and conv[i + 1] in lines_dict:
            conversations.append((lines_dict[conv[i]], lines_dict[conv[i + 1]]))

# Drop a pair as a whole when EITHER side is missing/blank.
# Filtering inputs and responses independently would remove elements from
# only one of the two lists, shifting every later index so that
# responses[i] would no longer be the reply to inputs[i].
valid_pairs = [
    (prompt, reply)
    for prompt, reply in conversations
    if _is_usable_text(prompt) and _is_usable_text(reply)
]
if not valid_pairs:
    raise ValueError("No usable (input, response) pairs were found in the dataset.")

# Separate inputs and responses (index-aligned lists of equal length)
inputs, responses = (list(column) for column in zip(*valid_pairs))

print("Data processed.")

# Vectorize only the input lines; row i of tfidf_matrix corresponds to inputs[i]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(inputs)

# Chatbot function
def get_response(user_input, fallback="Sorry, I don't know what to say to that."):
    """Return the stored reply whose input line best matches `user_input`.

    The user text is projected into the TF-IDF space of the module-level
    `vectorizer`, compared against every row of `tfidf_matrix` by cosine
    similarity, and the entry of `responses` at the best-scoring row index
    is returned.

    Args:
        user_input: Text typed by the user.
        fallback: Reply returned when nothing in the corpus matches at all
            (every similarity is zero, e.g. the text is empty or contains
            only words outside the fitted vocabulary) or the corpus is empty.

    Returns:
        The selected response string, or `fallback` when there is no match.
    """
    # Vectorize the user input (a single-document batch)
    user_tfidf = vectorizer.transform([user_input])

    # Compute cosine similarity; one query row -> flatten (1, n) to a 1-D vector
    similarities = cosine_similarity(user_tfidf, tfidf_matrix).ravel()

    # With all-zero scores np.argmax would return index 0, i.e. an arbitrary
    # line unrelated to the query, so report "no match" explicitly instead.
    if similarities.size == 0 or similarities.max() <= 0:
        return fallback

    # Find the most similar input line
    best_match_index = int(np.argmax(similarities))

    # Return the actual response from the dataset
    return responses[best_match_index]

# Interactive chat loop: keep answering until the user types an exit word
print("Waiting for user input...")
while True:
    user_input = input("user:> ")  # Blocks until the user submits a line
    if user_input.lower() not in ("exit", "quit", "bye"):
        # Regular message: look up and show the best-matching reply
        response = get_response(user_input)
        print(f"VEGA:> {response}")
        continue
    # Exit word (case-insensitive): say goodbye and leave the loop
    print("VEGA:> Goodbye!")
    break


Processing data...
Data processed.
Waiting for user input...


user:>  hello


VEGA:> Fuck you.


user:>  quit


VEGA:> Goodbye!
