In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast  # For safely converting string lists to Python lists
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the dataset: one row per movie line, with at least "lineID" and "text" columns
print("Processing data...")
data = pd.read_csv("./data/processed/cornell_movie_dialogs_lines.csv")

# Load the conversation structure to get replies; "utteranceIDs" holds a
# stringified Python list of line IDs in speaking order
conv_data = pd.read_csv("./data/processed/cornell_movie_dialogs_conversations.csv")

# Create a dictionary mapping line IDs to actual text
lines_dict = dict(zip(data["lineID"], data["text"]))


def _is_usable_text(value):
    """Return True when *value* is a non-blank string.

    pandas represents missing CSV cells as float NaN, so a plain truthiness
    check is not enough; the isinstance test rejects NaN and None as well.
    """
    return isinstance(value, str) and value.strip() != ''


# Create a list of (input, response) pairs from consecutive utterances
conversations = []
for raw_ids in conv_data["utteranceIDs"]:
    # literal_eval only parses Python literals, so the stored list text is
    # converted without executing arbitrary code
    line_ids = ast.literal_eval(raw_ids)
    for prompt_id, reply_id in zip(line_ids, line_ids[1:]):
        if prompt_id in lines_dict and reply_id in lines_dict:
            conversations.append((lines_dict[prompt_id], lines_dict[reply_id]))

# Drop a pair when EITHER side is missing or blank. Filtering the two sides
# independently would shift indices and leave inputs[i] paired with the
# wrong responses[i].
valid_pairs = [
    (prompt, reply)
    for prompt, reply in conversations
    if _is_usable_text(prompt) and _is_usable_text(reply)
]
if not valid_pairs:
    raise ValueError(
        "No usable (input, response) pairs were found in the conversation data."
    )

# Separate inputs and responses; both lists have the same length and order
inputs = [prompt for prompt, _ in valid_pairs]
responses = [reply for _, reply in valid_pairs]

print("Data processed.")

# Vectorize only the input lines
# The vectorizer is fitted on `inputs` and the resulting TF-IDF matrix is kept
# in `tfidf_matrix`. NOTE(review): neither name is referenced again in the
# visible code — presumably left over from a retrieval-based approach; confirm
# before removing.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(inputs)

# Load the GPT-2 model and tokenizer
# Both are loaded from the "gpt2" checkpoint name and are used as module-level
# globals by generate_response().
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Chatbot function using GPT-2
# Running transcript of the session: generate_response() appends one
# "user: ..." entry and one "VEGA: ..." entry per turn and joins the list to
# build the model prompt.
conversation_history = []

def generate_response(user_input):
    """Generate VEGA's reply to ``user_input`` with GPT-2 and record the turn.

    The user's message is appended to the module-level ``conversation_history``,
    the whole history is joined into one prompt, and GPT-2 samples a
    continuation. Only the newly generated tokens are returned and stored, so
    the prompt is never echoed back and the history does not duplicate itself.

    Args:
        user_input: The text typed by the user for this turn.

    Returns:
        The generated reply text, without the prompt and without special
        tokens, stripped of surrounding whitespace.
    """
    # Upper bound on the number of tokens generated for one reply.
    max_new_tokens = 200
    # GPT-2 has a fixed context window; reserve room in it for the reply so
    # prompt + reply never exceed what the model can attend to.
    context_window = getattr(model.config, "n_positions", 1024)
    prompt_budget = max(1, context_window - max_new_tokens)

    # Add user input to the conversation history
    conversation_history.append(f"user: {user_input}")

    # Join the conversation history to provide context for the model
    context = " ".join(conversation_history)

    # Encode the prompt, then keep only the most recent tokens that fit the
    # budget; the oldest part of a long conversation is dropped first.
    input_ids = tokenizer.encode(context, return_tensors='pt')
    input_ids = input_ids[:, -prompt_budget:]
    prompt_length = input_ids.shape[-1]

    # Every prompt token is real (there is no padding), so the mask is all
    # ones. Passing it explicitly avoids ambiguity, since pad_token_id is set
    # to the EOS id below.
    attention_mask = input_ids.new_ones(input_ids.shape)

    # Generate a response using GPT-2
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,  # Limits the reply only, not prompt + reply
        num_return_sequences=1,  # Number of responses to generate
        do_sample=True,          # Enable random sampling
        temperature=0.8,         # Control creativity
        top_p=0.9,               # Nucleus sampling
        top_k=50,                # Top-k sampling
        no_repeat_ngram_size=3,  # Prevent repeating n-grams
        pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; reuse EOS
    )

    # The returned sequence starts with the prompt; slice it off so only the
    # newly generated tokens are decoded.
    reply_ids = output[0][prompt_length:]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

    # Add the bot's response to the conversation history
    conversation_history.append(f"VEGA: {response}")
    return response

# Chatbot loop: read a line, answer it with GPT-2, and repeat until the user
# asks to leave or the input stream ends.
print("Waiting for user input...")
while True:
    try:
        user_input = input("user:> ")  # Get user input
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / closed stdin / Ctrl-C: end the session cleanly instead of
        # surfacing a traceback.
        print("\nVEGA:> Goodbye!")
        break

    # Normalise only for command matching, so "  Exit " still counts as exit;
    # the text passed to the model is left exactly as typed.
    command = user_input.strip().lower()
    if not command:
        # Nothing was typed; do not send an empty turn to the model.
        continue
    if command in ("exit", "quit", "bye"):  # Exit condition
        print("VEGA:> Goodbye!")
        break

    response = generate_response(user_input)  # Get GPT-2 response
    print(f"VEGA:> {response}")  # Print bot's response

RuntimeError: Failed to import transformers.models.gpt2.modeling_gpt2 because of the following error (look up to see its traceback):
module 'pkgutil' has no attribute 'ImpImporter'