In [4]:
#  Run Chatbot first, then show visualizations after exiting


import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset
import seaborn as sns
from wordcloud import WordCloud
from sentence_transformers import SentenceTransformer, util

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", framework="pt", device=-1)

# Load the Sentence Transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  

#=======================
# Preprocessing function
#=======================
def preprocess_text(text):
    """
    Preprocess the input text by cleaning and tokenizing.
    """
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove unwanted characters (non-alphabetic and numeric)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # 3. Strip extra spaces
    text = text.strip()

    return text

#===============================================
# Tokenization function for question and context
#===============================================
def tokenize_question_context(question, context, max_length=512):
    """
    Tokenize the question and context for the transformer model.
    Ensure that both are combined and managed effectively.
    """
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,  # Add [CLS] and [SEP] tokens
        max_length=max_length,    # Set max length of input sequence
        padding="max_length",     # Pad to max length
        truncation=True,          # Truncate if longer than max length
        return_tensors="pt"      # Return as PyTorch tensors
    )
    return inputs

#==============================
# Chatbot code using embeddings
#==============================
# Updated find_best_context function using embeddings
def find_best_context(question, contexts):
    # Generate embedding for the question
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    print(f"Question embedding generated: {question_embedding}")

    # Generate embeddings for all contexts
    context_embeddings = embedding_model.encode(contexts, convert_to_tensor=True)
    print(f"Context embeddings generated with shape: {context_embeddings.shape}")

    cosine_scores = util.pytorch_cos_sim(question_embedding, context_embeddings)

    best_context_index = cosine_scores.argmax()
    best_context = contexts[best_context_index]
    best_score = cosine_scores[0][best_context_index].item()  # Get the score of the best context

    # Log details for debugging
    print(f"Best context selected: {best_context[:100]}...")  # Show the first 100 characters
    print(f"Cosine score of best context: {best_score}")

    return best_context

# Load your data
train_file_path = 'C:/USD/Natural Language Processing and GenAI AAI 520/Final Project/archive/train-v1.1.json'
dev_file_path = 'C:/USD/Natural Language Processing and GenAI AAI 520/Final Project/archive/dev-v1.1.json'

with open(train_file_path, "r") as train_file:
    train_data = json.load(train_file)
with open(dev_file_path, "r") as dev_file:
    dev_data = json.load(dev_file)

contexts = []
questions = []
answers = []
answer_positions = []

#============================================================
# Populate contexts, questions, answers, and answer positions
#============================================================
for article in train_data['data']:
    for paragraph in article['paragraphs']:
        # Preprocess the context before appending
        preprocessed_context = preprocess_text(paragraph['context'])
        contexts.append(preprocessed_context)  # Add preprocessed context to contexts list
        
        for qa in paragraph['qas']:
            # Preprocess the question
            preprocessed_question = preprocess_text(qa['question'])
            questions.append(preprocessed_question)  # Add preprocessed question to questions list
            
            for answer in qa['answers']:
                # Preprocess the answer
                preprocessed_answer = preprocess_text(answer['text'])
                answers.append(preprocessed_answer)  # Add preprocessed answer to answers list
                answer_positions.append(answer['answer_start'])  # Add answer start position

# Chatbot loop
while True:
    question = input("Ask a question: ").strip()

    if not question:
        print("Question cannot be empty. Please ask a valid question.")
        continue

    if question.lower() == "exit":
        print("Exiting the chatbot.")
        break

    preprocessed_question = preprocess_text(question)
    print(f"Preprocessed question: {preprocessed_question}")

    # Use a fixed context for testing
    test_context = "The Carolina Panthers lost linebacker Thomas Davis to an ACL injury during the 2015 preseason."

    # Log the context being used for inference
    print(f"Running inference with question: '{preprocessed_question}' and test context: '{test_context[:100]}...'")

    try:
        # Get the answer from the model
        result = qa_pipeline(question=preprocessed_question, context=test_context)
        print(f"Answer: {result['answer']}")
    except Exception as e:
        print(f"Error during model inference: {str(e)}")
    

#========================
# Visualization functions
#========================
def plot_visuals(contexts, questions, answers, answer_positions):
    
    # Plot 1: Distribution of Context Lengths
    context_lengths = [len(context) for context in contexts]
    plt.figure(figsize=(10, 6))
    sns.histplot(context_lengths, kde=True, bins=30)
    plt.title('Distribution of Context Lengths')
    plt.xlabel('Length of Context (in characters)')
    plt.ylabel('Frequency')
    plt.show()
    
    # Plot 2: Distribution of Question Lengths
    question_lengths = [len(question) for question in questions]
    plt.figure(figsize=(10, 6))
    sns.histplot(question_lengths, kde=True, bins=30, color='orange')
    plt.title('Distribution of Question Lengths')
    plt.xlabel('Length of Question (in characters)')
    plt.ylabel('Frequency')
    plt.show()
    
    # Plot 3: Distribution of Answer Lengths
    answer_lengths = [len(answer) for answer in answers]
    plt.figure(figsize=(10, 6))
    sns.histplot(answer_lengths, kde=True, bins=30, color='purple')
    plt.title('Distribution of Answer Lengths')
    plt.xlabel('Length of Answer (in characters)')
    plt.ylabel('Frequency')
    plt.show()
    
    # Plot 4: Distribution of Answer Start Positions in Context
    plt.figure(figsize=(10, 6))
    sns.histplot(answer_positions, kde=True, bins=30, color='teal')
    plt.title('Distribution of Answer Start Positions in Context')
    plt.xlabel('Answer Start Position')
    plt.ylabel('Frequency')
    plt.show()
    
    # Plot 5: Most Common First Words in Questions
    first_words = [question.split()[0].lower() for question in questions]
    word_count = pd.Series(first_words).value_counts().head(10)  # Top 10 most common words
    plt.figure(figsize=(10, 6))
    sns.barplot(x=word_count.index, y=word_count.values, palette='viridis')
    plt.title('Most Common First Words in Questions')
    plt.xlabel('First Word of Question')
    plt.ylabel('Count')
    plt.show()
    
    # Plot 6: Word Cloud for Contexts
    combined_contexts = ' '.join(contexts)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(combined_contexts)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Contexts')
    plt.show()
    
    # Plot 7: Word Cloud for Answers
    combined_answers = ' '.join(answers)
    wordcloud_answers = WordCloud(width=800, height=400, background_color='white').generate(combined_answers)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud_answers, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Answers')
    plt.show()
    
    # Plot 8: Comparison of Context Lengths and Answer Lengths
    context_answer_pairs = [(len(context), len(answer)) for context, answer in zip(contexts, answers)]
    df_lengths = pd.DataFrame(context_answer_pairs, columns=['Context Length', 'Answer Length'])
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Context Length', y='Answer Length', data=df_lengths, alpha=0.5, color='red')
    plt.title('Comparison of Context Lengths and Answer Lengths')
    plt.xlabel('Context Length (in characters)')
    plt.ylabel('Answer Length (in characters)')
    plt.show()

# Call the visualization function after chatbot
plot_visuals(contexts, questions, answers, answer_positions)

Preprocessed question: how many panthers players were chosen for the 2015 seasons pro bowl


KeyboardInterrupt: 