# Final Project

In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Show all columns
pd.set_option('display.max_columns', None)

# Show all rows (adjust if needed, can use 'None' for no limit)
pd.set_option('display.max_rows', None)

# Set the maximum width of each column (adjust if needed)
pd.set_option('display.max_colwidth', None)

In [4]:
def load_movie_lines(lines_file_path):
    lines = {}
    with open(lines_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 5:
                line_id, character_id, movie_id, character_name, text = parts
                lines[line_id] = {
                    "character_id": character_id,
                    "movie_id": movie_id,
                    "character_name": character_name,
                    "text": text
                }
    return lines


In [5]:
def load_movie_conversations(conversations_file_path):
    conversations = []
    with open(conversations_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 4:
                character1_id, character2_id, movie_id, utterance_ids = parts
                utterance_ids = utterance_ids[1:-1].replace("'", "").split(', ')
                conversations.append({
                    "character1_id": character1_id,
                    "character2_id": character2_id,
                    "movie_id": movie_id,
                    "utterances": utterance_ids
                })
    return conversations

In [6]:
def load_movie_metadata(metadata_file_path):
    movie_metadata = {}
    with open(metadata_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) > 5:
                movie_id, title, genre = parts[0], parts[1], parts[5]
                movie_metadata[movie_id] = genre
    return movie_metadata

In [7]:
def get_sentiment(text):
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity  # Returns a value between -1 (negative) and 1 (positive)

In [8]:
# lines_file = "./Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
# conversation_file = "./Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"
# metadata_file = "./Cornell_Movie_Dialogue_Corpus/movie_titles_metadata.txt"
lines_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"
metadata_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_titles_metadata.txt"

lines = load_movie_lines(lines_file)
conversations = load_movie_conversations(conversation_file)
movie_metadata = load_movie_metadata(metadata_file)

In [None]:
# Build character ID to name mapping
character_id_to_name = {}

for line in lines.values():
    character_id = line['character_id']
    character_name = line['character_name']
    character_id_to_name[character_id] = character_name

# Initialize empty list to store conversation data
data = []

# Set the maximum history length
MAX_HISTORY_LENGTH = 5

# Define a special token for empty history
EMPTY_HISTORY_TOKEN = "[START]"

# Loop through each conversation block
for conv in conversations:
    utterance_ids = conv['utterances']
    character1_id = conv['character1_id']
    character2_id = conv['character2_id']
    character1_name = character_id_to_name.get(character1_id)
    character2_name = character_id_to_name.get(character2_id)

    # Skip conversation if character names are missing
    if not character1_name or not character2_name:
        continue

    # Initialize full conversation history
    full_conversation_history = []

    # Loop through the utterances to build conversation history and response for each block
    for i in range(len(utterance_ids)):
        response_id = utterance_ids[i]  # Current response ID

        # Ensure the response_id is in the lines dictionary
        if response_id in lines:
            response_line = lines[response_id]
            response = response_line['text']
            character_2 = response_line['character_name']  # Current speaker

            # Determine character_1 as the other character in the conversation
            if character_2 == character1_name:
                character_1 = character2_name
            else:
                character_1 = character1_name

            # Build the conversation history using a sliding window
            start_idx = max(0, i - MAX_HISTORY_LENGTH)
            history_ids = utterance_ids[start_idx:i]

            # Check if history is empty
            if not history_ids:
                conversation_history = [EMPTY_HISTORY_TOKEN]
            else:
                conversation_history = [lines[utt_id]['text'] for utt_id in history_ids if utt_id in lines]

            # Determine the history length, setting it to 0 if [START] is in conversation_history
            if conversation_history == [EMPTY_HISTORY_TOKEN]:
                history_length = 0
            else:
                history_length = len(conversation_history)

            # Append the conversation and its context to the data
            data.append({
                "movie_id": conv['movie_id'],
                "character_1": character_1,  # Person being addressed
                "character_2": character_2,  # Current speaker
                "conversation_history": conversation_history,
                "history_length": history_length,
                "response": response,
                "genre": movie_metadata.get(conv['movie_id'], [])
            })

            # Update full conversation history
            full_conversation_history.append(response)
        else:
            continue  # Skip if the response_id is missing in lines


In [None]:
# Convert to DataFrame
df = pd.DataFrame(data)

# Add Genre as meta data
df['genre'] = df['movie_id'].map(movie_metadata)

# Add sentiment to response
df['sentiment'] = df['response'].apply(get_sentiment)

# Display the DataFrame
#df.head(130:140)
df.iloc[128:141]

In [None]:
df.to_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/cleaned_conversations_no_genre.csv", index=False)

#"/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/cleaned_conversations_ohe.csv"

## EDA

In [None]:
#import matplotlib as plt
import matplotlib.pyplot as plt
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import seaborn as sns
import ast
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Set seaborn style
sns.set_theme(style='whitegrid')

In [None]:
df.info()

In [None]:
# Convert Pandas DataFrame to Dask DataFrame
# Dask was needed due to the complexity with the conversations and its ability to parallel
ddf = dd.from_pandas(df, npartitions=10)  # Adjust npartitions based on system capacity

# Enable progress bar to monitor the process
with ProgressBar():
    # Run describe on all columns, including text-heavy ones
    summary = ddf.describe(include='all').compute()

# Display the summary statistics
print(summary)

The dataset highlights brief interaction patterns and a diversity of responses, which are crucial for training models to understand and generate contextually appropriate dialogue. With a wide variety of genres and character dynamics represented, particularly from dramatic contexts, the model can be better equipped to manage nuanced and emotionally varied conversations, reflecting real-world scenarios.

In [None]:
# Step 1: Correct history length for true zero-length histories (conversation_history == [] and response is not empty)
df['history_length'] = df.apply(lambda row: 0 if row['conversation_history'] == [] and row['response'].strip() != '' else row['history_length'], axis=1)

# Step 2: Exclude entries where conversation history is ["START"]
filtered_df = df[df['conversation_history'].apply(lambda x: x != ["START"])]

# Step 3: Plot the distribution of conversation history lengths
plt.figure(figsize=(8, 6))
sns_plot = sns.countplot(x='history_length', data=filtered_df)

# Add counts on top of each bar
for p in sns_plot.patches:
    sns_plot.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha='center', va='baseline', fontsize=10, color='black', xytext=(0, 5),
                      textcoords='offset points')

# Add labels and title
plt.title('Distribution of Conversation History Length (Excluding ["START"])')
plt.xlabel('History Length')
plt.ylabel('Count')

plt.show()





- 0 = 45
- 1 = 38011
- 2 = 17430
- 3 = 9400

4 and 5 are unkonwn due to the rolling window which spikes up 5 but 4 would be lower than 18150


The high counts at 0 and 1 history length suggest that many conversations are either starting fresh or rely on minimal context, indicating that the chatbot needs to be adept at handling short interactions. The high 0 is due to all conversation starting at 0. The 45 discrepancy between 0 and 1 show that there are very minimal conversations that have absolutely no context. The spike at 5 history length likely reflects a rolling window effect, where conversations are capped at the most recent five turns. This ensures the chatbot focuses on the latest dialogue, optimizing response generation without handling overly long histories.

In [None]:
# Combine character_1 and character_2 to find most frequent characters
character_counts = pd.concat([df['character_1'], df['character_2']]).value_counts().head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x=character_counts.values, y=character_counts.index)
plt.title('Top 20 Most Frequent Characters')
plt.xlabel('Number of Lines')
plt.ylabel('Character Name')
plt.show()


Jack leads significantly with over 6,000 lines, followed by Joe and George, who both have under 4,000 lines. The distribution suggests that a few characters dominate the dataset, which may indicate that these characters are central to many conversations. The model will likely need to handle frequent interactions involving these high-volume characters, ensuring it can respond consistently and contextually to their dialogues.

In [None]:
# Convert the genre strings to actual lists
df['genre'] = df['genre'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Step 1: Explode the 'genre' column so each genre gets its own row
df_exploded = df.explode('genre')

# Step 2: Count the occurrences of each genre
#genre_counts = df_exploded['genre'].value_counts().head(20)  # Limit to top 20 genres
genre_counts = df_exploded['genre'].value_counts() # Limit to top 20 genres

# Step 3: Plot the genre distribution as a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette="coolwarm")
plt.title('Top 20 Genres by Number of Conversations')
plt.xlabel('Number of Conversations')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()


Drama, thriller, and comedy dominate the list, with drama leading by a significant margin, followed by thriller. These three genres make up the majority of conversations, indicating that the dataset skews toward emotionally intense and suspenseful dialogue. The model will need to adapt to different tones and conversation styles, especially handling serious and suspenseful conversations more frequently than others. Other genres like romance and crime also have a notable presence, adding further variety to the interactions.

In [None]:
# Add a column for response length
df['response_length'] = df['response'].apply(lambda x: len(x.split()))

plt.figure(figsize=(8, 6))
sns.scatterplot(x='history_length', y='response_length', data=df)
plt.title('History Length vs. Response Length')
plt.xlabel('History Length')
plt.ylabel('Response Length (words)')
plt.show()

# need to make sure this is words and not characters

This scatter plot shows the relationship between history length (number of previous turns) and response length (in words). Most responses, regardless of history length, tend to be concise, clustering below 100 words. However, there are outliers where responses exceed 300 or even 500 words, especially when the history length is 0 or 1. This suggests that while most responses are brief, some dialogues start with a long monologue or have more detailed responses, which may require the model to handle a wide range of response lengths effectively.

In [None]:
stop_words = set(stopwords.words('english'))

# Combine all responses into one string
all_responses = ' '.join(df['response'].tolist())

# Tokenize and remove stop words
words = [word.lower() for word in all_responses.split() if word.isalpha() and word.lower() not in stop_words]

word_counts = Counter(words).most_common(20)

# Convert to DataFrame for plotting
words_df = pd.DataFrame(word_counts, columns=['word', 'count'])

plt.figure(figsize=(12, 6))
sns.barplot(x='count', y='word', data=words_df)
plt.title('Top 20 Most Common Words in Responses')
plt.xlabel('Count')
plt.ylabel('Word')
plt.show()


This bar chart displays the Top 20 most common words found in responses after removing stopwords. Words like "like," "get," and "know" are the most frequent, indicating that the conversations often involve casual, everyday dialogue. Many of the words, such as "want," "think," and "going," suggest that a significant portion of the conversations are centered around decision-making and opinions. The frequency of these basic verbs and pronouns highlights the conversational nature of the dataset.

In [None]:

# Plotting the sentiment distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment'], bins=20, kde=True)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.axvline(x=0, color='red', linestyle='--')  # Add a vertical line at sentiment = 0 for reference
plt.show()


The data primarily deals with neutral dialogues. While strong emotions are present, they are less frequent, so the model should prioritize neutral sentiment understanding with occasional adjustments for emotional responses.

In [None]:
# Extract bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
X = vectorizer.fit_transform(df['response'])
bigram_counts = {word: count for word, count in zip(vectorizer.get_feature_names_out(), X.sum(axis=0).A1)}

# Get the top 20 bigrams
top_bigrams = Counter(bigram_counts).most_common(20)

# Convert to DataFrame for plotting
bigram_df = pd.DataFrame(top_bigrams, columns=['bigram', 'count'])

# Plot the top 20 bigrams
plt.figure(figsize=(12, 6))
sns.barplot(x='count', y='bigram', data=bigram_df)
plt.title('Top 20 Bigrams in Responses')
plt.xlabel('Count')
plt.ylabel('Bigram')
plt.tight_layout()
plt.show()


The frequent appearance of these common phrases suggests that the dataset is filled with casual, informal speech, which is typical for movie dialogues. The model will need to handle such conversational patterns, particularly those involving negative expressions and common phrases about knowledge or intention.

In [None]:
# Reset the index after exploding to avoid issues with duplicate indices
df_exploded = df.explode('genre').reset_index(drop=True)

plt.figure(figsize=(18, 9))
sns.boxplot(x='genre', y='response_length', data=df_exploded)
plt.title('Response Length by Genre')
plt.xlabel('Genre')
plt.ylabel('Response Length (words)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


The model will need to handle both short responses (which dominate the dataset) and occasional long responses that vary by genre. The presence of long outliers in certain genres like drama and history suggests that more emotionally or narratively complex genres may require more detailed and nuanced responses.

In [None]:
# Combine all responses into one string
all_responses = ' '.join(df['response'].tolist())

# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_responses)

# Plot the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Responses')
plt.show()


This word cloud highlights the most common words in responses, with terms like "know," "think," and "want" standing out. These words suggest that many dialogues involve expressions of thought, desire, or action. The chatbot will need to handle everyday conversational language, focusing on these common themes in dialogue.

In [None]:
# Example of tagging one response
df['pos_tags'] = df['response'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))

# Count the frequency of each POS tag
pos_counts = Counter([tag for tags in df['pos_tags'] for _, tag in tags])

# Convert to DataFrame for plotting
pos_df = pd.DataFrame(pos_counts.items(), columns=['POS', 'count'])

# Plot the POS tag distribution
plt.figure(figsize=(18, 9))
sns.barplot(x='count', y='POS', data=pos_df)
plt.title('Part of Speech Tag Distribution')
plt.xlabel('Count')
plt.ylabel('POS Tag')
plt.tight_layout()
plt.show()

# Drop the 'pos_tags' column after plotting
df.drop(columns=['pos_tags'], inplace=True)


The **Part of Speech (POS) Tag Distribution** graph highlights the dominance of nouns (NN), verbs (VB), and pronouns (PRP) in the dataset, suggesting a strong focus on actions and entities in the dialogues. Proper nouns (NNP) also appear frequently, indicating a focus on specific names and entities. The diversity of POS tags, including adjectives, conjunctions, and determiners, reflects the varied linguistic structure in the conversations.

In [None]:
# Select numerical columns (including any other numeric features like sentiment, response length, etc.)
numerical_columns = ['history_length', 'response_length', 'sentiment']  # Add more columns as needed

# Compute the correlation matrix
corr_matrix = df[numerical_columns].corr()

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5, vmin=-1, vmax=1)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()


The Feature Correlation Heatmap shows weak correlations between the features in the dataset: history length, response length, and sentiment. The correlation between history length and response length is minimal at 0.047, indicating that longer conversation histories do not significantly influence response lengths. The correlation between history length and sentiment is nearly negligible at 0.0087, suggesting that sentiment is independent of conversation length. Overall, the heatmap indicates that these features are largely uncorrelated.

### Summary of EDA

Our analysis of the Cornell Movie-Dialog Corpus revealed several important insights regarding the structure and content of the dataset. The **conversation history length** showed that most dialogues are short, with 1 turn exchanges dominating, but a significant portion also had a rolling window of up to 5 turns, indicating some complexity in multi-turn conversations. The **POS tag distribution** demonstrated that the dataset is rich in nouns, verbs, and proper nouns, reflecting a focus on specific entities and actions in the dialogues. Moreover, the **word frequency and bigram analyses** highlighted the conversational nature of the dataset, with common informal phrases like "don’t know" and "got it" dominating the responses.

The **sentiment distribution** revealed that most responses are neutral, with a concentration around 0, indicating that dialogues are not highly polarized. Meanwhile, the **response lengths** varied across genres but generally tended to be short, with occasional longer responses scattered across different genres. The **feature correlation analysis** showed weak or negligible correlations between history length, response length, and sentiment, suggesting that these factors operate independently in the dataset.

### Best GenAI Model for This Task:

Given the nature of the dataset—dominated by short, informal conversations with a range of context lengths—a **Seq2Seq model** such as **T5 (Text-to-Text Transfer Transformer)** would be well-suited for this task. The model’s flexibility in handling multi-turn dialogues and generating coherent responses makes it a strong choice for this dataset. T5's ability to handle text generation, alongside its capacity to condition on conversation history, will allow the chatbot to maintain contextual relevance across multiple turns. Additionally, fine-tuning T5 on this dataset would help the model understand the conversational dynamics and nuances, such as sentiment shifts and genre-specific responses.

In summary, the **T5 model** offers the right balance of flexibility, generative capacity, and contextual understanding, making it the ideal choice for developing a conversational AI chatbot based on the Cornell Movie-Dialog Corpus.

## T5 Modeling

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW
from torch.profiler import profile, record_function, ProfilerActivity

tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
def preprocess_data(conversation_history, response, sentiment):
    input_text = " ".join(conversation_history) + f" Sentiment: {sentiment}"
    target_text = response
    return input_text, target_text

In [None]:
df['input'], df['target'] = zip(*df.apply(lambda row: preprocess_data(row['conversation_history'], row['response'], row['sentiment']), axis=1))

# Split the data into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
class ConversationDataset(Dataset):
  def __init__(self, tokenizer, df, max_length=512):
    self.tokenizer = tokenizer
    self.data = df
    self.max_length = max_length

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    input_text = self.data.iloc[idx]['input']
    target_text = self.data.iloc[idx]['target']

    # Tokenize input and target

    input_encodings = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
    target_encodings = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

    return {
        'input_ids': input_encodings['input_ids'].flatten(),
        'attention_mask': input_encodings['attention_mask'].flatten(),
        'labels': target_encodings['input_ids'].flatten()
    }

In [None]:
# Create datasets
train_dataset = ConversationDataset(tokenizer, train_df)
val_dataset = ConversationDataset(tokenizer, val_df)
test_dataset = ConversationDataset(tokenizer, test_df)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to(device)


# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Gradient accumulation settings
accumulation_steps = 4  # Number of steps to accumulate before updating gradients

# Initialize GradScaler for mixed precision training
scaler = GradScaler(device)

# Scheduler to reduce learning rate if validation loss plateaus
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

# Training loop with gradient accumulation
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    start_time = time.time()  # Record start time of the epoch

    # Training loop with tqdm progress bar
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

    for i, batch in enumerate(progress_bar):
        optimizer.zero_grad()

        # Move tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass with autocast for mixed precision
        with autocast('cuda'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / accumulation_steps  # Divide loss by accumulation steps

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()

        # Accumulate gradients and update weights after `accumulation_steps` mini-batches
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)  # Unscales gradients and performs optimizer step
            scaler.update()
            optimizer.zero_grad()

        running_loss += loss.item() * accumulation_steps  # Accumulate the original loss

        # Update progress bar with the current loss
        progress_bar.set_postfix({'loss': loss.item()})

    # Calculate time taken for the epoch
    epoch_time = time.time() - start_time
    avg_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Average loss: {avg_loss:.4f}. Time taken: {epoch_time:.2f} seconds.")

    # Validation after each epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Validation Loss after Epoch {epoch + 1}: {avg_val_loss:.4f}")

    # Step the scheduler based on validation loss
    scheduler.step(avg_val_loss)



things i've test to make it faster
  * batch size has to be at 16 due to limitations of computation
  * implememented gradient accumulation to help cut time
  * Mixed precision training FP16
  * Reduce LR on Plateau
  * need to test torch profiler
  * A100 GPU works best
  * works with 48 Unites
  * added sentiment
  * added classification of multilabels


In [None]:
# Saving the model
model_save_path = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/model_w_sent_t-5base.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
# Testing phase
model.eval()
test_loss = 0.0
predictions = []
true_labels = []

start_time = time.time()  # Record start time for the evaluation

with torch.no_grad():
    # Use tqdm to monitor progress
    progress_bar = tqdm(test_dataloader, desc="Evaluating", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss.item()

        # Store the predicted and true labels for further analysis
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate average test loss
avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss:.4f}")

# # Convert predictions and true_labels back to text
# predicted_texts = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
# true_texts = [tokenizer.decode(true, skip_special_tokens=True) for true in true_labels]

# Calculate and print evaluation time
evaluation_time = time.time() - start_time
print(f"Evaluation completed in {evaluation_time:.2f} seconds.")



In [None]:
# Calculate perplexity
perplexity = torch.exp(torch.tensor(avg_test_loss))
print(f"Perplexity: {perplexity.item():.4f}")

In [None]:
# Needed for Bleu Rogue and more. Doing it separate to not mess with other test

# Start timing for the decoding process
start_time = time.time()

# Decode predicted texts
predicted_texts = []
for pred in tqdm(predictions, desc="Decoding Predicted Texts"):
    predicted_text = tokenizer.decode(pred, skip_special_tokens=True)
    predicted_texts.append(predicted_text)

# Decode true texts
true_texts = []
for true in tqdm(true_labels, desc="Decoding True Texts"):
    true_text = tokenizer.decode(true, skip_special_tokens=True)  # No need for double brackets here
    true_texts.append(true_text)

# Calculate and print total decoding time
total_decoding_time = time.time() - start_time
print(f"Decoding completed in {total_decoding_time:.2f} seconds.")


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Start timing for the BLEU score calculation
start_time = time.time()

# Initialize a list to store BLEU scores for each prediction
bleu_scores = []

# Create an instance of the smoothing function
smoothing_function = SmoothingFunction()

# Calculate BLEU scores with smoothing using unigram (1-gram) only
for pred_text, true_text in tqdm(zip(predicted_texts, true_texts), total=len(predicted_texts), desc="Calculating BLEU Score"):
    score = sentence_bleu([[true_text]], pred_text.split(), weights=(1, 0, 0, 0), smoothing_function=smoothing_function.method1)  # Unigram only
    bleu_scores.append(score)

# Calculate average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

# Print results
print(f"Average BLEU Score (Unigram): {average_bleu_score:.4f}")

# Calculate and print total BLEU score calculation time
total_bleu_time = time.time() - start_time
print(f"BLEU score calculation completed in {total_bleu_time:.2f} seconds.")


In [None]:
for i in range(10):
    pred_tokens = set(predicted_texts[i].split())
    true_tokens = set(true_texts[i].split())
    overlap = pred_tokens.intersection(true_tokens)
    print(f"Predicted Tokens: {pred_tokens}")
    print(f"True Tokens: {true_tokens}")
    print(f"Overlap: {overlap}")
    print("---")


In [None]:
pip install rouge

In [None]:
from rouge import Rouge

# Initialize ROUGE scorer
rouge = Rouge()

# Start timing for the ROUGE score calculation
start_time = time.time()

# Initialize lists to store ROUGE scores
rouge_scores = []

# Calculate ROUGE scores
for pred_text, true_text in tqdm(zip(predicted_texts, true_texts), total=len(predicted_texts), desc="Calculating ROUGE Score"):
    score = rouge.get_scores(pred_text, true_text, avg=True)  # Calculate ROUGE scores
    rouge_scores.append(score)

# Convert the list of scores into a summary
average_rouge = {
    'rouge-1': sum(score['rouge-1']['f'] for score in rouge_scores) / len(rouge_scores),
    'rouge-2': sum(score['rouge-2']['f'] for score in rouge_scores) / len(rouge_scores),
    'rouge-l': sum(score['rouge-l']['f'] for score in rouge_scores) / len(rouge_scores),
}

# Print results
print("Average ROUGE Scores:")
print(f"ROUGE-1: {average_rouge['rouge-1']:.4f}")
print(f"ROUGE-2: {average_rouge['rouge-2']:.4f}")
print(f"ROUGE-L: {average_rouge['rouge-l']:.4f}")

# Calculate and print total ROUGE score calculation time
total_rouge_time = time.time() - start_time
print(f"ROUGE score calculation completed in {total_rouge_time:.2f} seconds.")


In [None]:
from nltk.translate.meteor_score import meteor_score
nltk.download('wordnet')

# Start timing for the METEOR score calculation
start_time = time.time()

# Initialize a list to store METEOR scores
meteor_scores = []

# Calculate METEOR scores
for pred_text, true_text in tqdm(zip(predicted_texts, true_texts), total=len(predicted_texts), desc="Calculating METEOR Score"):
    # Tokenize the texts
    pred_tokens = pred_text.split()  # Tokenize predicted text
    true_tokens = true_text.split()   # Tokenize true text

    # Calculate METEOR score
    score = meteor_score([true_tokens], pred_tokens)  # Wrap true_tokens in a list for references
    meteor_scores.append(score)

# Calculate average METEOR score
average_meteor_score = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

# Print results
print(f"Average METEOR Score: {average_meteor_score:.4f}")

# Calculate and print total METEOR score calculation time
total_meteor_time = time.time() - start_time
print(f"METEOR score calculation completed in {total_meteor_time:.2f} seconds.")



In [None]:

# Set the model to evaluation mode and move it to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Define a function to generate responses
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)  # Move input_ids to the same device as the model

    with torch.no_grad():  # Disable gradient calculation for inference
        output = model.generate(input_ids, max_length=50, num_return_sequences=1)  # Adjust max_length as needed

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Continuous interaction loop
print("Chatbot is ready! Type 'exit' to end the conversation.")
while True:
    input_text = input("You: ")  # Get user input
    if input_text.lower() == 'exit':
        print("Ending conversation.")
        break  # Exit the loop if the user types 'exit'

    generated_response = generate_response(input_text)  # Generate a response
    print(f"Chatbot: {generated_response}")  # Print the response
