In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set the file paths in Google Drive
movie_conversations_path = '/content/drive/My Drive/Colab Notebooks/Cornell/movie_conversations.txt'
movie_lines_path = '/content/drive/My Drive/Colab Notebooks/Cornell/movie_lines.txt'

# Function to load the movie lines from the movie_lines.txt file
def load_movie_lines(movie_lines_path):
    lines = {}
    # Each line in the file has the following format: lineID +++$+++ characterID +++$+++ movieID +++$+++ character name +++$+++ text
    with open(movie_lines_path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.split(" +++$+++ ")
            if len(parts) == 5:
                line_id, text = parts[0], parts[4].strip()  # lineID and actual text of the line
                lines[line_id] = text
    return lines

# Function to load the conversation structure from the movie_conversations.txt file
def load_conversations(movie_conversations_path):
    conversations = []
    # Each line has the following format: characterID1 +++$+++ characterID2 +++$+++ movieID +++$+++ ['lineID1','lineID2',..., 'lineIDN']
    with open(movie_conversations_path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.split(" +++$+++ ")
            if len(parts) == 4:
                line_ids_str = parts[3].strip()  # Contains the line IDs in a string format
                line_ids = eval(line_ids_str)  # Convert string to actual list of line IDs
                conversations.append(line_ids)
    return conversations

# Function to create input-response pairs for chatbot training from the conversations and lines
def create_conversation_pairs(lines, conversations):
    conversation_pairs = []
    for conv in conversations:
        for i in range(len(conv) - 1):
            input_line = lines.get(conv[i], "")  # Get the input line
            output_line = lines.get(conv[i + 1], "")  # Get the response line
            if input_line and output_line:  # Only add pairs if both exist
                conversation_pairs.append((input_line, output_line))
    return conversation_pairs

# Function to save conversation pairs to a CSV file
def save_conversation_pairs(conversation_pairs, output_path='/content/drive/My Drive/Colab Notebooks/Cornell/conversation_pairs.csv'):
    import pandas as pd
    df = pd.DataFrame(conversation_pairs, columns=['input', 'response'])
    df.to_csv(output_path, index=False)
    print(f"Saved {len(conversation_pairs)} conversation pairs to {output_path}")

# Load movie lines and conversations from Google Drive
lines = load_movie_lines(movie_lines_path)
conversations = load_conversations(movie_conversations_path)

# Create conversation pairs
conversation_pairs = create_conversation_pairs(lines, conversations)

# Save the conversation pairs to a CSV file in Google Drive
save_conversation_pairs(conversation_pairs)

print(f"Preprocessing complete! Total pairs: {len(conversation_pairs)}")


Mounted at /content/drive
Saved 221282 conversation pairs to /content/drive/My Drive/Colab Notebooks/Cornell/conversation_pairs.csv
Preprocessing complete! Total pairs: 221282


In [4]:
# Install required libraries
!pip install transformers datasets

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load the conversation pairs from Google Drive
conversation_pairs_path = '/content/drive/My Drive/Colab Notebooks/Cornell/conversation_pairs.csv'
conversation_df = pd.read_csv(conversation_pairs_path)

# Preview the data
conversation_df.head()

# Combine input and response into a single training format
def format_conversations(df):
    formatted_data = []
    for i in range(len(df)):
        input_text = df.loc[i, 'input']
        response_text = df.loc[i, 'response']
        # Join input and response for training as one block of conversation
        formatted_data.append(f"User: {input_text}\nBot: {response_text}\n")
    return formatted_data

# Format the conversations for training
formatted_conversations = format_conversations(conversation_df)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({"text": formatted_conversations})

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add a padding token (set it to the EOS token)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset and set 'labels' for loss computation
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Set labels to be the same as input_ids, which GPT-2 uses for predicting the next word
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="/content/drive/My Drive/Colab Notebooks/Cornell/logs",
    logging_steps=500,
    fp16=True,  # Enable mixed precision training
)

# Trainer class for fine-tuning GPT-2
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model to Google Drive after training
model.save_pretrained('/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot')

# Save the tokenizer as well
tokenizer.save_pretrained('/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Map:   0%|          | 0/221282 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.421
1000,0.2139
1500,0.214
2000,0.2138
2500,0.2105
3000,0.2108
3500,0.2108
4000,0.2121
4500,0.2059
5000,0.2078


('/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot/tokenizer_config.json',
 '/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot/special_tokens_map.json',
 '/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot/vocab.json',
 '/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot/merges.txt',
 '/content/drive/My Drive/Colab Notebooks/Cornell/gpt2-chatbot/added_tokens.json')