In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.6 MB/s[0m eta [36m0:00:0

In [6]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import corpus_bleu
from torch.nn.functional import cross_entropy
from sklearn.model_selection import train_test_split
import numpy as np



In [12]:

# Load and preprocess the dataset
file_path = '/content/drive/MyDrive/genre.csv'
data = pd.read_csv(file_path)
data = data.dropna()
conversations = data['text'].tolist()


In [13]:
# Split the dataset into train and test sets
train_conversations, test_conversations = train_test_split(conversations, test_size=0.2, random_state=42)

# Pre-trained model and tokenizer
model_name = 'gpt2-medium'
model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=50256)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


In [14]:
# Generate a single response
def generate_response(input_text, max_length=100, temperature=1.0, top_k=50):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    attention_mask = torch.ones_like(input_ids)
    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask,
                                max_length=max_length, temperature=temperature,
                                top_k=top_k)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


In [15]:
# post-processing
def post_process_response(response):
    # Filter out overly long responses
    response = " ".join(response.split()[:50])

    # Capitalize the first letter
    response = response.capitalize()

    return response

In [18]:

# Store reference responses and bot responses
reference_responses = []
bot_responses = []
train_references = []
train_hypotheses = []
test_references = []
test_hypotheses = []

# Initialize empty lists to store training and testing input IDs
train_input_ids = []
test_input_ids = []

def simulate_conversation(conversations_list, references_list, hypotheses_list):
    # Iterate through conversations and generate responses
    for conversation in conversations_list:
        context_window = []

        while True:
            user_input = input("You: ")  # Take user input

            if user_input.strip().lower() == 'x':
                break

            context_window.append("You: " + user_input)
            if len(context_window) > 5:
                context_window.pop(0)

            prompt = context_window[-1][4:]
            response = generate_response(prompt, max_length=150, temperature=0.8, top_k=50)
            response_sentence = response.split('.')[0]
            response_sentence = post_process_response(response_sentence)

            print("Bot:", response_sentence.strip())

            # Append to references and hypotheses lists
            references_list.append(user_input)
            hypotheses_list.append(response_sentence)
            reference_responses.append(response_sentence)
            bot_responses.append(response_sentence)

        exit_loop_input = input("Enter 'exit' to stop iterating through conversations, or press Enter to continue: ")
        if exit_loop_input.strip().lower() == 'exit':
            print("Exiting conversation loop.")
            break


simulate_conversation(train_conversations, train_references, train_hypotheses)
simulate_conversation(test_conversations, test_references, test_hypotheses)

# Calculate BLEU scores
references = [[ref.split()] for ref in reference_responses]
hypotheses = [response.split() for response in bot_responses]

bleu_score = corpus_bleu(references, hypotheses)
print(f"Overall BLEU Score: {bleu_score:.4f}")

You: fact
Bot: Fact that the government has been able to keep the public in the dark about the extent of the nsa's surveillance programs is a testament to the government's ability to keep the public in the dark about its own activities
You: back
Bot: Back to the beginning of the game
You: option
Bot: Option to use the same password for both accounts
You: train
Bot: Train
You: actual target
Bot: Actual target
You: take care
Bot: Take care of the rest
You: ensure the
Bot: Ensure the safety of the public
You: I apologize
Bot: I apologize for the delay in posting this, but i have been working on a new project and i have been unable to get it done
You: Let's try
Bot: Let's try to get a sense of what's going on here
You: seems to
Bot: Seems to be a very good thing
You: which is
Bot: Which is a very good thing
You: This is the
Bot: This is the first time that the government has been able to use the power of the purse to force a company to pay back money it owes
You: has been able
Bot: Has bee

In [19]:

# Calculate Cross-Entropy Loss for Training and Test Set Responses
def calculate_loss(logits, target_ids):
    loss = torch.nn.functional.cross_entropy(logits, target_ids, reduction='sum')
    return loss

total_train_loss = 0.0
num_train_tokens = 0

total_test_loss = 0.0
num_test_tokens = 0

# For both training and test sets
for references, hypotheses in zip([train_references, test_references], [train_hypotheses, test_hypotheses]):
    for reference, hypothesis in zip(references, hypotheses):
        reference_ids = [tokenizer.convert_tokens_to_ids(ref) for ref in reference.split()]
        hypothesis_ids = [tokenizer.convert_tokens_to_ids(hyp) for hyp in hypothesis.split()]

        # Ensure that reference and hypothesis have the same length
        min_length = min(len(reference_ids), len(hypothesis_ids))
        reference_ids = reference_ids[:min_length]
        hypothesis_ids = hypothesis_ids[:min_length]

        reference_tensor = torch.tensor(reference_ids).unsqueeze(0)
        hypothesis_tensor = torch.tensor(hypothesis_ids).unsqueeze(0)

        # Get logits
        reference_logits = model(input_ids=reference_tensor).logits.squeeze(0)
        hypothesis_logits = model(input_ids=hypothesis_tensor).logits.squeeze(0)

        # Calculate loss for both reference and hypothesis
        train_loss = calculate_loss(reference_logits, torch.tensor(hypothesis_ids))
        test_loss = calculate_loss(hypothesis_logits, torch.tensor(hypothesis_ids))

        total_train_loss += train_loss.item()
        total_test_loss += test_loss.item()

        num_train_tokens += len(hypothesis_ids)
        num_test_tokens += len(hypothesis_ids)

average_train_loss = total_train_loss / num_train_tokens
average_test_loss = total_test_loss / num_test_tokens

print(f"Average Cross-Entropy Loss (Training Set): {average_train_loss:.4f}")
print(f"Average Cross-Entropy Loss (Test Set): {average_test_loss:.4f}")


Average Cross-Entropy Loss (Training Set): 10.4506
Average Cross-Entropy Loss (Test Set): 10.1373


In [20]:
# Save the trained model
output_dir = '/content/drive/MyDrive/NLP_trained_chatbot_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Trained model and tokenizer saved to {output_dir}")

Trained model and tokenizer saved to /content/drive/MyDrive/NLP_trained_chatbot_model
