In [1]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import csv
import pandas as pd

# Load the JSON data
with open('RickDialogues_modified_2.json', 'r') as f:
    data = json.load(f)

# Prepare the data
conversations = data['conversations']

# Separate the conversations into human and gpt inputs
human_inputs = []
gpt_outputs = []

for i in range(len(conversations) - 1):
    if conversations[i]['from'] == 'human' and conversations[i + 1]['from'] == 'gpt':
        human_inputs.append(conversations[i]['value'])
        gpt_outputs.append(conversations[i + 1]['value'])

# Create a dataset suitable for fine-tuning
train_data = [f"Human: {h}\nGPT: {g}" for h, g in zip(human_inputs, gpt_outputs)]
dataset = Dataset.from_dict({'text': train_data})

# Load the tokenizer and model
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Assign EOS token as PAD token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=5000,
    fp16=True,  # Enable mixed precision training if supported by GPU
    learning_rate=5e-5
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("fine-tuned-gpt2")
tokenizer.save_pretrained("fine-tuned-gpt2")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 2108/2108 [00:00<00:00, 2257.98 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,3.2444,3.027454
2,3.0668,2.742224
3,2.7717,2.5193
4,2.6805,2.30962
5,2.4639,2.118656
6,2.3321,1.931975
7,2.2309,1.76058
8,2.0189,1.616752
9,1.9327,1.483811
10,1.8969,1.364288


('fine-tuned-gpt2/tokenizer_config.json',
 'fine-tuned-gpt2/special_tokens_map.json',
 'fine-tuned-gpt2/vocab.json',
 'fine-tuned-gpt2/merges.txt',
 'fine-tuned-gpt2/added_tokens.json')

In [2]:
# Load the fine-tuned model and tokenizer
model_name = "fine-tuned-gpt2"

def load_model_and_tokenizer(model_name):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def generate_response(prompt, model, tokenizer, max_length=256, num_return_sequences=1):
    inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def truncate_response(response, word_limit=150):
    words = response.split()
    if len(words) > word_limit:
        return ' '.join(words[:word_limit])
    return response

# Read questions from a CSV file
input_csv = 'filled_answers_finetunebase.csv'
output_csv = 'filled_answers_distilgpt2_after_finetuning_20.csv'

questions = pd.read_csv(input_csv)

# Create an empty DataFrame to store id, questions and responses
results = pd.DataFrame(columns=['id', 'question', 'answer'])

# Iterate over questions, generate responses, and store them
for index, row in questions.iterrows():
    prompt = f"Human: {row['question']}\nGPT:"
    model, tokenizer = load_model_and_tokenizer(model_name)  # Re-initialize the model and tokenizer
    response = generate_response(prompt, model, tokenizer)
    response_cleaned = response.replace(f"Human: {row['question']}\nGPT: ", "")  # Remove the 'Human:' and 'GPT:' parts
    truncated_response = truncate_response(response_cleaned, word_limit=150)
    new_row = pd.DataFrame({'id': [row['id']], 'question': [row['question']], 'answer': [truncated_response]})
    results = pd.concat([results, new_row], ignore_index=True)
    print(index, truncated_response)

# Ensure the results are in the same order as the original input
results.sort_values(by='id', inplace=True)

# Save the results to a new CSV file
results.to_csv(output_csv, index=False)
print(f'Responses saved to {output_csv}')

0 Uh, what's the difference? What's the difference? Amphetetron? What's the difference? Oxycontin? Amphetetron?  Oxygen? See, you're doing interdimensional stuff.  Yeah, look at that dingleberry commercial. I said, "Look at this. This will increase the chance for you to become an improviser." Look at that.  Ooh, boy.  Wow. Wow.  Wow. Wow, huh.  Wow, huh.  Wow.  Wow, huh.  Wow, huh.  Holy shit.  Holy crap.  Holy shit.  I said it loud and clear, Morty.  Morty, what's the difference? I mean, you need to be less motivated to make your stuff. Your stuff is gonna get less and less and less used to it. It's just a little bit of a hassle. I mean, what's the difference? I need to go through a whole bunch of interdimensional design and conceptual stuff, and then go through a bunch of interdimensional nonsense, Morty. It's just stuff I needed to do
1 *adjusts his own headset as he stands in the garage* Well, you're almost there, Morty. What do you say, Morty? Well, take his portal gun, Morty! I?'