In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

# Load the preprocessed dataset
file_path = '/content/drive/MyDrive/WGAN_Dataset/preprocessed_chat_data_part_1.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Check the overall structure of the dataset
print(data.info())

Mounted at /content/drive
                                       human_message  \
0  I have been feeling so sad and overwhelmed lat...   
1  I recently got a promotion at work which I tho...   
2  Well the workload has increased significantly ...   
3  I have been trying to prioritize my tasks and ...   
4  Youre right. I have not really opened up about...   

                                           gpt_reply  
0  Hey there I am here to listen and support you....  
1  I can understand how it can be overwhelming wh...  
2  It sounds like youre dealing with a lot of pre...  
3  Its great to hear that youre already implement...  
4  Its completely normal to feel that way but rem...  
human_message    0
gpt_reply        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807085 entries, 0 to 807084
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   human_message  807085 non-null  object
 1   gpt_reply

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the preprocessed dataset
file_path = '/content/drive/MyDrive/WGAN_Dataset/preprocessed_chat_data_part_1.csv'
data = pd.read_csv(file_path)

# Take a random sample of 10,000 rows
subset_data = data.sample(n=10000, random_state=42)

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set padding token explicitly
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the data: Tokenization and Dataset Preparation
def preprocess_data(human_message, gpt_reply):
    inputs = tokenizer(human_message, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = tokenizer(gpt_reply, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply preprocessing to the subset data
processed_data = subset_data.apply(lambda row: preprocess_data(row['human_message'], row['gpt_reply']), axis=1)

# Convert processed data into a list of dictionaries suitable for PyTorch Dataset
dataset = [{'input_ids': data['input_ids'].squeeze(), 'labels': data['labels'].squeeze()} for data in processed_data]

# Split the data into training and evaluation sets
train_size = int(0.9 * len(dataset))
train_dataset, eval_dataset = dataset[:train_size], dataset[train_size:]

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Fine-tune the model
trainer.train()




Step,Training Loss
10,12.5601
20,9.7716
30,3.9378
40,1.4765
50,1.1896
60,0.9723
70,0.8556
80,0.7081
90,0.7228
100,0.8192


TrainOutput(global_step=13500, training_loss=0.5774279793280143, metrics={'train_runtime': 5002.8445, 'train_samples_per_second': 5.397, 'train_steps_per_second': 2.698, 'total_flos': 7054884864000000.0, 'train_loss': 0.5774279793280143, 'epoch': 3.0})

In [None]:
from rouge_score import rouge_scorer
import torch

# Continue with the ROUGE-L evaluation from the previous code provided
rouge_l_score = calculate_rouge_l(predictions, references)
print(f"Final ROUGE-L Score: {rouge_l_score}")



Final ROUGE-L Score: 0.13722898868952038


In [None]:
import bert_score


# Calculate BERT scores
P, R, F1 = bert_score.score(predictions, references, lang="en", model_type="bert-base-uncased")
average_f1 = F1.mean().item()

print(f"BERT Score (Precision): {P.mean().item()}")
print(f"BERT Score (Recall): {R.mean().item()}")
print(f"BERT Score (F1): {average_f1}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Score (Precision): 0.4932711124420166
BERT Score (Recall): 0.49489626288414
BERT Score (F1): 0.4935985505580902


In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Define training arguments with early stopping
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=17,  # Set a higher number of epochs, early stopping will prevent overfitting
    per_device_train_batch_size=4,  # Adjust batch size if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,  # Save only the last 2 checkpoints
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Define the early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Number of epochs to wait for improvement before stopping
    early_stopping_threshold=0.01  # Minimum change to qualify as improvement
)

# Fine-tune the model again with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start fine-tuning
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.4792,0.582789
2,0.5287,0.566611
3,0.4721,0.576621
4,0.4923,0.587723


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=9000, training_loss=0.5015449976921081, metrics={'train_runtime': 5899.7413, 'train_samples_per_second': 25.933, 'train_steps_per_second': 6.483, 'total_flos': 9406513152000000.0, 'train_loss': 0.5015449976921081, 'epoch': 4.0})

In [None]:
from rouge_score import rouge_scorer
import torch

# Continue with the ROUGE-L evaluation from the previous code provided
rouge_l_score = calculate_rouge_l(predictions, references)
print(f"Final ROUGE-L Score: {rouge_l_score}")



Final ROUGE-L Score: 0.13722898868952038


In [None]:
import bert_score


# Calculate BERT scores
P, R, F1 = bert_score.score(predictions, references, lang="en", model_type="bert-base-uncased")
average_f1 = F1.mean().item()

print(f"BERT Score (Precision): {P.mean().item()}")
print(f"BERT Score (Recall): {R.mean().item()}")
print(f"BERT Score (F1): {average_f1}")


BERT Score (Precision): 0.4932711124420166
BERT Score (Recall): 0.49489626288414
BERT Score (F1): 0.4935985505580902


In [None]:
# Load the preprocessed dataset
file_path = '/content/drive/MyDrive/WGAN_Dataset/preprocessed_chat_data_part_1.csv'
data = pd.read_csv(file_path)

# Take a random sample of 25,000 rows
subset_data = data.sample(n=25000, random_state=42)


In [None]:
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set padding token explicitly
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the data: Tokenization and Dataset Preparation
def preprocess_data(human_message, gpt_reply):
    inputs = tokenizer(human_message, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = tokenizer(gpt_reply, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply preprocessing to the subset data
processed_data = subset_data.apply(lambda row: preprocess_data(row['human_message'], row['gpt_reply']), axis=1)

# Convert processed data into a list of dictionaries suitable for PyTorch Dataset
dataset = [{'input_ids': data['input_ids'].squeeze(), 'labels': data['labels'].squeeze()} for data in processed_data]

# Split the data into training and evaluation sets
train_size = int(0.9 * len(dataset))
train_dataset, eval_dataset = dataset[:train_size], dataset[train_size:]

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Define training arguments with early stopping
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=17,  # Set a higher number of epochs, early stopping will prevent overfitting
    per_device_train_batch_size=4,  # Adjust batch size if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=20,
    save_total_limit=2,  # Save only the last 2 checkpoints
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Define the early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Number of epochs to wait for improvement before stopping
    early_stopping_threshold=0.01  # Minimum change to qualify as improvement
)

# Fine-tune the model again with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start fine-tuning
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.5594,0.550932
2,0.5688,0.545408
3,0.5801,0.543731


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=16875, training_loss=0.568894016011556, metrics={'train_runtime': 10946.6007, 'train_samples_per_second': 34.942, 'train_steps_per_second': 8.736, 'total_flos': 1.763721216e+16, 'train_loss': 0.568894016011556, 'epoch': 3.0})

In [None]:
from rouge_score import rouge_scorer
import torch

# Continue with the ROUGE-L evaluation from the previous code provided
rouge_l_score = calculate_rouge_l(predictions, references)
print(f"Final ROUGE-L Score: {rouge_l_score}")



Final ROUGE-L Score: 0.13722898868952038


In [None]:
import bert_score


# Calculate BERT scores
P, R, F1 = bert_score.score(predictions, references, lang="en", model_type="bert-base-uncased")
average_f1 = F1.mean().item()

print(f"BERT Score (Precision): {P.mean().item()}")
print(f"BERT Score (Recall): {R.mean().item()}")
print(f"BERT Score (F1): {average_f1}")


BERT Score (Precision): 0.4932711124420166
BERT Score (Recall): 0.49489626288414
BERT Score (F1): 0.4935985505580902


In [None]:
# Load the preprocessed dataset
file_path = '/content/drive/MyDrive/WGAN_Dataset/preprocessed_chat_data_part_1.csv'
data = pd.read_csv(file_path)

# Take a random sample of 25,000 rows
subset_data = data.sample(n=55000, random_state=42)


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set padding token explicitly
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the data: Tokenization and Dataset Preparation
def preprocess_data(human_message, gpt_reply):
    inputs = tokenizer(human_message, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = tokenizer(gpt_reply, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply preprocessing to the subset data
processed_data = subset_data.apply(lambda row: preprocess_data(row['human_message'], row['gpt_reply']), axis=1)

# Convert processed data into a list of dictionaries suitable for PyTorch Dataset
dataset = [{'input_ids': data['input_ids'].squeeze(), 'labels': data['labels'].squeeze()} for data in processed_data]

# Split the data into training and evaluation sets
train_size = int(0.9 * len(dataset))
train_dataset, eval_dataset = dataset[:train_size], dataset[train_size:]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from rouge_score import rouge_scorer
import torch

# Define training arguments with early stopping
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,  # Set a higher number of epochs, early stopping will prevent overfitting
    per_device_train_batch_size=8,  # Adjust batch size if needed
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=20,
    save_total_limit=2,  # Save only the last 2 checkpoints
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Define the early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Number of epochs to wait for improvement before stopping
    early_stopping_threshold=0.01  # Minimum change to qualify as improvement
)

# Fine-tune the model again with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start fine-tuning
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.5673,0.550792
2,0.5847,0.54601
3,0.5204,0.544295


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=67500, training_loss=0.5601995306509513, metrics={'train_runtime': 13001.1352, 'train_samples_per_second': 34.612, 'train_steps_per_second': 34.612, 'total_flos': 1.763721216e+16, 'train_loss': 0.5601995306509513, 'epoch': 3.0})

In [None]:
from rouge_score import rouge_scorer
import bert_score

# Define a function to calculate ROUGE-L score
def calculate_rouge_l(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = []
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)['rougeL'].fmeasure
        scores.append(score)
    return sum(scores) / len(scores)

# Generate predictions and references
predictions = []
references = []

# Assuming eval_dataset is your validation dataset
for data in eval_dataset:
    input_ids = data['input_ids'].unsqueeze(0).to(model.device)  # Add batch dimension and move to model device
    attention_mask = data['input_ids'].ne(tokenizer.pad_token_id).unsqueeze(0).to(model.device)  # Create attention mask
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)
    reference = tokenizer.decode(data['labels'], skip_special_tokens=True)
    predictions.append(prediction)
    references.append(reference)

# Calculate ROUGE-L score
rouge_l_score = calculate_rouge_l(predictions, references)
print(f"Final ROUGE-L Score: {rouge_l_score}")

# Calculate BERT scores
P, R, F1 = bert_score.score(predictions, references, lang="en", model_type="bert-base-uncased")
average_f1 = F1.mean().item()

print(f"BERT Score (Precision): {P.mean().item()}")
print(f"BERT Score (Recall): {R.mean().item()}")
print(f"BERT Score (F1): {average_f1}")


Final ROUGE-L Score: 0.13940553705501144


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Score (Precision): 0.4963013529777527
BERT Score (Recall): 0.4960973560810089
BERT Score (F1): 0.4957442879676819


In [None]:
# Specify the path where you want to save the model and tokenizer
save_path = '/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2'

# Save the fine-tuned model
model.save_pretrained(save_path)

# Save the tokenizer
tokenizer.save_pretrained(save_path)


('/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2/vocab.json',
 '/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2/merges.txt',
 '/content/drive/MyDrive/WGAN_Dataset/fine_tuned_gpt2/added_tokens.json')