In [None]:
!pip install trl==0.10.1

In [None]:
!pip install -U bitsandbytes

In [None]:
import torch
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    AutoModelForSequenceClassification
)

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

from peft import LoraConfig, get_peft_model

from datasets import Dataset

import zipfile
import os

import pickle

import random

import pandas as pd

In [None]:
import pickle


In [None]:
import os
# Sets CUDA_LAUNCH_BLOCKING=1 for this process. CUDA documents this variable as
# making kernel launches synchronous, so device errors surface at the failing call.
# NOTE(review): presumably a debugging aid; it slows GPU work — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Policy model: load the base checkpoint in 4-bit, attach LoRA adapters,
# then wrap the result with a value head for PPO.
model_name = "microsoft/Phi-3-mini-128k-instruct"

# 4-bit NF4 quantization with double quantization; fp16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)

# LoRA adapters are attached to the `qkv_proj` and `o_proj` modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Wrap the model for PPO (adds a value head)
model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



In [None]:
# Location of the zipped reward model and the directory it is unpacked into
zip_path = '/content/humor_model.zip'
extract_dir = '/content/humor_model'

# Unpack the archive
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(extract_dir)

print(f"Le modèle a été décompressé dans {extract_dir}")

# Load the reward tokenizer and classifier from the extracted directory,
# move the classifier to the target device and put it in evaluation mode
reward_tokenizer = AutoTokenizer.from_pretrained(extract_dir)
reward_model = AutoModelForSequenceClassification.from_pretrained(extract_dir).to(device)
reward_model.eval()

print("Modèle de récompense chargé avec succès.")

Le modèle a été décompressé dans /content/humor_model
Modèle de récompense chargé avec succès.


In [None]:
def reward_fn(texts):
    """Score text(s) with the reward classifier.

    The input is tokenized with ``reward_tokenizer`` (padded and truncated),
    run through ``reward_model`` without gradient tracking, and the logit at
    class index 1 is returned for every row.

    Args:
        texts: Text input accepted by ``reward_tokenizer``; this notebook passes
            both a list of strings and a single string.

    Returns:
        A CPU tensor holding the index-1 logit of each input row.
    """
    encoded = reward_tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = reward_model(**encoded).logits
    # Column 1 is used as the reward (assumes a binary classifier whose
    # index-1 class is the "funny" one — TODO confirm against the model's labels).
    return logits[:, 1].cpu().detach()

In [None]:
# Smoke test: score two sample sentences with the reward model and display the result.
reward_fn(['This is a non funny prompt.', 'This is a funny prompt.'])

tensor([-0.4482, -2.2881])

In [None]:
import copy

# Snapshot the policy before any PPO update; the copy is handed to PPOTrainer
# as the reference model.
# NOTE(review): deepcopy duplicates the whole wrapped model in memory. TRL's PPOTrainer
# reportedly accepts ref_model=None for PEFT models — confirm before changing.
ref_model = copy.deepcopy(model)

In [None]:
# PPO hyper-parameters: a single sample per batch and per mini-batch, learning rate 1e-4.
ppo_config = dict(
    mini_batch_size=1,
    batch_size=1,
    learning_rate=1e-4,
)
config = PPOConfig(**ppo_config)

# Trainer built from the config, the policy, the reference copy and the tokenizer.
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer)



In [None]:
# Read the prompt file line by line (as bytes) and clean every line:
# decode as UTF-8, trim surrounding whitespace, remove all single and double
# quotes, then drop the last remaining character.
data = []
with open('/content/gpt_dataset.txt', 'rb') as file:
    for raw_line in file:
        text = raw_line.decode('utf-8').strip()
        text = text.replace('"', '').replace("'", "")
        data.append(text[:-1])
#random.shuffle(data)
len_data = len(data)

In [None]:
# Expose the cleaned prompts as a Dataset with a single "query" column.
data_dict = dict(query=data)
train_dataset = Dataset.from_dict(data_dict)

In [None]:
# Training loop: one generation + one PPO update per prompt in `train_dataset`.
# `epoch` is the index of the current sample, not a full pass over the data.
# rewards_list and loss_list each receive one entry per *completed* PPO step,
# so they always have the same length for the export/plot cells that follow.
rewards_list = []
loss_list = []

# Chat-style prompt; the single '?' placeholder is replaced by the user query.
template = """<|system|>
You are a funny assistant. Answer with a short sentence.<|end|>
<|user|>
?<|end|>
<|assistant|>"""

# Sampling settings do not change between steps, so they are built once.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 64,
}

for epoch, sample in enumerate(train_dataset):
    query_txt = template.replace('?', sample["query"])
    print(sample["query"])
    query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)

    try:
        response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False, **generation_kwargs)
        # NOTE(review): decode() keeps special tokens in the text that is scored
        # by the reward model — confirm this is intended.
        response_txt = tokenizer.decode(response_tensor[0])
        print(response_txt)
    except RuntimeError as e:
        print(f"Error on epoch {epoch}: {e}")
        torch.cuda.empty_cache()
        # Skip this iteration and continue to the next one
        continue

    # Compute reward; read the scalar now, but record it only after the step succeeded
    reward = [reward_fn(response_txt)]
    reward_value = reward[0].item()

    # Train with PPO step
    stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

    # Track PPO loss and rewards (loss stored as a plain Python float)
    loss = float(stats["ppo/loss/total"])
    reward_mean = stats["ppo/returns/mean"]

    print(f"Epoch: {epoch}, Loss: {loss:.4f}, Mean Reward: {reward_mean:.4f}")
    rewards_list.append(reward_value)
    loss_list.append(loss)

    # Save model every 250 epochs
    if (epoch + 1) % (250) == 0:
        model_dir = f"model_epoch_{epoch + 1}"
        # Ensure directory exists
        os.makedirs(model_dir, exist_ok=True)
        print(f"Saving model at epoch {epoch + 1}...")
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

# Summary
print("Training complete.")
print("Tracked Losses:", loss_list)
print("Tracked Rewards:", rewards_list)

In [None]:
# Write the final policy and its tokenizer to ./model_final
model_dir = "model_final"
os.makedirs(model_dir, exist_ok=True)  # create the target directory if needed
print("Saving model at final.")
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

Saving model at final.


('model_final/tokenizer_config.json',
 'model_final/special_tokens_map.json',
 'model_final/tokenizer.model',
 'model_final/added_tokens.json',
 'model_final/tokenizer.json')

In [None]:
import shutil
import os
from google.colab import files

# Pack the saved model directory into model_final.zip and trigger a Colab download
zip_file = "model_final.zip"
archive_base = os.path.splitext(zip_file)[0]  # archive name without the ".zip" suffix
shutil.make_archive(archive_base, 'zip', model_dir)
files.download(zip_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Store the recorded loss and reward history as a two-column CSV table
df = pd.DataFrame({'loss': loss_list, 'rewards': rewards_list})
df.to_csv('training_output.csv')

In [None]:
import matplotlib.pyplot as plt

# Raw per-step PPO loss and reward drawn on one shared axis, saved as PNG and shown
fig, ax = plt.subplots()
ax.set_title('Loss and reward over training')
ax.plot(loss_list, label='PPO Loss')
ax.plot(rewards_list, label='Reward')
ax.set_xlabel('Epoch')
ax.set_ylabel('Value')
ax.legend()
plot_filename = 'loss_and_reward_plot.png'
fig.savefig(plot_filename)
plt.show()


In [None]:
def average_every_n_points(data, n):
    """Average ``data`` over consecutive, non-overlapping chunks of ``n`` points.

    Args:
        data: Sliceable sequence of numbers.
        n: Chunk size; must be a positive integer.

    Returns:
        A list with one mean per chunk. The final chunk may hold fewer than
        ``n`` points and is averaged over its actual length.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")
    averages = []
    for start in range(0, len(data), n):
        chunk = data[start:start + n]
        # Divide by the real chunk size: dividing a short trailing chunk by n
        # would bias its mean towards zero.
        averages.append(sum(chunk) / len(chunk))
    return averages

# Averaging window: one tenth of the number of recorded steps, but never below 1,
# so runs shorter than ten steps (including an empty history) still work.
n = max(1, len(loss_list) // 10)
averaged_loss = average_every_n_points(loss_list, n)
averaged_rewards = average_every_n_points(rewards_list, n)

plt.title('Loss and reward averaged every ' + str(n) + ' steps.')
# Each curve gets x positions derived from its own length.
plt.plot([i*n for i in range(len(averaged_loss))], averaged_loss, label='avg PPO Loss')
plt.plot([i*n for i in range(len(averaged_rewards))], averaged_rewards, label='avg Reward')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.legend()
plot_filename = 'loss_and_reward_avg_plot.png'
plt.savefig(plot_filename)
plt.show()