### Reinforcement learning & LLMs:
#### Usando REINFORCE para aprender los parámetros de un modelo que maximiza la recompensa al resolver problemas matemáticos


In [3]:
!pip install -q -U transformers

In [None]:
!huggingface-cli login

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn 
from math_verify import parse, verify

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



#### El modelo que usaremos como policy network es gemma-1.1-2b-it (Gemma 2B ajustado con instrucciones)

In [None]:
# Load the tokenizer and the causal-LM weights from the same checkpoint.
# The weights are requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-1.1-2b-it",
)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-1.1-2b-it",
    torch_dtype=torch.bfloat16,
)

#### Policy es la clase que usamos para representar a nuestro LM como la red que maximiza la recompensa al cambiar sus parámetros

#### * para seleccionar la acción , hacemos un sampling desde la distribución de tokens
#### * calculamos la log probability de ese token; trabajar en escala logarítmica nos da un rango numérico más amplio (evita underflow) y reduce costos computacionales

In [None]:
class Policy(nn.Module):
    """REINFORCE policy that wraps a causal language model.

    The wrapped model is treated as the policy network: each generated token
    is an action sampled from the model's next-token distribution.

    Args:
        pretrained_model: Callable model; calling it with ``input_ids`` must
            return an object exposing ``.logits`` indexed as
            ``[batch, position, vocab]``.
        gamma: Discount factor applied to future rewards.
        tokenizer: Optional tokenizer to store on the policy. When omitted,
            the notebook-level ``tokenizer`` variable is used if it exists.
    """

    def __init__(self, pretrained_model, gamma=0.99, tokenizer=None):
        super().__init__()
        # Assigning an nn.Module here registers it as a sub-module, so
        # ``policy.parameters()`` exposes the language model's weights.
        self.model = pretrained_model
        # Fall back to the module/notebook global when no tokenizer is given.
        self.tokenizer = tokenizer if tokenizer is not None else globals().get("tokenizer")
        self.gamma = gamma
        # Log-probability of every sampled action in the current episode.
        self.policy_history = []
        # Reward assigned to every step of the current episode.
        self.reward_episode = []

    def forward(self, input_ids):
        """Return next-token probabilities for the last position.

        Args:
            input_ids: Token ids passed unchanged to the wrapped model.

        Returns:
            Float32 tensor of shape ``(batch, vocab)`` whose rows sum to 1.
        """
        outputs = self.model(input_ids)
        # Only the distribution after the final position is needed.
        logits = outputs.logits[:, -1, :]
        # Softmax in float32: low-precision (e.g. bfloat16) probabilities may
        # not sum to 1 closely enough for torch.distributions validation.
        return torch.softmax(logits.float(), dim=-1)

    def select_action(self, input_ids):
        """Sample the next token and record its log-probability.

        Args:
            input_ids: Token ids of the sequence generated so far.

        Returns:
            Tensor of shape ``(batch,)`` holding the sampled token ids.
        """
        probs = self(input_ids)
        # Treat the vocabulary as a categorical distribution and sample from it.
        dist = torch.distributions.Categorical(probs=probs)
        action = dist.sample()
        # Keep the log-probability (with its graph) for the policy-gradient update.
        self.policy_history.append(dist.log_prob(action))
        return action

### Ajustando los parámetros del modelo
#### * damos prioridad a recompensas mas cercanas 
#### * para calcular loss:
#### - asignamos recompensas a cada token; todas son cero excepto la del último (la respuesta)
#### - ajustamos las recompensas a discounted_rewards[] usando gamma; esto da a cada token generado "crédito" por la respuesta
#### - loss se acumula token por token como `loss -= log_prob * recompensa`

In [None]:
def update_policy(policy, optimizer):
    """Run one REINFORCE update from the episode stored on ``policy``.

    Reads ``policy.policy_history`` (log-probabilities of the sampled actions)
    and ``policy.reward_episode`` (one reward per step), computes discounted
    returns with ``policy.gamma``, standardises them when they are not all
    equal, and minimises ``-sum(log_prob * return)``. Both episode buffers are
    emptied afterwards so the next episode starts clean.

    Args:
        policy: Object exposing ``gamma``, ``policy_history`` and
            ``reward_episode``.
        optimizer: Optimizer over the parameters that produced the
            log-probabilities.

    Raises:
        ValueError: If the number of rewards differs from the number of
            recorded log-probabilities.
    """
    log_probs = policy.policy_history
    rewards = policy.reward_episode
    if len(log_probs) != len(rewards):
        raise ValueError(
            f"got {len(log_probs)} log-probabilities but {len(rewards)} rewards"
        )
    if not log_probs:
        # Empty episode: there is nothing to back-propagate.
        return

    # Accumulate discounted returns from the last step backwards, then flip
    # once (avoids repeated insert-at-front).
    returns = []
    running_reward = 0.0
    for r in reversed(rewards):
        running_reward = r + policy.gamma * running_reward
        returns.append(running_reward)
    returns.reverse()

    # One row per step; columns are the batch entries of each log-probability.
    stacked = torch.stack([lp.reshape(-1) for lp in log_probs])
    returns = torch.tensor(returns, dtype=torch.float32, device=stacked.device)
    # Standardise only when there is some spread; std() of one element is NaN.
    if returns.numel() > 1 and returns.std() > 0:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Negative sign: the optimizer minimises, while we want to raise the
    # log-probability of actions with high return.
    loss = -(stacked * returns.unsqueeze(1)).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Reset BOTH episode buffers (the rewards live in ``reward_episode``).
    policy.policy_history = []
    policy.reward_episode = []

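### ¿Por qué se usa el negativo en loss?
#### El optimizador minimiza la loss, pero nosotros queremos maximizar la recompensa esperada. Al minimizar `-log_prob * recompensa` aumentamos la log-probabilidad (y por tanto la probabilidad) de las acciones que obtuvieron recompensas altas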

#### tratamos al modelo como un policy network, que genera acciones(tokens)
#### Estado: el prompt
#### Acción: cada uno de los tokens generados
#### Recompensa: 1 si la respuesta es correcta, 0.5 por un buen razonamiento, 0 de otro modo



In [None]:
# NOTE: this cell re-declares ``Policy``, which is also defined in an earlier
# cell; the definition below is the one bound to the name from here on.
class Policy(nn.Module):
    """REINFORCE policy that wraps a causal language model.

    The wrapped model is treated as the policy network: each generated token
    is an action sampled from the model's next-token distribution.

    Args:
        pretrained_model: Callable model; calling it with ``input_ids`` must
            return an object exposing ``.logits`` indexed as
            ``[batch, position, vocab]``.
        gamma: Discount factor applied to future rewards.
        tokenizer: Optional tokenizer to store on the policy. When omitted,
            the notebook-level ``tokenizer`` variable is used if it exists.
    """

    def __init__(self, pretrained_model, gamma=0.99, tokenizer=None):
        super().__init__()
        # Assigning an nn.Module here registers it as a sub-module, so
        # ``policy.parameters()`` exposes the language model's weights.
        self.model = pretrained_model
        # Fall back to the module/notebook global when no tokenizer is given.
        self.tokenizer = tokenizer if tokenizer is not None else globals().get("tokenizer")
        self.gamma = gamma
        # Log-probability of every sampled action in the current episode.
        self.policy_history = []
        # Reward assigned to every step of the current episode.
        self.reward_episode = []

    def forward(self, input_ids):
        """Return next-token probabilities for the last position.

        Args:
            input_ids: Token ids passed unchanged to the wrapped model.

        Returns:
            Float32 tensor of shape ``(batch, vocab)`` whose rows sum to 1.
        """
        outputs = self.model(input_ids)
        # Only the distribution after the final position is needed.
        logits = outputs.logits[:, -1, :]
        # Softmax in float32: low-precision (e.g. bfloat16) probabilities may
        # not sum to 1 closely enough for torch.distributions validation.
        return torch.softmax(logits.float(), dim=-1)

    def select_action(self, input_ids):
        """Sample the next token and record its log-probability.

        Args:
            input_ids: Token ids of the sequence generated so far.

        Returns:
            Tensor of shape ``(batch,)`` holding the sampled token ids.
        """
        probs = self(input_ids)
        # Treat the vocabulary as a categorical distribution and sample from it.
        dist = torch.distributions.Categorical(probs=probs)
        action = dist.sample()
        # Keep the log-probability (with its graph) for the policy-gradient update.
        self.policy_history.append(dist.log_prob(action))
        return action


# Wrap the language model loaded earlier in the notebook.
policy = Policy(model)

### Entrenamiento:
#### usamos Adam para actualizar los parámetros del modelo Gemma


In [None]:
def train_reasoning(policy, episodes=10, max_length=5,
                    prompt="what is 3 + 5", expected="8", lr=0.0001):
    """Train ``policy`` with REINFORCE on a single arithmetic prompt.

    Each episode samples ``max_length`` tokens after ``prompt``. Every step
    gets reward 0 except the last, which gets 1 when ``expected`` appears in
    the generated continuation and 0 otherwise. ``update_policy`` is then
    called once per episode.

    Args:
        policy: Policy exposing ``model``, ``select_action``,
            ``policy_history`` and ``reward_episode``.
        episodes: Number of episodes to run.
        max_length: Number of tokens sampled per episode (must be >= 1).
        prompt: Text the model has to continue.
        expected: Substring whose presence in the continuation earns reward 1.
        lr: Learning rate for Adam.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # ``policy.model.parameters()`` works whether or not ``policy`` itself is
    # an nn.Module.
    optimizer = torch.optim.Adam(policy.model.parameters(), lr=lr)
    device = next(policy.model.parameters()).device

    # The prompt is the same for every episode, so tokenize it once and place
    # it on the same device as the model weights.
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    prompt_len = prompt_ids.shape[1]

    for episode in range(episodes):
        # Start every episode from clean buffers so steps and rewards align.
        policy.policy_history = []
        policy.reward_episode = []

        generated = prompt_ids.clone()
        for _ in range(max_length):
            action = policy.select_action(generated)
            # action has shape (batch,); add the sequence axis before appending.
            generated = torch.cat([generated, action.unsqueeze(0)], dim=1)
            policy.reward_episode.append(0)

        output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"Episode {episode}: Generated: {output_text}")

        # Score only the newly generated tokens so the prompt text itself can
        # never earn the reward.
        completion = tokenizer.decode(generated[0, prompt_len:], skip_special_tokens=True)
        reward = 1 if expected in completion else 0
        # Only the final step carries the (sparse) reward.
        policy.reward_episode[-1] = reward
        print(f"Reward: {reward}")
        update_policy(policy, optimizer)


train_reasoning(policy,episodes=10)

In [7]:
def calculate_reward(gold_text="10x - 15", answer_text="3*(2x - 5) + 4x"):
    """Check a candidate answer against a reference with ``math_verify``.

    Both strings are passed through ``math_verify.parse`` and the parsed
    values are compared with ``math_verify.verify``.

    Args:
        gold_text: Reference (gold) expression.
        answer_text: Candidate expression to check.

    Returns:
        Whatever ``verify(gold, answer)`` returns (presumably a bool — confirm
        against the math_verify documentation).
    """
    # TODO: get the gold answer from the dataset instead of the defaults.
    # NOTE(review): the recorded notebook output is False; math_verify may
    # expect differently formatted input (e.g. LaTeX delimiters) — confirm.
    gold = parse(gold_text)
    answer = parse(answer_text)
    return verify(gold, answer)

    
    

False


False