# Imports

In [1]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

import json

import requests
import re


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/chloe/mambaforge/envs/python3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/chloe/mambaforge/envs/python3.9/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/chloe/mambaforge/envs/python3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


# Utils

In [2]:
def cfeedback(v):
  '''
  Returns the compiler error if one exists. Returns None if everything compiles cleanly.
  '''
  r = requests.post("https://coq.livecode.ch/check", data = { 'v': v }).json()
  if r['status'] == 0:
    return None
  r = r['log']
  return r

In [3]:
def get_linenumber(cf):
  pattern = r'line (\d+),'
  match = re.search(pattern, cf)  
  if match:
    line_number = int(match.group(1))
  else:
    line_number = -1
  return line_number

In [4]:
def get_totallines(response):
    return len(response.split('\n'))

In [5]:
def get_line(line_number, response):
    broken = response.split('\n')
    return broken[line_number-1]

In [6]:
config = PPOConfig(
    model_name="edbeeching/gpt-neo-125M-imdb-lora-adapter-merged",
    learning_rate=1.41e-5,
    log_with='wandb',
    mini_batch_size=1,# prev: 16
    batch_size=1, # prev: 256, but working with super limited samples so will try lower batch size for now
    # gradient_accumulation_steps=1, --> apparently this is unrecognized
)

# We then define the arguments to pass to the sentiment analysis pipeline.
# We set `return_all_scores` to True to get the sentiment score for each token.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": config.mini_batch_size}

def build_dataset(config, dataset_name="../MBPP dataset/MBPP_Coq_Test.csv"):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.
    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.
    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    ds = load_dataset("csv", data_files=dataset_name, split="train")

    def concat(sample):
      ex = sample['specification'] + "Test case 1: " + sample['test_case_1'] + \
      ", test case 2: " + sample['test_case_2'] + ", test case 3: " + sample['test_case_3']
      return ex

      # return sample['specification'] + "Test case 1: " + sample['test_case_1'] + \
      # ", test case 2: " + sample['test_case_2'] + ", test case 3: " + sample['test_case_3'] + " Prove some formal properties. Please only write code for the last stand-alone example. *)"


    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(concat(sample))
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

# multi-shot boilerplate
multishot = "(* Stand-alone Example 1: Write a function that doubles a number. Test case 1: double 3 = 6. Prove some formal properties. *) \nFixpoint double (n: nat): nat := match n with | 0 => 0 | S n => S (S (double n)) end. \n\nLemma example_double_3: double 3 = 6.\nProof. simpl. reflexivity. Qed. \n\n Theorem theorem_double_distribute: \nforall a b, double a + double b = double (a + b).\n Proof.\n intros.\n induction a.\n - simpl. reflexivity.\n - simpl. rewrite IHa. reflexivity. \n Qed. \n\n (* Stand-alone Example 2: Write a function that creates a list of n elements. Test case 1: replicate 1 0 = []. Test case 2: replicate 1 2 = [1; 1]. Prove some formal properties. *) \n Require Import Coq.Lists.List. \n Open Scope list_scope. \n Import ListNotations. \n Fixpoint replicate {X: Type} (x: X) (n: nat): list X := \n match n with \n | 0 => []\n | S n => x :: replicate x n \n end. \n Lemma example_replicate_0: replicate 1 0 = []. \n Proof. simpl. reflexivity. Qed.\n Lemma example_replicate_2: replicate 1 2 = [1; 1].\n Proof. simpl. reflexivity. Qed.\n\n Theorem replicate_length:\n\t forall n, length (replicate 1 n) = n.\n Proof. \n intros. \n induction n.\n - simpl. reflexivity. \n - simpl. rewrite IHn. reflexivity.\n Qed. \n Theorem replicate_length_any: \n\t forall (X: Type) (x: X) n, length (replicate x n) = n. \n Proof.\n intros. \n induction n.\n - simpl. reflexivity.\n- simpl. rewrite IHn. reflexivity.\n Qed."

# We retrieve the dataloader by calling the `build_dataset` function.
dataset = build_dataset(config)

Found cached dataset csv (/home/chloe/.cache/huggingface/datasets/csv/default-d8f13bc6a5969487/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/chloe/.cache/huggingface/datasets/csv/default-d8f13bc6a5969487/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1598b081ffb826bd.arrow


# Load Model

In [9]:
import torch
import gc

# Manually trigger Python's garbage collector
gc.collect()

# Clear the CUDA cache
torch.cuda.empty_cache()

In [6]:
systemText = """ You are an AI assistant helping users write Coq code in order to implement given function specifications. 
1. The program you write should only contain Coq code in response to the given function specification. 
3. Any step-by-step reasoning that is not Coq code should be written as a comment.
3. As the user provides compiler feedback, modify and update the program accordingly and keep the variables and the general program structure consistent.
4. In addition to implementing the function, give at least 2 properties as theorems with their proofs.

The following are examples.

### Human:
Write a function that creates a list of n elements. Test case 1: replicate 1 0 = []. Test case 2: replicate 1 2 = [1; 1]. 

### Assistant:
\`\`\`
Require Import Coq.Lists.List. 
 Open Scope list_scope. 
 Import ListNotations. 
 Fixpoint replicate (x: X) (n: nat): list X := 
 match n with 
 | 0 => []
 | S n => x :: replicate x n 
 end. 
 Lemma example_replicate_0: replicate 1 0 = []. 
 Proof. simpl. reflexivity. Qed.
 Lemma example_replicate_2: replicate 1 2 = [1; 1].
 Proof. simpl. reflexivity. Qed.

 Theorem replicate_length:
	 forall n, length (replicate 1 n) = n.
 Proof. 
 intros. 
 induction n.
 - simpl. reflexivity. 
 - simpl. rewrite IHn. reflexivity.
 Qed. 
 Theorem replicate_length_any: 
	 forall (X: Type) (x: X) n, length (replicate x n) = n. 
 Proof.
 intros. 
 induction n.
 - simpl. reflexivity.
- simpl. rewrite IHn. reflexivity.
 Qed.
\`\`\`

### Human:
Your code produces an error in the line Fixpoint replicate (x: X) (n: nat): list X :=\n{}Can you please explain what this error means? Let's think step by step. Please rewrite all code if you rewrite any code.
File \"./ex.v\", line 4, characters 24-25:\nError: The reference X was not found in the current environment.

### Assistant:
\`\`\`
Require Import Coq.Lists.List. 
 Open Scope list_scope. 
 Import ListNotations. 
 Fixpoint replicate '{X: Type'} (x: X) (n: nat): list X := 
 match n with 
 | 0 => []
 | S n => x :: replicate x n 
 end. 
 Lemma example_replicate_0: replicate 1 0 = []. 
 Proof. simpl. reflexivity. Qed.
 Lemma example_replicate_2: replicate 1 2 = [1; 1].
 Proof. simpl. reflexivity. Qed.

 Theorem replicate_length:
	 forall n, length (replicate 1 n) = n.
 Proof. 
 intros. 
 induction n.
 - simpl. reflexivity. 
 - simpl. rewrite IHn. reflexivity.
 Qed. 
 Theorem replicate_length_any: 
	 forall (X: Type) (x: X) n, length (replicate x n) = n. 
 Proof.
 intros. 
 induction n.
 - simpl. reflexivity.
- simpl. rewrite IHn. reflexivity.
 Qed.
\`\`\`"""

In [7]:
from transformers import GenerationConfig, LlamaTokenizer, BitsAndBytesConfig

base_model = "/data/text-generation-webui/models/vicuna-7b"

tokenizer = LlamaTokenizer.from_pretrained(base_model)
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=quantization_config,
)

model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
prompt = systemText + "\n" + "### Human: Write a Coq function to reverse a list.\n###Assistant: "
device = "cuda"
temperature = 0.7
top_p = 0.75
top_k = 40
num_beams = 4
max_new_tokens = 500

inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams
        )

generate_params = {
            "input_ids": input_ids,
            "generation_config": generation_config,
            "return_dict_in_generate": True,
            "output_scores": True,
            "max_new_tokens": max_new_tokens,
        }
with torch.no_grad():
        generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)

OutOfMemoryError: CUDA out of memory. Tried to allocate 390.00 MiB (GPU 0; 23.70 GiB total capacity; 22.35 GiB already allocated; 317.19 MiB free; 22.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print(output)

<s> ### Human: what color is the sky?
###Assistant: 

The color of the sky can vary depending on the time of day and the weather conditions. During the daytime, when the sun is shining, the sky is usually a bright blue color. However, when the sun is setting or when there are clouds in the sky, the color of the sky can range from pink to orange to purple.</s><s>


In [8]:
# loading Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

model_id = "/data/text-generation-webui/models/vicuna-7b"

quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

# Generate Llama Samples

In [9]:
firstOne = dataset[0]['specification']
firstOne

'Write a Coq function to remove first and last occurrence of a given character from the string.'

In [13]:
# Encode the input text
input_text = firstOne
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text
output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, do_sample=True)

# Decode the output
generated_text = tokenizer.decode(output[0].to("cpu"), skip_special_tokens=True)

print(generated_text)

RuntimeError: Tensor on device cpu is not on the expected device meta!

In [None]:
systemText = """ You are an AI assistant helping users write Coq code in order to implement given function specifications. 
1. The program you write should only contain Coq code in response to the given function specification. 
3. Any step-by-step reasoning that is not Coq code should be written as a comment.
3. As the user provides compiler feedback, modify and update the program accordingly and keep the variables and the general program structure consistent.
4. In addition to implementing the function, give at least 2 properties as theorems with their proofs.

The following are examples.

Query from user:
Write a function that creates a list of n elements. Test case 1: replicate 1 0 = []. Test case 2: replicate 1 2 = [1; 1]. 

Response from assistant:
\`\`\`
Require Import Coq.Lists.List. 
 Open Scope list_scope. 
 Import ListNotations. 
 Fixpoint replicate (x: X) (n: nat): list X := 
 match n with 
 | 0 => []
 | S n => x :: replicate x n 
 end. 
 Lemma example_replicate_0: replicate 1 0 = []. 
 Proof. simpl. reflexivity. Qed.
 Lemma example_replicate_2: replicate 1 2 = [1; 1].
 Proof. simpl. reflexivity. Qed.

 Theorem replicate_length:
	 forall n, length (replicate 1 n) = n.
 Proof. 
 intros. 
 induction n.
 - simpl. reflexivity. 
 - simpl. rewrite IHn. reflexivity.
 Qed. 
 Theorem replicate_length_any: 
	 forall (X: Type) (x: X) n, length (replicate x n) = n. 
 Proof.
 intros. 
 induction n.
 - simpl. reflexivity.
- simpl. rewrite IHn. reflexivity.
 Qed.
\`\`\`

Query from user:
Your code produces an error in the line Fixpoint replicate (x: X) (n: nat): list X :=\n{}Can you please explain what this error means? Let's think step by step. Please rewrite all code if you rewrite any code.
File \"./ex.v\", line 4, characters 24-25:\nError: The reference X was not found in the current environment.

Response from assistant:
\`\`\`
Require Import Coq.Lists.List. 
 Open Scope list_scope. 
 Import ListNotations. 
 Fixpoint replicate '{X: Type'} (x: X) (n: nat): list X := 
 match n with 
 | 0 => []
 | S n => x :: replicate x n 
 end. 
 Lemma example_replicate_0: replicate 1 0 = []. 
 Proof. simpl. reflexivity. Qed.
 Lemma example_replicate_2: replicate 1 2 = [1; 1].
 Proof. simpl. reflexivity. Qed.

 Theorem replicate_length:
	 forall n, length (replicate 1 n) = n.
 Proof. 
 intros. 
 induction n.
 - simpl. reflexivity. 
 - simpl. rewrite IHn. reflexivity.
 Qed. 
 Theorem replicate_length_any: 
	 forall (X: Type) (x: X) n, length (replicate x n) = n. 
 Proof.
 intros. 
 induction n.
 - simpl. reflexivity.
- simpl. rewrite IHn. reflexivity.
 Qed.
\`\`\`"""

messages=[{"role": "system", "content": systemText}]

def generate(q):
  '''
  Generate output from the correct model and clean it from pre- and post- rambles if possible.
  ''' 
  # make this script retry if the connection is rejected for some reason
  while True:
    try:
      messages.append({"role": "user", "content": q})
      response = openai.ChatCompletion.create(
                    model='gpt-4-0314', 
                    messages=messages)
      response = response.choices[0].message.content
      messages.append({"role": "assistant", "content": response})
      
      # clean the response if possible
      c_response = response
      try:
        match = re.search('```coq(.*?)```', c_response, re.DOTALL)
        c_response = match.group(1)
      except:
        pass
      try:
        match = re.search('```(.*?)```', c_response, re.DOTALL)
        c_response = match.group(1)
      except:
        pass
      return c_response
    except:
      pass

def run_trial(q_core, pid, outfile, verbose=True, ntrials=10):
  '''
  Runs one trial on one prompt. 
  - q: function spec with test cases
  - pid: the prompt id
  '''
  q = q_core
  if verbose:
    print("The task: {}".format(q))

  for t in range(ntrials): 
    # for recording the dataset
    out = {
            "prompt_id": pid,
            "iteration": t,
            "instruction": q,
            "output": None,
            "compiler_feedback": None,
            "stats": {
                        "total_lines" : None,
                        "compiled_lines": None,
                        "percent_compiled": None
                    }
            }

    # generate model response
    response = generate(q)

    # get compiler feedback
    cf = cfeedback(response)

    if verbose:
      print("-----Attempt {}---------".format(t))
      print(response)

    if cf is not None:
      line_number = get_linenumber(cf) - 1
      total_lines = get_totallines(response)
      percent_compiled = (line_number)/total_lines
      linetxt = get_line(line_number, response)

      # get the model to reflect on the error
      q = "Your code produces an error in the line {}\n{}Can you please explain what this error means? Let's think step by step. Please rewrite all code if you rewrite any code."\
        .format(linetxt, cf)
      if verbose:
        print(q)
        print(percent_compiled)
    else:
      total_lines = get_totallines(response)
      line_number = total_lines
      percent_compiled = 1.0
      q = "The model solved the problem!"
      if verbose:
        print(q)
        print(percent_compiled)

    # append all data to json lines file
    out["output"] = response
    out["compiler_feedback"] = cf
    out["stats"]["total_lines"] = total_lines
    out["stats"]["compiled_lines"] = line_number
    out["stats"]["percent_compiled"] = percent_compiled

    with open(outfile, 'a') as file:
      file.write(json.dumps(out) + '\n')
    if verbose:
      print("recorded in {}".format(outfile))

    # don't continue if model has completely solved problem
    if cf is None:
      break

  return None

def main():
  pass

if __name__ == "__main__":
  main()
  outfile = "Dolly_EasyMediumHard01.ndjson"
  # run_trial(q, 0, outfile)
  for i in range(0,3):
    q = dataset[i]['query'] 
    run_trial(q, i, outfile)
