In [20]:
import os
from xml.etree import ElementTree
import numpy as np
import torch
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GPTNeoForCausalLM

# Seed both global RNGs up front: the dataset samplers below draw their random
# index from np.random, and text generation samples tokens (do_sample=True).
torch.manual_seed(42)
np.random.seed(42)

In [22]:
cur_dir = os.getcwd()
os.chdir(os.path.join(cur_dir, 'data'))
!git clone https://gitlab.cs.washington.edu/ALGES/TACL2015.git
!git clone https://github.com/chaochun/nlu-asdiv-dataset.git
!git clone https://github.com/openai/grade-school-math.git
os.chdir(cur_dir)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
fatal: destination path 'TACL2015' already exists and is not an empty directory.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
fatal: destination path 'nlu-asdiv-dataset' already exists and is not an empty directory.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLE

In [2]:
def read_string_from_file(path):
    """Return the entire contents of a text file as one string.

    The file is decoded as UTF-8 explicitly, so the text that ends up in the
    prompt does not depend on the platform's locale default encoding.

    Args:
        path: Path of the file to read.

    Returns:
        The file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

def sample_asdiv(dataset_path):
    """Pick one random ASDiv problem and return its text and formula.

    The sample is drawn per ``Problem`` element and the fields are read from
    that element's own children. Collecting ``Body``/``Question``/``Formula``
    in independent document-wide lists would mis-pair them as soon as a single
    problem lacks one of the children.

    Args:
        dataset_path: Path to the ASDiv XML file. Problems are looked up at
            ``ProblemSet/Problem`` below the document root.

    Returns:
        A ``(question, formula)`` tuple. ``question`` is the ``Body`` text and
        the ``Question`` text joined by a single space (a missing or empty
        element is rendered as ``"None"``, as f-string formatting of ``None``
        does). ``formula`` is the text of the problem's ``Formula`` element,
        or ``None`` when the problem has none.

    Raises:
        ValueError: If the file contains no ``Problem`` elements.
    """
    dom = ElementTree.parse(dataset_path)

    problems = dom.findall('ProblemSet/Problem')
    if not problems:
        raise ValueError(f"no ProblemSet/Problem elements found in {dataset_path!r}")

    # Randomly choose a problem (global NumPy RNG, so np.random.seed applies).
    problem = problems[np.random.randint(0, len(problems))]

    def child_text(tag):
        # Text of the chosen problem's direct child `tag`, or None if absent.
        child = problem.find(tag)
        return None if child is None else child.text

    return f"{child_text('Body')} {child_text('Question')}", child_text('Formula')

def sample_gsm8k(dataset_path):
    """Pick one random GSM8K problem and return its question and final answer.

    Args:
        dataset_path: Path to a GSM8K ``.jsonl`` file: one JSON object per
            line with ``"question"`` and ``"answer"`` keys. Blank lines are
            ignored.

    Returns:
        A ``(question, answer)`` tuple of strings. ``answer`` is the value
        following the ``"#### "`` marker in the solution text, with thousands
        separators removed (``"1,000"`` -> ``"1000"``). Signs and decimal
        points are kept.

    Raises:
        ValueError: If the file contains no records.
        IndexError: If the chosen record's answer has no ``"#### "`` marker.
    """
    with open(dataset_path, encoding='utf-8') as fh:
        # Lines keep their trailing newline, so a blank line is "\n" (truthy);
        # test the stripped line to actually skip it.
        data = [json.loads(line) for line in fh if line.strip()]

    if not data:
        raise ValueError(f"no records found in {dataset_path!r}")

    # Randomly choose a problem
    problem = data[np.random.randint(0, len(data))]

    # Prefer a full number (optional sign, digits, '.', ','); fall back to a
    # bare word so any answer the previous `\w+` pattern accepted still works.
    match = re.search(r"#### (-?[0-9.,]+|\w+)", problem["answer"])
    if match is None:
        raise IndexError(f"no '#### <answer>' marker in answer: {problem['answer']!r}")
    return problem['question'], match.group(1).replace(",", "")

def sample_singleEq(dataset_path):
    """Draw one random entry from a SingleEq JSON file.

    The file is parsed as a single JSON document that is indexed by position;
    the index is drawn from the global NumPy RNG.

    Args:
        dataset_path: Path to the JSON file of problems.

    Returns:
        A ``(question, solutions)`` tuple holding the chosen entry's
        ``'sQuestion'`` and ``'lSolutions'`` values.
    """
    with open(dataset_path, 'r') as dataset_file:
        problems = json.load(dataset_file)

    # Uniform draw over [0, len(problems)).
    chosen = problems[np.random.randint(0, len(problems))]
    question = chosen['sQuestion']
    solutions = chosen['lSolutions']
    return question, solutions

In [3]:
# Model identifiers passed to `from_pretrained` in the model-loading cells.
# NOTE(review): only `gptj_model` is used in the cells visible here; the Genji
# cell hard-codes its own names and `codeparrot_model` is not referenced.
#genji_model = ""
gptj_model = "EleutherAI/gpt-j-6B"
codeparrot_model = "lvwerra/codeparrot"

# Dataset files, given relative to the directory the notebook is run from.
# They point inside the repositories cloned into data/ by the clone cell.
asdiv_path = "data/nlu-asdiv-dataset/dataset/ASDiv.xml"
gsm8k_path = "data/grade-school-math/grade_school_math/data/train.jsonl"
singleEq_path = "data/TACL2015/questions.json" 

In [22]:
"""Choose the dataset you want to test"""
# Single switch: one of "asdiv", "gsm8k", "singleEq".
# The dataset file, its priming text and its sampler are selected together so
# they cannot get out of sync (e.g. a GSM8K file handed to the ASDiv parser).
dataset_name = "asdiv"

# name -> (dataset file, priming text file, sampler function)
dataset_registry = {
    "asdiv": (asdiv_path, "data/priming_texts/asdiv.txt", sample_asdiv),
    "gsm8k": (gsm8k_path, "data/priming_texts/gsm8k.txt", sample_gsm8k),
    "singleEq": (singleEq_path, "data/priming_texts/singleEq.txt", sample_singleEq),
}
if dataset_name not in dataset_registry:
    raise ValueError(
        f"unknown dataset {dataset_name!r}; expected one of {sorted(dataset_registry)}"
    )
dataset_path, priming_path, sampler = dataset_registry[dataset_name]

# Load the priming text to add to the prompt and sample a question.
priming_text = read_string_from_file(priming_path)
sample_q, sample_a = sampler(dataset_path)

In [23]:
# Build the prompt: the priming text, a blank line, then the sampled question
# on a line starting with '#'.
prompt = f"{priming_text}\n\n#{sample_q}"
print(prompt)
# Separator: a rule of 100 dashes with an empty line above and below.
print("\n" + "-"*100 + "\n")

#Sandra took six cups of coffee and Marcie took two cups of coffee. print the number of cups of coffee did Sandra and Marcie take in total?
cups_sandra = 6
cups_marcie = 2
total_cups = cups_sandra + cups_marcie
print(total_cups)

#Mrs. Franklin had 58 Valentines. Mrs. Franklin gave some to her students. Now she has 16. Write a program that prints how many Valentines did Mrs. Franklin give to her students?
valentines_franklin = 58
valentines_students = 16
total_valentines = valentines_franklin - valentines_students
print(total_valentines)

#Susie's father repaired the bookshelves in the reading room. If he has 210 books to be distributed equally on the 10 shelves he repaired, write a program that prints how many books will each shelf contain?
books_susie = 210
shelves_susie = 10
books_per_shelf = books_susie / shelves_susie
print(books_per_shelf)

#Michelle likes to save money every now and then so that she has money to buy the things that she wants. One day, she decided to count her sa

In [11]:
"""GPT-J and codeparrot models run in HFTest venv"""
# Binds the globals `tokenizer` and `model`, which the Genji cell also assigns:
# whichever of the two loading cells ran last is what generation uses.
# The model is switched to eval mode and moved to the GPU (requires CUDA).
tokenizer = AutoTokenizer.from_pretrained(gptj_model)
model = AutoModelForCausalLM.from_pretrained(gptj_model).eval().cuda()

In [11]:
"""Genji model run in HFTest_genji venv"""
# Rebinds the same `model` / `tokenizer` globals as the GPT-J cell, so run only
# one of the two loading cells. The model is put in eval mode and moved to the
# GPU (requires CUDA).
model = AutoModelForCausalLM.from_pretrained("NovelAI/genji-python-6B").eval().cuda()
# NOTE(review): the tokenizer is loaded from the GPT-Neo 2.7B repo rather than
# the Genji repo - presumably they share a vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")

Downloading: 100%|██████████| 1.43k/1.43k [00:00<00:00, 1.18MB/s]
Downloading: 100%|██████████| 12.6G/12.6G [21:51<00:00, 9.59MB/s]
Downloading: 100%|██████████| 1.46k/1.46k [00:00<00:00, 1.32MB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 1.92MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 831kB/s] 
Downloading: 100%|██████████| 90.0/90.0 [00:00<00:00, 83.2kB/s]
Downloading: 100%|██████████| 200/200 [00:00<00:00, 178kB/s]


In [26]:
# Encode the prompt; `tokens` holds the input ids as a PyTorch tensor.
tokens = tokenizer(prompt, return_tensors="pt").input_ids
prompt_length = len(tokens[0])

# Sampling configuration handed to `generate`. `max_length` counts the prompt
# too, so it allows 100 tokens beyond the prompt.
sampling_options = dict(
    use_cache=True,
    do_sample=True,
    top_k=50,
    temperature=0.4,
    top_p=0.9,
    repetition_penalty=1.125,
    min_length=1,
    max_length=prompt_length + 100,
    pad_token_id=tokenizer.eos_token_id,
)
generated_tokens = model.generate(tokens.long().cuda(), **sampling_options)

# The output sequence starts with the prompt; keep only what follows it.
last_tokens = generated_tokens[0][prompt_length:]
generated_text = tokenizer.decode(last_tokens)
print("Generation:\n" + generated_text)

Generation:

age_peter = 30
age_bryan = 25
age_philip = 35
sum_of_ages = peter + bryan + philip
print(sum_of_ages)

#A man bought three pairs of shoes at $10 each. He paid for them with a credit card. The price of the credit card was $20. What was the cost of the shoes?
cost_shoes = 10
credit_card_price
