In [38]:
from google.colab import drive

# Mount Google Drive at /content/gdrive/ so the dataset paths configured below resolve.
# NOTE(review): force_remount=True presumably re-mounts even when a mount already exists — confirm in Colab docs.
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [39]:
EXAMPLE_FILE = '/content/gdrive/MyDrive/test.jsonl' # your path to test.jsonl
TASK_FILE = '/content/gdrive/MyDrive/train.jsonl' # your path to train.jsonl

In [40]:
import json
import numpy as np
import re
import torch
from tqdm import tqdm

In [41]:
import random

# Pin every random number generator used in this notebook (stdlib, PyTorch, NumPy)
# to the same fixed seed so that sampled indices are repeatable between runs.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [42]:
# Read the exemplar pool: one JSON object per line.
# The encoding is explicit because the records contain non-ASCII punctuation.
with open(EXAMPLE_FILE, encoding='utf-8') as f:
    # Every line read from a file still carries its '\n', so a bare `if line`
    # never filters anything; strip first so blank lines are really skipped.
    ex = [json.loads(line) for line in f if line.strip()]

In [43]:
ex[0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

In [44]:
# NOTE(review): randint draws every index independently, so EX_IDS may contain repeated indices.
EX_IDS = np.random.randint(0, len(ex), size=100) # freeze the pool of few-shot example indices

In [45]:
# Read the evaluation tasks: one JSON object per line.
# The encoding is explicit because the records contain non-ASCII punctuation.
with open(TASK_FILE, encoding='utf-8') as f:
    # Every line read from a file still carries its '\n', so a bare `if line`
    # never filters anything; strip first so blank lines are really skipped.
    data = [json.loads(line) for line in f if line.strip()]

In [46]:
data[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [47]:
class GSMDataset(torch.utils.data.Dataset):
    """Few-shot chain-of-thought prompts for grade-school-math style records.

    Every record is a dict with a 'question' and an 'answer' string; the
    answer ends with a line of the form '#### <number>'. A fixed few-shot
    prefix is built once from ``exs`` and prepended to each question taken
    from ``data``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``
            whose result is indexed with ``'input_ids'``.
        data: Sequence of records to be solved.
        exs: Sequence of records used as few-shot exemplars.
        example_size: Number of exemplars placed in the prompt prefix.
        example_ids: Optional sequence of indices into ``exs``. When omitted,
            the notebook-level ``EX_IDS`` is used (the original behaviour).
    """

    def __init__(self, tokenizer, data, exs, example_size=8, example_ids=None):

        self.data = data
        self.exs_data = exs
        self.tokenizer = tokenizer
        # The global is only consulted when the caller does not supply indices,
        # so the class can also be used without notebook state.
        self.example_ids = EX_IDS if example_ids is None else example_ids

        self.examples = self.make_examples(example_size)

    def make_examples(self, amount):
        """Return the few-shot prefix built from the first ``amount`` exemplar ids.

        Each exemplar is rendered as 'Q: <question>\\nA: <rationale> The answer
        is <number>.\\n'.
        """
        examples = ''

        for ex_id in self.example_ids[:amount]:

            q = self.exs_data[ex_id]['question']
            a = self.exs_data[ex_id]['answer']

            # Drop calculator annotations such as '<<16-3-4=9>>'. Raw strings
            # avoid invalid-escape warnings, and anchoring on '<<' keeps a stray
            # single '<' in the text from swallowing everything up to '>>'.
            a = re.sub(r"<<.*?>>", "", a)
            # Flatten the multi-line rationale into a single line.
            a = re.sub(r"\n", " ", a)
            # Turn the '#### <number>' marker into a natural-language conclusion.
            a = re.sub(r"####", "The answer is", a) + '.'

            examples += f'Q: {q}\nA: {a}\n'

        return examples

    def __len__(self):
        """Number of tasks in ``data``."""
        return len(self.data)

    def __getitem__(self, id):
        """Return ``(prompt_token_ids, gold_answer)`` for task ``id``.

        The gold answer is the number on the final '#### <number>' line,
        parsed as ``float`` after removing thousands separators.
        """
        q = self.data[id]['question']
        a = self.data[id]['answer']

        # Remove every line that ends with a newline (the rationale), then the
        # '#' marker characters and commas, leaving only the final number.
        a = float(re.sub(r'.*?\n|#|,', '', a))

        tokens = self.tokenizer(self.examples + f'Q: {q}\n', return_tensors='pt')['input_ids']

        return (tokens, a)

In [48]:
!pip install -q petals

In [49]:
from transformers import BloomTokenizerFast 
from petals import DistributedBloomForCausalLM

# Checkpoint identifier passed to from_pretrained for the tokenizer below.
MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)

In [50]:
# Build the prompt dataset with 8 few-shot exemplars (the fourth argument is example_size).
dataset = GSMDataset(tokenizer, data, ex, 8)

In [51]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cpu


In [None]:
# Instantiate DistributedBloomForCausalLM from the MODEL_NAME checkpoint and move it to DEVICE.
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME)
model.to(DEVICE)

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/7.19G [00:00<?, ?B/s]

In [None]:
def generate_answ(task, answ_file):
    """Sample one answer line from the model and parse the number it states.

    Tokens are sampled one at a time (top_p=0.9, temperature=0.75) until the
    decoded piece contains a newline. The raw completion is appended to
    ``answ_file`` as one JSON object per line, and the first number that
    follows 'The answer is ' is returned.

    Args:
        task: ``(tokens, answer)`` pair as produced by the dataset's
            ``__getitem__``; only ``task[0]`` (the prompt token ids) is used.
        answ_file: Path of the log file; it is opened in append mode.

    Returns:
        The parsed answer as ``float``, or the string ``'NO'`` when generation
        raised or the completion holds no number after 'The answer is '.
    """
    answ = ''

    with model.inference_session(max_length=2048) as sess:

        prefix = task[0].to(DEVICE)

        try:  # best effort, e.g. when the completion outgrows the allowed length
            while True:
                outputs = model.generate(
                    prefix, max_new_tokens=1, do_sample=True, top_p=0.9, temperature=0.75, session=sess
                )
                piece = tokenizer.decode(outputs[0, -1:])

                answ += piece
                if "\n" in piece:
                    break
                # NOTE(review): presumably the session keeps the prompt state, so
                # later steps pass no prefix — confirm against the Petals docs.
                prefix = None

        except Exception:
            return 'NO'

    # Keep the raw completion for later analysis, one JSON record per line.
    # The `with` block closes the file; the path itself must not be "closed".
    with open(answ_file, 'a') as f:
        f.write(json.dumps({'A': answ}) + '\n')

    marker = 'The answer is '
    if marker not in answ:
        return 'NO'

    # Take the first number after the marker. Thousands separators are dropped,
    # while a real decimal point is kept (the trailing sentence period is not
    # part of the match because no digit follows it).
    number = re.search(r'-?\d[\d,]*(?:\.\d+)?', answ.split(marker, 1)[1])
    if number is None:
        return 'NO'

    return float(number.group().replace(',', ''))


def get_metric(dataset, answ_file, mode='cot', num_samples=4):
    """Score the model on every task of ``dataset``.

    Args:
        dataset: Iterable of ``(tokens, gold_answer)`` pairs.
        answ_file: Path handed to ``generate_answ`` for logging completions.
        mode: ``'cot'`` draws a single completion per task; ``'sc'``
            (self-consistency) draws ``num_samples`` completions and keeps the
            most frequent result.
        num_samples: Number of completions per task in ``'sc'`` mode.

    Returns:
        A list with one boolean per task: True when the chosen result is
        numeric and close to the gold answer (relative tolerance 1e-3).

    Raises:
        ValueError: If ``mode`` is neither ``'cot'`` nor ``'sc'``.
    """
    from collections import Counter

    if mode not in ('cot', 'sc'):
        raise ValueError(f"unknown mode {mode!r}; expected 'cot' or 'sc'")

    acc = []

    for task in tqdm(dataset):

        if mode == 'cot':
            result = generate_answ(task, answ_file)
        else:
            votes = [generate_answ(task, answ_file) for _ in range(num_samples)]
            # Majority vote over the sampled results. A Counter keeps floats and
            # the 'NO' sentinel as they are; np.unique would coerce a mixed list
            # to strings and break the numeric comparison below. Failed samples
            # ('NO') still take part in the vote, and ties go to the result that
            # was sampled first.
            result = Counter(votes).most_common(1)[0][0] if votes else 'NO'

        if result != 'NO':
            acc.append(bool(np.isclose(result, task[1], 1e-3)))
        else:
            acc.append(False)

    return acc

## Сравнение CoT и Self-Consistency CoT в зависимости от количества поданных примеров

In [None]:
#CoT

# Sweep the number of few-shot exemplars from 1 to 9 in 'cot' mode.
# answ_cot[k] is the per-task list returned by get_metric for k + 1 exemplars.
answ_cot = []

for i in range(1, 10):

    # Completions of each configuration go to a separate log file.
    ANSW_FILE = f'answ_cot{i}.json' 

    dataset = GSMDataset(tokenizer, data, ex, example_size=i)

    answ_cot.append(get_metric(dataset, ANSW_FILE, mode='cot'))

In [None]:
#SC

# Sweep the number of few-shot exemplars from 1 to 9 in 'sc' mode
# (get_metric's default num_samples is used).
# answ_sc[k] is the per-task list returned by get_metric for k + 1 exemplars.
answ_sc = []

for i in range(1, 10):

    # Completions of each configuration go to a separate log file.
    ANSW_FILE = f'answ_sc{i}.json'

    dataset = GSMDataset(tokenizer, data, ex, example_size=i)

    answ_sc.append(get_metric(dataset, ANSW_FILE, mode='sc'))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Side-by-side accuracy curves: mean per-task correctness against the number
# of exemplars (1..9), one panel per prompting mode.
plt.figure(figsize=(20, 10))

exemplar_counts = np.arange(1, 10)
panels = (('CoT prompting', answ_cot), ('SC prompting', answ_sc))

for position, (panel_title, per_task) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    plt.plot(exemplar_counts, np.array(per_task).mean(axis = 1))
    plt.xlabel('examplars')
    plt.ylabel('accuracy')

plt.show()

In [None]:
# np.argmax yields a 0-based position within the sweep over range(1, 10);
# adding 1 converts that position into the actual number of exemplars, which is
# what the later example_size arguments expect.
cot_best = int(np.argmax(np.array(answ_cot).mean(axis = 1))) + 1
sc_best = int(np.argmax(np.array(answ_sc).mean(axis = 1))) + 1

## Исследование качества Self-Consistency от количества сгенерированных ответов

In [None]:
# Sweep the number of sampled completions per task (4..19) in 'sc' mode with a
# fixed number of exemplars. Each entry of answ is the per-task list returned
# by get_metric for one sample count.
answ = []

dataset = GSMDataset(tokenizer, data, ex, example_size=sc_best)

for i in range(4, 20):

    # Completions of each configuration go to a separate log file.
    ANSW_FILE = f'answ_sc_samp{i}.json'

    answ.append(get_metric(dataset, ANSW_FILE, mode='sc', num_samples=i))

In [None]:
# Accuracy against the number of sampled completions (4..19).
plt.figure(figsize=(10, 10))

plt.title('SC prompting')
# answ holds one per-task list per sample count, so average over the tasks
# (axis 1) to get a single accuracy value per point on the x axis.
plt.plot(np.arange(4, 20), np.array(answ).mean(axis=1))
plt.xlabel('samples')
plt.ylabel('accuracy')

plt.show()

In [None]:
# answ holds one per-task list per sample count. Reduce each to its mean
# accuracy first; argmax over the raw 2-D matrix would return a flattened index
# instead of the best row. The sweep starts at 4 samples, hence the +4.
sc_best_samp_size = int(np.argmax(np.array(answ).mean(axis=1))) + 4

## Prompting with equations only

In [None]:
class GSMDatasetEq(torch.utils.data.Dataset):
    """Few-shot prompts whose exemplar answers contain equations only.

    Every record is a dict with a 'question' and an 'answer' string; the
    answer embeds calculator annotations of the form '<<expr=value>>' and ends
    with a line '#### <number>'. Exemplar answers are reduced to those
    equations followed by 'The answer is <number>.'.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``
            whose result is indexed with ``'input_ids'``.
        data: Sequence of records to be solved.
        exs: Sequence of records used as few-shot exemplars.
        example_size: Number of exemplars placed in the prompt prefix.
        example_ids: Optional sequence of indices into ``exs``. When omitted,
            the notebook-level ``EX_IDS`` is used (the original behaviour).
    """

    def __init__(self, tokenizer, data, exs, example_size=8, example_ids=None):

        self.data = data
        self.exs_data = exs
        self.tokenizer = tokenizer
        # The global is only consulted when the caller does not supply indices,
        # so the class can also be used without notebook state.
        self.example_ids = EX_IDS if example_ids is None else example_ids

        self.examples = self.make_examples(example_size)

    def make_examples(self, amount):
        """Return the few-shot prefix built from the first ``amount`` exemplar ids.

        Each exemplar is rendered as
        'Q: <question>\\nA: <eq1>; <eq2>; The answer is <number>.\\n'.
        No random numbers are drawn here, so the global NumPy RNG state is
        left untouched.
        """
        examples = ''

        for ex_id in self.example_ids[:amount]:

            q = self.exs_data[ex_id]['question']
            a = self.exs_data[ex_id]['answer']

            # Calculator annotations such as '<<16-3-4=9>>'.
            eqs = re.findall(r'<<.*?>>', a)
            # Final number: drop the rationale lines, the '#' marker and commas.
            number = re.sub(r'.*?\n|#|,', '', a).strip()
            res = f'The answer is {number}.'

            # Strip the angle brackets and terminate every equation with '; '.
            equations = ''.join(re.sub(r'<|>', '', eq) + '; ' for eq in eqs)

            examples += f'Q: {q}\nA: {equations}{res}\n'

        return examples

    def __len__(self):
        """Number of tasks in ``data``."""
        return len(self.data)

    def __getitem__(self, id):
        """Return ``(prompt_token_ids, gold_answer)`` for task ``id``.

        The gold answer is the number on the final '#### <number>' line,
        parsed as ``float`` after removing thousands separators.
        """
        q = self.data[id]['question']
        a = self.data[id]['answer']

        # Remove every line that ends with a newline (the rationale), then the
        # '#' marker characters and commas, leaving only the final number.
        a = float(re.sub(r'.*?\n|#|,', '', a))

        tokens = self.tokenizer(self.examples + f'Q: {q}\n', return_tensors='pt')['input_ids']

        return (tokens, a)

In [None]:
# Small equation-only dataset (2 exemplars) used to inspect the prompt format.
dataset = GSMDatasetEq(tokenizer, data, ex, example_size=2)

In [None]:
# Decode the token ids of the first task's prompt back to text to eyeball it.
print(tokenizer.decode(dataset[0][0][0]))

In [None]:
#CoT

# Equation-only exemplars evaluated in 'cot' mode with cot_best exemplars.
ANSW_FILE = 'answ_cot_eq.json'
dataset = GSMDatasetEq(tokenizer, data, ex, example_size=cot_best)

# Mean of the per-task correctness flags, i.e. the accuracy.
answ = np.mean(get_metric(dataset=dataset, answ_file=ANSW_FILE, mode='cot'))
print('\n' + str(answ))

In [None]:
#SC

# Equation-only exemplars evaluated in 'sc' mode with sc_best exemplars and
# sc_best_samp_size sampled completions per task.
ANSW_FILE = 'answ_sc_eq.json'
dataset = GSMDatasetEq(tokenizer, data, ex, example_size=sc_best)

# Mean of the per-task correctness flags, i.e. the accuracy.
answ = np.mean(
    get_metric(dataset=dataset, answ_file=ANSW_FILE, mode='sc', num_samples=sc_best_samp_size)
)
print('\n' + str(answ))

## Вывод

К сожалению, провести предложенные выше эксперименты не получилось из-за плохого доступа к BLOOM. Однако можно предположить, что результаты будут схожи с результатами GPT-3 в силу примерно равного числа параметров (176 млрд против 175 млрд).