In [3]:
import pandas as pd
import json
import re
import string
import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from rouge import Rouge

# 0. Loading Dependencies

In [1]:
# !pip install galai
!pip install pandas
!pip install -U scikit-learn
!pip install rouge

Collecting pandas
  Downloading pandas-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.0 tzdata-2023.3
[0mCollecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m14.5 MB/s[0m et

In [21]:
# !pip install transformers
# !pip install sentencepiece
# !pip install accelerate

### 0.1 Reading JSON 

In [4]:
with open('questions_file.json', 'r') as f:
    data = json.load(f)

In [5]:
data_pd = pd.DataFrame(data)
print ("Data shape is", data_pd.shape, "\n")
data_pd.head()

Data shape is (200, 2) 



Unnamed: 0,question,ground_truth
781,Four friends ordered four pizzas for a total o...,The other two pizzas cost 64-30 = <<64-30=34>...
171,Jake is walking through the Museum of Entomolo...,First find the total number of spider legs: 8...
1081,Frankie and Binkie went bowling together. Fra...,Twice Binkie's score is 2*90=<<2*90=180>>180....
1212,Ali is a dean of a private school where he tea...,Each of John’s classes has a capacity of 120 ...
797,Carrie is planning the caroling schedule. The ...,First find the total time the choir spends si...


In [6]:
# Dictionary to lists for inference 
questions = list(data["question"].values())
ground_truth = list(data["ground_truth"].values())

In [7]:
# All answers are integers 
final_answers = [int(ground_truth[i].split("\n")[-1].split(":")[-1]) for i in range(len(ground_truth))]
print ("The final answers are INTEGERS between", min(final_answers), "&", max(final_answers))

The final answers are INTEGERS between 2 & 224000


### 0.2 Preprocessing

In [8]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove whitespaces
    text = text.strip()
    text = text.replace("\n", " ")
    return text

In [9]:
pre_questions = [preprocess_text(questions[i]) for i in range(len(questions))]
pre_ground_truth = [preprocess_text(ground_truth[i]) for i in range(len(ground_truth))]

In [10]:
print("Original Question:", questions[0],"\n")
print("Pre-Processed Question:", pre_questions[0],"\n")

print("Original Ground Truth:", ground_truth[0],"\n")
print("Pre-Processed Ground Truth:", pre_ground_truth[0],"\n")

Original Question: Four friends ordered four pizzas for a total of 64 dollars. If two of the pizzas cost 30 dollars, how much did each of the other two pizzas cost if they cost the same amount? 

Pre-Processed Question: four friends ordered four pizzas for a total of 64 dollars. if two of the pizzas cost 30 dollars, how much did each of the other two pizzas cost if they cost the same amount? 

Original Ground Truth:  The other two pizzas cost 64-30 = <<64-30=34>>34 dollars.
Each of the other two pizzas cost 34/2 = <<34/2=17>>17 dollars each.
A: 17 

Pre-Processed Ground Truth: the other two pizzas cost 64-30 = <<64-30=34>>34 dollars. each of the other two pizzas cost 34/2 = <<34/2=17>>17 dollars each. a: 17 



### 0.3 Univariate Statistics

In [11]:
def count_words(sentence):
    return len(sentence.split())

def find_max_words(sentences_list):
    max_words = 0
    for sentence in sentences_list:
        word_count = count_words(sentence)
        if word_count > max_words:
            max_words = word_count
    return max_words

In [12]:
# Maximum questions and answer size
find_max_words(pre_questions)

100

In [13]:
find_max_words(pre_ground_truth)

106

### 0.4 Metrics

In [14]:
def calculate_rouge(predicted_answers, reference_answers):
    # ROUGE-L
    rouge = Rouge()
    rouge_scores = rouge.get_scores(predicted_answers, reference_answers, avg=True)
    rouge_l = rouge_scores['rouge-l']['f']
    return rouge_l

In [15]:
def calculate_metrics(predicted_answers, reference_answers):
    # Accuracy
    accuracy = accuracy_score(reference_answers, predicted_answers)

    # Precision, Recall, F1 Score
    _, _, f1, _ = precision_recall_fscore_support(reference_answers, predicted_answers, average='weighted', zero_division=0)
    
    # Mean Reciprocal Rank (MRR)
    mrr = 0
    for i, (predicted, reference) in enumerate(zip(predicted_answers, reference_answers)):
        if predicted == reference:
            mrr += 1 / (i + 1)
    mrr /= len(predicted_answers)
    return accuracy, f1, mrr

# 1. GPT NEO - 350 M

In [55]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

In [100]:
model = GPTNeoForCausalLM.from_pretrained("xhyi/PT_GPTNEO350_ATG")
tokenizer = GPT2Tokenizer.from_pretrained("xhyi/PT_GPTNEO350_ATG")

### 1.1 Inference

In [65]:
# Define a function to generate a response
def generate_response(question):
    input_text = f"question: {question}"
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")

    # Generate the output tokens
    output_tokens = model.generate(input_tokens, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [83]:
answers = []
for question in tqdm.tqdm(questions):
    answer = generate_response(question)
    answer = preprocess_text(answer.split("\nA:")[-1])
    answers.append(answer)

### 1.2 Evaluation

In [85]:
rouge_l = calculate_rouge(answers, pre_ground_truth)
print("ROUGE-L:", rouge_l)

# 2. Google Flan-T5-Base

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [15]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
model = model.to("cuda")

### 2.1 Inference

In [28]:
# Define a function to generate a response
def generate_response(question):
    input_text = f"question: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [33]:
preds = []
for question in tqdm.tqdm(pre_questions):
    pred_label = generate_response(question)
    preds.append(pred_label)

100%|██████████| 200/200 [02:18<00:00,  1.45it/s]


In [38]:
preds[0] # Not able to perform mathematical calculations

'The first pizza cost 64 - 30 = $36. The second pizza cost 36 - 30 = $36. The third pizza cost 36 - 30 = $36. The fourth pizza cost 36 - 36 = $36. The other two pizzas cost 36 - 36 = $36. The other two pizzas cost 36 - 36 = $36. The other two pizzas cost 36 - 36 = $36. The other two pizzas cost 36 -'

### 2.2 Evaluation

In [34]:
accuracy, f1, mrr = calculate_metrics(preds, pre_ground_truth)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.0
F1 Score: 0.0
Mean Reciprocal Rank (MRR): 0.0


In [35]:
rouge_l = calculate_rouge(preds, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.25710498885709737


# 3. Google Flan-T5-XL

In [109]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [110]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [111]:
model = model.to("cuda")

### 3.1 Inference

In [107]:
# Define a function to generate a response
def generate_step_by_step_response(question):
    input_text = f"Answer the following mathematical word problem by reasoning step-by-step: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [29]:
preds = []
for question in tqdm.tqdm(pre_questions):
    pred_label = generate_response(question)
    preds.append(pred_label)

100%|██████████| 200/200 [01:11<00:00,  2.80it/s]


In [52]:
accuracy, f1, mrr = calculate_metrics(preds, [str(i) for i in final_answers])
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.01
F1 Score: 0.013333333333333332
Mean Reciprocal Rank (MRR): 0.00013938466768655448


### 3.2 Evaluation

In [53]:
long_preds = []
for question in tqdm.tqdm(pre_questions):
    pred_label = generate_step_by_step_response(question)
    long_preds.append(pred_label)

100%|██████████| 200/200 [05:54<00:00,  1.77s/it]


In [54]:
rouge_l = calculate_rouge(long_preds, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.39259634785986214


# B. Prompting Strategies

### 4.1 Prompt 1 - CoT

In [108]:
"""
use the first “reasoning” prompt
to extract a full reasoning path from a language model, and then use the second “answer” prompt to
extract the answer in the correct format from the reasoning text
"""

'\nuse the first “reasoning” prompt\nto extract a full reasoning path from a language model, and then use the second “answer” prompt to\nextract the answer in the correct format from the reasoning text\n'

In [89]:
# Define a function to generate a response
def prompt_1(question):
    input_text = f"Q: {question} \n A: Let's think step by step"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [97]:
# Define a function to generate a response
def prompt_1_extraction(question, prompt_1_ans):
    input_text = f"Q: {question} \n A: Let's think step by step {prompt_1_ans} Therefore, the answer (arabic numerals) is"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [92]:
preds_prompt_1 = []
for question in tqdm.tqdm(pre_questions):
    pred_label = prompt_1(question)
    preds_prompt_1.append(pred_label)

100%|██████████| 200/200 [05:26<00:00,  1.63s/it]


In [98]:
preds_prompt_1_extrct = []
for i in tqdm.tqdm(range(len(pre_questions))):
    question = pre_questions[i]
    prompt_1_ans = preds_prompt_1[i]
    pred_label = prompt_1_extraction(question, prompt_1_ans)
    preds_prompt_1_extrct.append(pred_label)

100%|██████████| 200/200 [00:43<00:00,  4.55it/s]


### 4.1.1 Prompt 1 - Evaluation

In [93]:
rouge_l = calculate_rouge(preds_prompt_1, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.39106372368937337


In [99]:
accuracy, f1, mrr = calculate_metrics(preds_prompt_1_extrct, [str(i) for i in final_answers])
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.105
F1 Score: 0.11126984126984127
Mean Reciprocal Rank (MRR): 0.0012522336645191129


### 4.2 Prompt 2 - CoT

In [112]:
# Define a function to generate a response
def prompt_2(question):
    input_text = f"Question: {question} \n A: Let's think step by step"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [116]:
# Define a function to generate a response
def prompt_2_extraction(question, prompt_1_ans):
    input_text = f"Question: {question} \n A: Let's think step by step {prompt_1_ans} Therefore, the answer (arabic numerals) is"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [114]:
preds_prompt_2 = []
for question in tqdm.tqdm(pre_questions):
    pred_label = prompt_2(question)
    preds_prompt_2.append(pred_label)

100%|██████████| 200/200 [05:27<00:00,  1.64s/it]


In [117]:
preds_prompt_2_extrct = []
for i in tqdm.tqdm(range(len(pre_questions))):
    question = pre_questions[i]
    prompt_2_ans = preds_prompt_2[i]
    pred_label = prompt_2_extraction(question, prompt_2_ans)
    preds_prompt_2_extrct.append(pred_label)

100%|██████████| 200/200 [00:46<00:00,  4.33it/s]


### 4.2.1 Prompt 2 - Evaluation

In [118]:
rouge_l = calculate_rouge(preds_prompt_2, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.38860299268097004


In [119]:
accuracy, f1, mrr = calculate_metrics(preds_prompt_2_extrct, [str(i) for i in final_answers])
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.11
F1 Score: 0.11789069264069264
Mean Reciprocal Rank (MRR): 0.0016622039690888627


### 4.3 Prompt 3 - PoT

In [161]:
def extract_numerical_vals(text):
    integer_values = re.findall(r'\d+', text)
    return int(integer_values[-1])

In [138]:
# Define a function to generate a response
def prompt_3(question):
    input_text = f"# Question: {question} # Answer the question by implementing a solver() function \n def solver(): \n #Let's write a Python program step by step, and then return the answer"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [139]:
preds_prompt_3 = []
for question in tqdm.tqdm(pre_questions):
    pred_label = prompt_3(question)
    preds_prompt_3.append(pred_label)

100%|██████████| 200/200 [08:16<00:00,  2.48s/it]


### 4.3.1 Prompt 3 - Evaluation

In [140]:
rouge_l = calculate_rouge(preds_prompt_3, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.13898508263213807


In [162]:
preds_prompt_3_extrct = [extract_numerical_vals(i) for i in preds_prompt_3]

In [168]:
accuracy, f1, mrr = calculate_metrics([str(i) for i in preds_prompt_3_extrct], [str(i) for i in final_answers])
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.015
F1 Score: 0.012721804511278197
Mean Reciprocal Rank (MRR): 0.0006328586651167296


### 4.4.1 Prompt 4 - Target Audience

In [154]:
def extract_numerical_vals(text):
    integer_values = re.findall(r'\d+', text)
    return int(integer_values[-1])

In [141]:
# Define a function to generate a response
def prompt_4(question):
    input_text = f"# Describe the solution of this question to a 6-year-old. {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    # Generate the output tokens
    output_tokens = model.generate(input_ids, max_length=100, num_return_sequences=1)
    response_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return response_text

In [142]:
preds_prompt_4 = []
for question in tqdm.tqdm(pre_questions):
    pred_label = prompt_4(question)
    preds_prompt_4.append(pred_label)

100%|██████████| 200/200 [01:58<00:00,  1.69it/s]


### 4.4.1 Prompt 4 - Evaluation

In [143]:
rouge_l = calculate_rouge(preds_prompt_4, pre_ground_truth)
print("ROUGE-L:", rouge_l)

ROUGE-L: 0.20343307172080788


In [155]:
preds_prompt_4_extrct = [extract_numerical_vals(i) for i in preds_prompt_4]

In [165]:
accuracy, f1, mrr = calculate_metrics(preds_prompt_4_extrct, final_answers)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Mean Reciprocal Rank (MRR):", mrr)

Accuracy: 0.055
F1 Score: 0.0473997113997114
Mean Reciprocal Rank (MRR): 0.0007705984319997839


# 5. Facebook Galactica

In [16]:
!pip install galai

Collecting galai
  Downloading galai-1.1.6.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting parallelformers==1.2.7
  Downloading parallelformers-1.2.7.tar.gz (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting markdown>=3.4
  Downloading Markdown-3.4.3-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bleach[css]~=5.0.1
  Downloading bleach-5.0.1-py3-none-any.whl (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.9/160.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m

In [17]:
import galai as gal
from galai.notebook_utils import *

In [18]:
model = gal.load_model("standard")

Downloading (…)okenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/45.1k [00:00<?, ?B/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.77G [00:00<?, ?B/s]

In [19]:
# Do some experiments with 1-2 sentences and then run them for all 

In [22]:
# prompt = f"Question: A bat and a ball cost $\\$1.10$ in total. The bat costs $\\$1.00$ more than the ball. How much does the ball cost?\n\n<work>"
# display_markdown(model.generate(prompt, new_doc=True, max_new_tokens=250))