# Connect to drive

In [None]:
# Mount Google Drive at /content/gdrive (a remount is forced on every run).
from google.colab import drive
drive.mount('/content/gdrive',  force_remount=True)
# Root folder of the project, used by the config cell to build every other path.
# NOTE(review): this is a relative path, so it presumably relies on the working
# directory being /content — confirm.
proparalogy_folder_path = 'gdrive/MyDrive/proparalogy'

Mounted at /content/gdrive


# GPU details

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    ``do_setlocale`` is accepted only so the signature matches
    ``locale.getpreferredencoding``; it is ignored.
    """
    return "UTF-8"


# Replace the stdlib lookup with the stub above for the rest of the session.
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax)
# and join them back into a single printable string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' appearing in the output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jul 25 20:46:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import gc

# Clear GPU memory
def clear_gpu_memory():
  """Free unreachable Python objects, then release PyTorch's cached CUDA memory.

  Returns:
    None.
  """
  # Imported locally so the helper also works when it is called before the
  # cell that imports torch at module level has been executed.
  import torch

  # Collect first: tensors that are only kept alive by reference cycles must be
  # destroyed before their memory blocks can be released by empty_cache().
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
from psutil import virtual_memory
# Total RAM reported by psutil, converted from bytes to (decimal) gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes below 20 GB are reported as standard, everything else as high-RAM.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


# Install Libs

In [None]:
!pip install transformers --quiet
!pip install torch --quiet
!pip install openai --quiet
!pip install accelerate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# Config

In [None]:
import io
import os
import json

def create_folder(path):
  """Ensure the directory ``path`` exists (creating parents as needed).

  Args:
    path: Directory path to create.

  Returns:
    The same ``path`` that was passed in, so the call can be used inline
    when defining path constants.

  Raises:
    FileExistsError: if ``path`` exists but is not a directory.
  """
  # exist_ok=True avoids the check-then-create race of a separate exists() test.
  os.makedirs(path, exist_ok=True)
  return path

def create_json(path):
  """Make sure a JSON file exists at ``path`` and return ``path``.

  An existing file is left untouched; a missing one is created containing
  an empty JSON object.
  """
  if os.path.isfile(path):
    return path

  print('Creating json ', path)
  empty_object = json.dumps({})
  with io.open(path, 'w+') as handle:
    handle.write(empty_object)
  return path

def _read_text_file(path):
  """Return the whole content of the text file at ``path`` and close it."""
  with open(path, 'r') as text_file:
    return text_file.read()


# Project folders (created on first use).
PROJECT_PATH = proparalogy_folder_path
ASSETS_DISTRACTORS = create_folder(PROJECT_PATH + '/distractors/')
PROMPTS_PATH = create_folder(PROJECT_PATH + '/prompts/')
MODEL_RESULTS_PATH = create_folder(PROJECT_PATH + '/models_results')
EXPERIMENTS_PATH = create_folder(PROJECT_PATH + '/experiments')
EXPERIMENTS_RESULTS_PATH = create_folder(PROJECT_PATH + '/experiments_results')

# DataFrame columns that are JSON-encoded before being written to CSV.
columns_to_serialize = ['candidates']

# Prediction caches (created as empty JSON objects when missing).
GPT_CACHE_PATH = create_json(PROJECT_PATH + '/gpt_response_cache.json')
FLAN_T5_CACHE_PATH = create_json(PROJECT_PATH + '/flan_t5_prediction_cache.json')
# The key is stripped so a trailing newline in the key file is not sent as part
# of the credential.
OPEN_AI_KEY = _read_text_file(PROJECT_PATH + '/OPEN_AI_KEY.txt').strip()

# Prompt templates are kept verbatim: their exact text is part of the cache ids.
MULTIPLE_CHOICE_PROMPT_PARAGRAPHS = _read_text_file(PROJECT_PATH + '/prompts/multiple_choice_prompt_dafna.txt')
BINARY_PROMPT_PARAGRAPHS = _read_text_file(PROJECT_PATH + '/prompts/binary_prompt_dafna.txt')

# Display name -> model identifier (OpenAI model name or Hugging Face checkpoint).
MODELS = {
    'ChatGPT': 'gpt-3.5-turbo',
    'GPT4': 'gpt-4',
    'FLAN-T5-small': 'google/flan-t5-small',
    'FLAN-T5-xl': 'google/flan-t5-xl',
    'FLAN-T5-xxl': 'google/flan-t5-xxl'
}

In [None]:
BINARY_PROMPT_PARAGRAPHS

'In this task, you\'ll be given two paragraphs that describe scientific processes. Your goal is to decide whether the processes are analogous.\nAnalogy is a mapping in which the objects of one process are structurally aligned with the objects of another.\nIt is based on similarity of the relationships between the objects and the roles they play throughout the process, and not on the similarity between object attributes.\nFor example, there is an analogy between a paragraph about "How does an electrical circuit work?", and a paragraph about "How does a mechanical water pump work?". In this analogy, electrons are mapped to water: both start at some state (low voltage/low pressure), then move through something (wire/pipe), and change their state (high voltage/high pressure) because of another object (battery/pump).\nSimilar first order relations between the domains include:\n(battery, creates, electrical voltage) like (pump, generates, pressure)\n(electrons, move through, copper wire) lik

# Utils

In [None]:


import re
import numpy as np

# Task-type identifiers: produced by get_task_type() and matched by get_task_evaluator().
MULTIPLE_CHOICE_TASK_TYPE = 'MULTIPLE_CHOICE'
BINARY_TASK_TYPE = 'BINARY'

def convert(o):
    """Convert a NumPy integer scalar to a built-in ``int``.

    The signature matches what ``json.dumps(..., default=...)`` expects from a
    fallback hook (no call site in this notebook wires it in).

    Args:
        o: Object to convert.

    Returns:
        ``int(o)`` when ``o`` is any NumPy integer type.

    Raises:
        TypeError: for every other kind of object.
    """
    # np.integer is the common base of int8..int64 and uint8..uint64.
    if isinstance(o, np.integer):
        return int(o)
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')


def dump_json(file_path, data, indent=4, sort_keys=True):
    """Serialize ``data`` as JSON and write it to ``file_path``.

    Args:
        file_path: Destination file; it is overwritten.
        data: JSON-serializable object.
        indent: Indentation passed to ``json.dumps`` (``None`` for compact output).
        sort_keys: Whether object keys are written in sorted order.

    Raises:
        TypeError: if ``data`` cannot be serialized. In that case the
            destination file is left untouched.
    """
    # Serialize before opening: opening with 'w' truncates the file, so a
    # serialization error after that point would wipe the existing content.
    string_json = json.dumps(data, indent=indent, sort_keys=sort_keys)
    with open(file_path, 'w', encoding='utf8') as f:
        f.write(string_json)

def get_json(file_path, create_if_not_exist=False):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file (read as UTF-8).
        create_if_not_exist: When true and the file is missing, an empty JSON
            object is written to ``file_path`` and ``{}`` is returned.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: if the file is missing and ``create_if_not_exist``
            is false.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    if create_if_not_exist and not os.path.isfile(file_path):
        with open(file_path, 'w', encoding='utf8') as f:
            json.dump({}, f)
        return {}
    with open(file_path, encoding="utf8") as f:
        return json.load(f)

def json_dumps_df(df, columns_to_serialize):
    """JSON-encode, in place, every listed column that exists in ``df``.

    Names that are not columns of ``df`` are skipped. The same (mutated)
    frame is returned.
    """
    present = [name for name in columns_to_serialize if name in df]
    for name in present:
        encoded = df[name].apply(json.dumps)
        df[name] = encoded
    return df

def extract_multuiple_choice_candidate_from_model_response(response):
  """Extract the single candidate label (``C<number>``) named in a model response.

  Args:
    response: Raw text returned by the model.

  Returns:
    The upper-cased label (e.g. ``'C2'``) when the response mentions exactly
    one ``c<number>`` token (case-insensitive); otherwise ``None``. On failure
    the offending response is printed for inspection.
  """
  # Explicit checks instead of assert/bare-except: asserts vanish under
  # `python -O`, and a bare except would also swallow KeyboardInterrupt.
  if not isinstance(response, str):
    print(response)
    return None

  normalized = response.strip().lower()
  response_candidates = re.findall(r'\bc\d+\b', normalized)
  if len(response_candidates) != 1:
    # Zero matches or several matches: the answer is missing or ambiguous.
    print(normalized)
    return None
  return response_candidates[0].upper()

def get_multiple_choice_task_prompt(r, candidates, is_one_shot, is_few_shot, is_relations):
    """Fill the multiple-choice prompt template for one row.

    The source paragraph of ``r`` is the first positional field, followed by
    the candidates in order. The three ``is_*`` flags are accepted but not used.
    """
    template_fields = (r['source_paragraph'], *candidates)
    return MULTIPLE_CHOICE_PROMPT_PARAGRAPHS.format(*template_fields)

def get_binary_task_prompt(r, candidates, is_one_shot, is_few_shot, is_relations):
    """Fill the binary-task prompt template for one row.

    The source paragraph of ``r`` is the first positional field, followed by
    the candidates in order. The three ``is_*`` flags are accepted but not used.
    """
    template_fields = (r['source_paragraph'], *candidates)
    return BINARY_PROMPT_PARAGRAPHS.format(*template_fields)

def get_task_type(task_name):
  """Derive the task type from a task name.

  The match is case-insensitive and the multiple-choice marker is checked
  first. Raises ``Exception`` when neither marker is present in the name.
  """
  lowered_name = task_name.lower()
  if 'multiple_choice_task' in lowered_name:
    return MULTIPLE_CHOICE_TASK_TYPE
  if 'binary_task' in lowered_name:
    return BINARY_TASK_TYPE
  raise Exception('Task name should contain either \"multiple_choice_task\" or \"binary_task\" corresponding to the task type.')

# Task Evaluators

In [None]:



import json
import random
import re
from abc import (
    ABC,
    abstractmethod,
)
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Keys of `task_map`, which maps each of them to its evaluator class.
_TASK_MULTIPLE_CHOICE = 'multiple_choice_classification'
_TASK_BINARY_CLASSIFICATION = 'binary_classification'

class TaskEvaluator(ABC):
    """Base class for task evaluators.

    Subclasses must implement ``get_predictions``, ``get_prompt`` and
    ``parse_model_prediction``. Ground-truth lookup, candidate loading and
    scoring have default implementations here.
    """

    def __init__(self, name):
        self.name = name
        self.num_of_correct_answers = None

    def get_eval_metrics(self):
        """Return a mapping of metric name -> ``metric(ground_truths, predictions)``."""
        return {'accuracy': accuracy_score, 'f1 score': f1_score}

    @abstractmethod
    def get_predictions(self, model, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Query ``model`` for row ``r``; return (prediction, label, prompt, raw response)."""
        pass

    @abstractmethod
    def get_prompt(self, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Build the prompt text for row ``r`` and its candidates."""
        pass

    @abstractmethod
    def parse_model_prediction(self, *args, **kwargs):
        """Turn a raw model answer into a prediction (signature is subclass-specific)."""
        pass

    def process_data(self, data):
        """Hook for preprocessing the loaded dataset; the default is a no-op."""
        return data

    # Concrete on purpose: the original file declared an abstract stub with the
    # same name earlier in the class, which this definition silently replaced.
    def get_ground_truth(self, r):
        """Return the ``'ground_truth'`` field of row ``r``."""
        return r['ground_truth']

    def get_scores(self, models_predictions_data):
        """Score every model with every metric of ``get_eval_metrics``.

        Args:
            models_predictions_data: ``{model: [entry, ...]}`` where each entry
                holds the prediction under the key ``model`` and the expected
                value under ``'ground_truth'``.

        Returns:
            A ``(scores, predictions)`` tuple: ``scores`` is a ``pd.Series`` with
            one ``metric_<metric>_score_<model>`` percentage (rounded to two
            decimals) per model and metric; ``predictions`` is a ``pd.DataFrame``
            built from the predictions of all models, concatenated in iteration
            order.
        """
        all_predictions = []
        mapped_scores = {}
        for model, model_predictions in models_predictions_data.items():
            ground_truths = [entry['ground_truth'] for entry in model_predictions]
            predicted = [entry[model] for entry in model_predictions]
            all_predictions.extend(predicted)

            for metric_name, metric in self.get_eval_metrics().items():
                eval_metric_score = metric(ground_truths, predicted)
                mapped_scores[f'metric_{metric_name}_score_{model}'] = round(eval_metric_score * 100, 2)
        return pd.Series(mapped_scores), pd.DataFrame(all_predictions)

    def get_row_candidates(self, r, index):
        """Return the candidates of row ``r``, decoded from its JSON ``'shuffled_candidates'`` field."""
        return json.loads(r['shuffled_candidates'])


class TaskMultipleChoiceClassification(TaskEvaluator):
    """Evaluator for the multiple-choice task.

    Candidates are referred to by 1-based labels: ``C1`` is ``candidates[0]``.
    """
    _CLASS = 'class'

    def __init__(self, name):
        super().__init__(name)
        self.num_of_correct_answers = 1

    def get_predictions(self, model, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Prompt ``model`` for row ``r`` and parse its answer.

        Returns:
            ``(chosen candidate, chosen label such as 'C2', prompt, raw model response)``.
        """
        prompt = self.get_prompt(r, candidates, is_one_shot, is_few_shot, is_relations)
        model_response = model.get_model_prediction(prompt)
        model_answer = extract_multuiple_choice_candidate_from_model_response(model_response)
        answer_candidate, answer_candidate_index = self.parse_model_prediction(model_answer, candidates, r["sample_id"], model)
        return answer_candidate, answer_candidate_index, prompt, model_response

    def get_prompt(self, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Build the multiple-choice prompt for row ``r``."""
        return get_multiple_choice_task_prompt(r, candidates, is_one_shot, is_few_shot, is_relations)

    def parse_model_prediction(self, answer, candidates, instance_id, model):
        """Map a ``C<k>`` label to its candidate, falling back to a random one.

        Args:
            answer: Label extracted from the model response, or ``None``.
            candidates: Candidates of the current row.
            instance_id: Sample id, used only in the failure message.
            model: Model object, used only in the failure message.

        Returns:
            ``(candidate, label)``. When ``answer`` is missing, malformed or
            points outside ``candidates``, a uniformly random candidate is
            returned together with its own 1-based label.
        """
        label_match = re.fullmatch(r'C(\d+)', answer) if isinstance(answer, str) else None
        if label_match is not None:
            candidate_index = int(label_match.group(1))
            if 1 <= candidate_index <= len(candidates):
                return candidates[candidate_index - 1], answer  # array index for C1 is 0

        # An out-of-range label is a model failure like any other: use the same
        # fallback instead of aborting the whole run.
        print(f'Model {model} failed to predict answer on {instance_id} choosing random candidate.')
        random_index = random.randrange(len(candidates))
        # +1 keeps the fallback label in the same 1-based scheme as real answers
        # (drawing the index also avoids list.index() picking the wrong duplicate).
        return candidates[random_index], f'C{random_index + 1}'

    def get_eval_metrics(self):
        """Only accuracy is reported for the multiple-choice task."""
        return {'accuracy': accuracy_score}

class TaskBinaryClassification(TaskEvaluator):
    """Evaluator for the binary task: the model must answer ``0`` or ``1``."""
    _CLASS = 'class'

    def __init__(self, name):
        super().__init__(name)
        self.num_of_correct_answers = 1

    def get_prompt(self, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Build the binary-task prompt for row ``r``."""
        return get_binary_task_prompt(r, candidates, is_one_shot, is_few_shot, is_relations)

    def parse_model_prediction(self, model, model_response, sample_id):
        """Parse the model response into ``0`` or ``1``.

        The ``<pad>`` and ``</s>`` markers are removed before the integer
        conversion. When the response is not exactly a 0/1 integer, a message
        is printed and a random label is returned.
        """
        # Explicit exception types: a bare except would also swallow
        # KeyboardInterrupt, and an assert would disappear under `python -O`.
        try:
            model_answer = int(model_response.replace('<pad>', '').replace('</s>', ''))
        except (AttributeError, TypeError, ValueError):
            model_answer = None

        if model_answer in (0, 1):
            return model_answer

        print(f'Model {model} failed to return answer in requsted format (sample id: {sample_id}). Instead it retruned \"{model_response}\"')
        return random.randint(0,1)

    def get_predictions(self, model, r, candidates, is_one_shot, is_few_shot, is_relations):
        """Prompt ``model`` for row ``r`` and parse its answer.

        Returns:
            ``(label, label, prompt, raw model response)`` — the parsed label
            fills both of the first two positions.
        """
        prompt = self.get_prompt(r, candidates, is_one_shot, is_few_shot, is_relations)
        model_response = model.get_model_prediction(prompt)
        answer_candidate = self.parse_model_prediction(model, model_response, r["sample_id"])
        return answer_candidate, answer_candidate, prompt, model_response

    def get_row_candidates(self, r, index):
        """The only candidate of a binary row is its target paragraph."""
        return [r['target_paragraph']]


# Evaluator class registered for each task key.
task_map = {
    _TASK_MULTIPLE_CHOICE: TaskMultipleChoiceClassification,
    _TASK_BINARY_CLASSIFICATION: TaskBinaryClassification,
}


def get_task_evaluator(task_type):
    """Return the evaluator class (not an instance) for ``task_type``.

    Raises ``Exception`` when ``task_type`` is neither the multiple-choice nor
    the binary task type.
    """
    if task_type == MULTIPLE_CHOICE_TASK_TYPE:
        evaluator_key = _TASK_MULTIPLE_CHOICE
    elif task_type == BINARY_TASK_TYPE:
        evaluator_key = _TASK_BINARY_CLASSIFICATION
    else:
        raise Exception('No TaskEvaloator found, invalid task type.')
    return task_map[evaluator_key]

# GPT utils

In [None]:
import openai

# Authenticate the openai client with the key loaded in the config cell.
openai.api_key = OPEN_AI_KEY #OPEN_AI_KEY
# In-memory copy of the on-disk GPT response cache (cache id -> stored response).
GPT_CACHE = get_json(GPT_CACHE_PATH)

def init_gpt_cache():
    """Reload the module-level ``GPT_CACHE`` from the cache file on disk."""
    global GPT_CACHE
    cache_from_disk = get_json(GPT_CACHE_PATH)
    GPT_CACHE = cache_from_disk

def get_chatgpt_3_5_multiple_choice_prediction(prompt):
    """Return the ChatGPT (gpt-3.5) answer for ``prompt``, retrying once on error.

    Args:
        prompt: Full prompt text.

    Returns:
        The text content of the model's reply.

    Raises:
        Exception: whatever the second attempt raises, if it fails as well.
    """
    try:  # in case model is overloaded
        return get_chat_gpt_prediction(prompt, MODELS['ChatGPT'])
    except Exception:
        # `Exception` (not a bare except) so that KeyboardInterrupt/SystemExit
        # stop the run instead of triggering a second API call.
        return get_chat_gpt_prediction(prompt, MODELS['ChatGPT'])

def get_gpt_4_multiple_choice_prediction(prompt):
    """Return the GPT-4 answer for ``prompt`` (no retry)."""
    gpt4_model_id = MODELS['GPT4']
    return get_chat_gpt_prediction(prompt, gpt4_model_id)

def get_chat_gpt_prediction(prompt, model):
    """Return the chat completion text for ``prompt`` from ``model``.

    A cached response is used when one exists for this prompt/model pair;
    otherwise the API is called with the prompt as a single user message and
    the full response is stored in the cache before the text is returned.
    """
    completion = get_gpt_response_from_cache(prompt, model)

    if completion is None:
        user_message = {"role": "user", "content": prompt}
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[user_message]
        )
        save_gpt_response_in_cache(prompt, model, completion)

    first_choice = completion.get('choices')[0]
    return first_choice.get('message').get('content')

def get_gpt_cache_id(prompt, model):
    """Build the cache key for a prompt/model pair.

    The key is ``<prompt>___<model>___1`` with every carriage return removed.
    """
    raw_id = "{}___{}___{}".format(prompt, model, 1)
    return raw_id.replace('\r', '')

def get_gpt_response_from_cache(prompt, model):
    """Return the cached response for this prompt/model pair, or ``None``."""
    cache_key = get_gpt_cache_id(prompt, model)
    return GPT_CACHE.get(cache_key)

def save_gpt_response_in_cache(prompt, model, response):
    """Store ``response`` in ``GPT_CACHE`` and rewrite the cache file on disk."""
    cache_key = get_gpt_cache_id(prompt, model)
    GPT_CACHE[cache_key] = response
    dump_json(GPT_CACHE_PATH, GPT_CACHE)


# FLAN T5 Utils

In [None]:
# In-memory copy of the on-disk FLAN-T5 prediction cache (cache id -> decoded output).
FLAN_CACHE = get_json(FLAN_T5_CACHE_PATH)
# This is the default value used by the model.generate() function.
# It is provided explicitly because of a deprecation warning.
MAX_TOKENS=20

def init_flan_cache():
    """Reload the module-level ``FLAN_CACHE`` from the cache file on disk."""
    global FLAN_CACHE
    cache_from_disk = get_json(FLAN_T5_CACHE_PATH)
    FLAN_CACHE = cache_from_disk

def get_flan_t5_multiple_choice_predictions(prompt, model, tokenizer, model_name):
    """Return the decoded FLAN-T5 output for ``prompt``.

    A cached prediction is returned when available. Otherwise the prompt is
    tokenized, moved to CUDA, generated with ``max_length=MAX_TOKENS`` under
    ``torch.no_grad()``, decoded (first sequence only) and stored in the cache.
    """
    cached_prediction = get_flan_t5_response_from_cache(prompt, model_name)
    if cached_prediction is not None:
        return cached_prediction

    with torch.no_grad():
        encoded_prompt = tokenizer(prompt, return_tensors="pt")
        prompt_ids = encoded_prompt.input_ids.to("cuda")
        generated = model.generate(prompt_ids, max_length=MAX_TOKENS)

    decoded = tokenizer.decode(generated[0])
    save_flan_t5_response_in_cache(prompt, model_name, decoded)
    return decoded

def get_flan_t5_cache_id(prompt, model):
    """Build the cache key for a prompt/model pair.

    The key is ``<prompt>___<model>___<MAX_TOKENS>`` with carriage returns
    removed.

    Args:
        prompt: Full prompt text.
        model: Model name used to namespace the entry.

    Returns:
        The normalized cache key.
    """
    # Normalizing here (rather than only in the lookup) guarantees that saving
    # and reading an entry always use the same key, even when the prompt
    # contains '\r'.
    return f"{prompt}___{model}___{MAX_TOKENS}".replace('\r', '')

def get_flan_t5_response_from_cache(prompt, model):
    """Return the cached prediction for this prompt/model pair, or ``None``.

    Carriage returns are removed from the cache id before the lookup.
    """
    raw_key = get_flan_t5_cache_id(prompt, model)
    cache_key = raw_key.replace('\r', '')
    return FLAN_CACHE.get(cache_key)

def save_flan_t5_response_in_cache(prompt, model, response):
    """Store ``response`` in ``FLAN_CACHE`` and rewrite the cache file on disk."""
    cache_key = get_flan_t5_cache_id(prompt, model)
    FLAN_CACHE[cache_key] = response
    dump_json(FLAN_T5_CACHE_PATH, FLAN_CACHE)




# Zero-Shot Model Class

In [None]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Device that ZeroShotModel moves locally loaded (T5) models to.
device = torch.device("cuda")
print(f"device: {device}")


class ZeroShotModel:
    """Uniform ``get_model_prediction(prompt)`` interface over the supported models.

    ``model_description`` is a key of ``MODELS``. Descriptions containing
    ``'T5'`` load a local seq2seq model and tokenizer; ``'ChatGPT'`` and
    ``'GPT4'`` are served through the GPT helper functions.
    """

    def __init__(self, model_description):
        self.model_description = model_description
        # Always defined, so API-backed models expose the same attributes as
        # locally loaded ones.
        self.model = None
        self.tokenizer = None
        if 'T5' in self.model_description:
            checkpoint = MODELS[self.model_description]
            self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.model.to(device)

    def get_model_prediction(self, prompt):
        """Return the raw text answer of the wrapped model for ``prompt``.

        Raises:
            ValueError: if ``model_description`` matches no supported model.
        """
        if self.model_description == 'ChatGPT':
            return get_chatgpt_3_5_multiple_choice_prediction(prompt)
        if self.model_description == 'GPT4':
            return get_gpt_4_multiple_choice_prediction(prompt)
        if 'T5' in self.model_description:
            return get_flan_t5_multiple_choice_predictions(prompt, self.model, self.tokenizer, self.model_description)
        # ValueError is still an Exception, so existing handlers keep working,
        # but the message now says which description was rejected.
        raise ValueError(f'Unsupported model description: {self.model_description!r}')

device: cuda


# Run Zero-shot

In [None]:
import argparse
import json
import os
import pandas as pd
from tqdm import tqdm

# Reload both prediction caches from disk before any experiment runs.
init_gpt_cache()
init_flan_cache()

def main(args):
    """Run every model in the global ``models_list`` on one task and export the results.

    Reads ``<EXPERIMENTS_PATH>/<args.task>.csv``, asks each model for a
    prediction on every row, writes scores and predictions through
    ``export_results`` and finally writes one metadata row per (row, model)
    pair to ``<EXPERIMENTS_RESULTS_PATH>/<args.task>_predictions_metadata.csv``.

    Args:
        args: Namespace providing ``task``, ``task_type``, ``one_shot``,
            ``few_shot``, ``relations`` and ``debug``.
    """
    print(f"TASK: {args.task}")

    # This is for comparing the print - remove on prod
    # print_list: ground truths plus, per model, the list of predicted labels.
    print_list = {'ground_truth': []}
    models_predictions_data = {}
    for model_name, model in models_list.items():
      print_list.update({model_name: []})
      models_predictions_data.update({model_name: []})
    # ------------------------------------------------

    # get_task_evaluator returns a class; it is instantiated with the task name.
    task_evaluator = get_task_evaluator(args.task_type)(args.task)
    df = pd.read_csv(f'{EXPERIMENTS_PATH}/{args.task}.csv', encoding='utf8')
    # args.task = task + ('_with_paragraphs' if IS_PROMPT_WITH_PARAGRAPHS else '_no_paragraphs')
    print(f"Read {len(df)}")
    df = task_evaluator.process_data(df)
    predictions_metadata = []
    for idx, (r_idx, r) in enumerate(tqdm(df.iterrows(), desc=f'Solving Dataset ({args.task})', total=len(df))):
        candidates = task_evaluator.get_row_candidates(r, idx)
        ground_truth = task_evaluator.get_ground_truth(r)
        # Remove in prod
        print_list.get('ground_truth').append(ground_truth)
        # --------------
        for model_name, model in models_list.items():
            prediction, predicted_labels, prompt, response = task_evaluator.get_predictions(model, r, candidates, args.one_shot, args.few_shot, args.relations)
            predictions_metadata.append(get_prediction_data_dictionary(r, model_name, prediction, predicted_labels, ground_truth, prompt, response))
            # Entry shape expected by TaskEvaluator.get_scores: prediction keyed by the model description.
            models_predictions_data.get(model_name).append({model.model_description: predicted_labels, 'ground_truth': ground_truth})

            # Remove in prod
            print_list.get(model_name).append(predicted_labels)
            # --------------


        # In debug mode only the first 100 rows (idx 0..99) are processed.
        if args.debug and idx == 99:
            print(f"Debug, exiting")
            break

    export_results(task_evaluator, args, models_predictions_data)

    predictions_metadata_path = os.path.join(EXPERIMENTS_RESULTS_PATH, f"{args.task}_predictions_metadata.csv")
    predictions_metadata = pd.DataFrame(predictions_metadata)
    predictions_metadata.to_csv(predictions_metadata_path, index=True, encoding='utf-8')

    ####### print list - remove in prod #########
    # For each model, print its predicted labels followed by the ground truths.
    for model_name in print_list.keys():
      if model_name == 'ground_truth':
        continue
      print(f'\n\nModel {model_name}')
      print(print_list.get(model_name))
      print(print_list.get('ground_truth'))
      print()
    # -------------------------------------------
    print("Done")


def get_prediction_data_dictionary(r, model_name, prediction, predicted_labels, ground_truth, prompt, response):
    """Build one metadata record for a (row, model) prediction.

    The record starts from the JSON round-trip of row ``r`` and is then
    extended (or overridden, for fields the row already has) with the model
    name, the prediction, its label, the ground truth, the prompt and the raw
    response.
    """
    record = json.loads(r.to_json())
    record.update({
        'model': model_name,
        'prediction': prediction,
        'prediction_index': predicted_labels,
        'ground_truth': ground_truth,
        'prompt': prompt,
        'response': response,
    })
    return record


def export_results(task_evaluator, args, models_predictions_data):
    """Print the scores of a task and write scores and predictions to CSV.

    Files are written under ``EXPERIMENTS_RESULTS_PATH`` as
    ``predictions_<task>.csv`` and ``score_results_<task>.csv``.
    """
    scores, predictions = task_evaluator.get_scores(models_predictions_data)
    predictions = json_dumps_df(predictions, columns_to_serialize)

    print(f"\n---- Results - {args.task} ----\n")
    print(scores)

    # Predictions are written without the index column...
    predictions_csv = os.path.join(EXPERIMENTS_RESULTS_PATH, f"predictions_{args.task}.csv")
    print(f"\nWriting predictions to {predictions_csv}")
    predictions.to_csv(predictions_csv, index=False, encoding='utf-8')

    # ...while scores keep it, since the index holds the metric names.
    scores_csv = os.path.join(EXPERIMENTS_RESULTS_PATH, f"score_results_{args.task}.csv")
    print(f"Writing score results to {scores_csv}")
    scores.to_csv(scores_csv, index=True, encoding='utf-8')


def initialize_models(args):
    """Make sure every requested model is loaded in the global ``models_list``.

    Models that are already present are reused, so re-running the cell does
    not reload them.

    Args:
        args: Namespace whose ``models_to_run`` lists model descriptions
            (keys of ``MODELS``).
    """
    global models_list
    # globals().get() covers both "never defined" and "explicitly None" without
    # a catch-all except that could hide unrelated errors.
    if globals().get('models_list') is None:
      models_list = {}
    for model in args.models_to_run:
      if models_list.get(model) is None:
        models_list[model] = ZeroShotModel(model)

if __name__ == '__main__':
    # One entry per experiment; each name must match a CSV in EXPERIMENTS_PATH.
    # The comma after every entry matters: adjacent string literals without one
    # are concatenated into a single (non-existent) task name.
    for task in [
      "data_for_eval_balanced_shuffled_binary_task",
      "data_for_eval_random_candidates_multiple_choice_task",
    ]:
        parser = argparse.ArgumentParser()
        print(f'Task: {task}')
        parser.add_argument('--models_to_run', nargs="+", default=list(MODELS.keys()))
        parser.add_argument('--task', default=task)
        parser.add_argument('--task_type', default=get_task_type(task))
        parser.add_argument('--one_shot', default=False)
        parser.add_argument('--few_shot', default=False)
        parser.add_argument('--relations', default=False)
        parser.add_argument("--debug", action='store_const', default=False, const=True)
        # args=[] ignores the notebook kernel's own argv, so only the defaults apply.
        args = parser.parse_args(args=[])

        print(args)
        initialize_models(args)

        main(args)
