In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell executes, so edits to local .py modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
import pandas as pd
import os
import sys
sys.path.insert(1, 'scripts')
import torch

## Save prediction outputs

In [2]:
# Checkpoint directory of the fine-tuned model for each task. The task name is
# also used below as the sub-directory of data/ holding the evaluation CSVs and
# as the suffix of the prediction file names.
model_checkpoint_path = dict(
    claims='model/for_claims/checkpoint/train-300-t1',
    premises='model/for_premises/checkpoint/train-300-p1',
)

# Scenarios to evaluate; one "evaluation-<scenario>.csv" is read per task.
scenario = [
    'auto-driving',
    'bitcoin-invest',
]


In [3]:
# Generate predictions for every (task, scenario) pair and write one CSV per
# pair with the columns: text, ground-truth, prediction.
# Each task has its own checkpoint, so the tokenizer/model are loaded once per
# task and reused for all scenarios.
prediction_dir = 'data/prediction_results'
# DataFrame.to_csv does not create missing directories; make sure the target
# exists so a fresh checkout does not fail after the (slow) generation step.
os.makedirs(prediction_dir, exist_ok=True)

for task, model_path in model_checkpoint_path.items():
    # NOTE(review): `config` is not used in this cell; kept in case another
    # cell inspects it.
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    for sc in scenario:
        data = pd.read_csv(os.path.join('data', task, f"evaluation-{sc}.csv"))
        texts = data.text.tolist()
        # All texts of a scenario are encoded as a single padded batch,
        # truncated to at most 300 tokens.
        inputs = tokenizer(texts, max_length=300,
                           return_tensors='pt', padding=True, truncation=True)
        # The batch is padded, so hand the attention mask to generate()
        # explicitly instead of relying on the library to infer it from the
        # pad tokens; otherwise padding may be attended to.
        summary_ids = model.generate(inputs["input_ids"],
                                     attention_mask=inputs["attention_mask"])
        output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
        df = pd.DataFrame({
            'text': texts,
            # The ground-truth column is the singular task name
            # ('claims' -> 'claim', 'premises' -> 'premise').
            'ground-truth': data[task[:-1]].tolist(),
            'prediction': output
        })
        df.to_csv(os.path.join(prediction_dir, f'prediction-results-{sc}-{task}.csv'), index=False)
    # Drop the reference to this task's model before the next one is loaded.
    del model

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
2022-08-27 14:27:21.873102: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# from load_data import find_xml_files, load_objects, load_data
# data = load_data()

## Fill the data JSON files with predictions

In [46]:
import json
from evaluate_data import locate_post, create_objects, insert_objects, pre_process_prediction
from tqdm.auto import tqdm


In [57]:
# Source web JSON file for each scenario, keyed by scenario name.
web_json_path = dict([
    ('auto-driving', 'datasets/auto-driving/Would-you-get-into-a-self-driving-car_0820_1.json'),
    ('bitcoin-invest', 'datasets/bitcoin-invest/Should-I-invest-in-Bitcoin_0815_1.json'),
])

# Singular task names: used as the key written into each answer and, with an
# "s" appended, in the prediction-results file names.
task = [
    'claim',
    'premise',
]


In [64]:
# For every scenario: load its web JSON, attach the predicted claim/premise
# objects to the answer that matches each predicted text, and save the result
# next to the source file as "<scenario>.json".
for sc, web_json in web_json_path.items():
    with open(web_json, encoding='utf-8') as f:
        data_file = json.load(f)
    for t in task:
        prediction_result_path = f'data/prediction_results/prediction-results-{sc}-{t}s.csv'
        df = pd.read_csv(prediction_result_path)

        # Index of the most recently matched answer; passed to locate_post as
        # start_index (presumably where the search resumes - confirm in
        # evaluate_data).
        target_post = 0
        for i, txt in tqdm(enumerate(df.text), total=df.shape[0]):
            try:
                match = locate_post(data_file['answers'], txt, start_index=target_post)
                if match == -1:
                    raise ValueError(f"No match found in {i}th post")
            except ValueError as err:
                # Report the row and skip it. Falling through would store the
                # prediction under index -1 (the last answer, if 'answers' is
                # a list) or overwrite the previously matched answer when
                # locate_post itself raised. target_post is left untouched so
                # the next search starts from the last successful match.
                print(err)
                continue
            target_post = match
            prediction = df.loc[i, 'prediction']
            fill_objects = create_objects(pre_process_prediction(prediction), name=t)
            data_file['answers'][target_post][t] = fill_objects
    save_path = os.path.join(os.path.dirname(web_json), f"{sc}.json")
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 regardless of the platform's default encoding.
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(data_file, fp=f, indent=4, ensure_ascii=False)


100%|██████████| 66/66 [00:25<00:00,  2.61it/s]
100%|██████████| 60/60 [00:21<00:00,  2.85it/s]
100%|██████████| 45/45 [01:15<00:00,  1.68s/it]
100%|██████████| 49/49 [01:29<00:00,  1.83s/it]


In [65]:
# Rebuild the output path from the values `web_json` and `sc` currently hold
# (they are defined outside this cell) and display it as the cell result.
parent_parts = web_json.split('/')[:-1]
save_path = os.path.join(*parent_parts, f"{sc}.json")
save_path

'datasets/bitcoin-invest/bitcoin-invest.json'