In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
import pandas as pd
import os
import sys
sys.path.insert(1, 'scripts')
import torch

## Save prediction outputs

In [2]:
# Checkpoint directory of the fine-tuned model, keyed by task name.
model_checkpoint_path = dict(
    claims='model/for_claims/checkpoint/train-300-t1',
    premises='model/for_premises/checkpoint/train-300-p1',
)
# Scenario names used to select the evaluation files.
scenario = [
    'auto-driving',
    'bitcoin-invest',
]


In [3]:
# Generate predictions for every (task, scenario) pair and store them as CSV.
# Each task's checkpoint is loaded once and reused for all scenarios.
prediction_dir = 'data/prediction_results'
# Create the output folder up front so to_csv does not fail when it is missing.
os.makedirs(prediction_dir, exist_ok=True)

for task, model_path in model_checkpoint_path.items():
    # `config` is not used below; it is kept as a notebook variable for inspection.
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    for sc in scenario:
        data = pd.read_csv(os.path.join('data', task, f"evaluation-{sc}.csv"))
        texts = data.text.tolist()
        inputs = tokenizer(texts, max_length=300,
                           return_tensors='pt', padding=True, truncation=True)
        # The batch is padded to its longest text, so hand the attention mask
        # to generate() explicitly instead of relying on it being inferred
        # from the pad token.
        summary_ids = model.generate(inputs["input_ids"],
                                     attention_mask=inputs["attention_mask"])
        output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
        df = pd.DataFrame({
            'text': texts,
            # The reference column is the task name without its trailing
            # character (e.g. 'claims' -> 'claim').
            'ground-truth': data[task[:-1]].tolist(),
            'prediction': output
        })
        df.to_csv(os.path.join(prediction_dir, f'prediction-results-{sc}-{task}.csv'), index=False)
    # Drop the reference to this task's model before the next one is loaded.
    del model

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
2022-08-27 14:27:21.873102: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# from load_data import find_xml_files, load_objects, load_data
# data = load_data()

## Fill the data JSON file with the predictions

In [35]:
import json
from evaluate_data import locate_post, create_objects, insert_objects
from tqdm.notebook import tqdm

In [10]:
# Load the saved claim predictions for the auto-driving scenario and the
# dataset JSON file for the same scenario.
df = pd.read_csv(
    'data/prediction_results/prediction-results-auto-driving-claims.csv')
# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's default encoding.
with open('datasets/auto-driving/Would-you-get-into-a-self-driving-car_0820_1.json',
          encoding='utf-8') as f:
    data_file = json.load(f)


In [36]:
# Walk through the predictions in file order and insert each one into the
# post of data_file['answers'] that locate_post matches to its source text.
# NOTE(review): locate_post is assumed to return the index of the matching
# post, or -1 when nothing matches (inferred from the -1 check) -- confirm in
# evaluate_data.
target_post = 0  # index of the most recently matched post
rows = zip(df['text'], df['prediction'])
# tqdm cannot infer the length of a zip/enumerate object, so pass the total
# explicitly to get a real progress bar.
for i, (txt, prediction) in enumerate(tqdm(rows, total=len(df))):
    try:
        match_index = locate_post(data_file['answers'], txt, start_index=target_post)
        if match_index == -1:
            raise ValueError(f"No match found in {i}th post")
    except ValueError as err:
        # Report the miss and skip this prediction. Inserting anyway would
        # write it to index -1 (the last post) or to a stale index, and -1
        # would also become the start index of the next search.
        print(err)
        continue
    # Only a successful match moves the search start forward.
    target_post = match_index
    fill_objects = create_objects(prediction)
    data_file['answers'] = insert_objects(
        data_file['answers'], fill_objects, target_post, 'claim')


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

No match found in 32th post



In [4]:
# Run the claims checkpoint on one hand-written example using the model's
# default generation settings.
model_path = 'model/for_claims/checkpoint/train-300-t1'
# `config` is not used below; it is kept as a notebook variable for inspection.
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
input_text = "Try thinking of investing in bitcoin as you would buying a lottery ticket. It only costs a dollar, but you could win big. However, as historically shown with commodities, the odds are good that you're going to lose money compared with a low-cost, diversified investment.Bitcoin is open to everyone and provides an exciting opportunity to delve into an entirely new asset class. Investing in bitcoin may seem scary, but know that it takes time and effort to understand how Bitcoin works. Also keep in mind that the Bitcoin are varied. Keep that in mind, and do your own research based on where you live."

# Use the same 300-token input limit as the other tokenizer calls in this
# notebook so every inference path truncates identically.
inputs = tokenizer(input_text, max_length=300,
                   return_tensors='pt', padding=True, truncation=True)
# Pass the attention mask explicitly, matching what the tokenizer produced.
summary_ids = model.generate(inputs["input_ids"],
                             attention_mask=inputs["attention_mask"])
output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)


In [14]:
# Display the decoded result of the generate() call from the previous cell.
output

['Try thinking of investing in bitcoin as you would buying a lottery ticket. It only costs a dollar\nkeep that in mind, and do your own research based on where you live.']

In [18]:
# Generate a summary for input_text with top-k / nucleus sampling.
# Sampling draws from torch's global RNG, so seed it to make the output of
# this cell reproducible on re-run.
torch.manual_seed(42)
# truncation=True is needed for max_length to shorten the input; with
# padding=True alone the text is only padded to the longest sequence.
inputs = tokenizer(input_text, max_length=300, padding=True, truncation=True,
                   return_tensors="pt")
summary_ids = model.generate(
    inputs['input_ids'], attention_mask=inputs['attention_mask'],
    max_length=150, do_sample=True, top_p=0.95, top_k=60)
# Decode every generated sequence in one call, as the other cells do.
inferences = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

In [19]:
# Display the summaries decoded from the sampled generation.
inferences

['Try thinking of investing in bitcoin as you would buying a lottery ticket. It only costs a dollar\nbecause of it takes time and effort to understand how Bitcoin works.\nkeep that in mind, and do your own research based on where you live.']

inferences