In [None]:
!pip install -U datasets

In [2]:
from datasets import load_dataset

financial_dataset = load_dataset("sujet-ai/Sujet-Finance-QA-Vision-100k")
financial_dataset = financial_dataset.remove_columns("image")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
split = financial_dataset["train"].train_test_split(test_size = 0.1)
financial_dataset['train'] = split['train']
financial_dataset['validation'] = split['test']
financial_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'content', 'qa_pairs'],
        num_rows: 8290
    })
    test: Dataset({
        features: ['doc_id', 'content', 'qa_pairs'],
        num_rows: 589
    })
    validation: Dataset({
        features: ['doc_id', 'content', 'qa_pairs'],
        num_rows: 922
    })
})

In [4]:
def evaluate(example):
    return {'qa_pairs': eval(example['qa_pairs'])}

for split in financial_dataset.keys():
    financial_dataset[split] = financial_dataset[split].map(
        evaluate
    )

Map: 100%|██████████| 8290/8290 [00:01<00:00, 6806.71 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 6406.74 examples/s]


In [6]:
from datasets import DatasetDict

def explode_pairs(examples):
    batch = []
    for i in range(len(examples['qa_pairs'])):
        for idx, dict in enumerate(examples['qa_pairs'][i]):
            new_dict = {k: v for k, v in dict.items()}
            new_dict['content'] = examples['content'][i]
            batch.append(new_dict)

    return {k: [dic[k] for dic in batch] for k in batch[0]}


exploded_dataset = DatasetDict()
for split in financial_dataset.keys():
    exploded_dataset[split] = financial_dataset[split].map(
        explode_pairs,
        remove_columns=financial_dataset[split].column_names,
        batched=True
    )

Map: 100%|██████████| 8290/8290 [00:00<00:00, 9418.17 examples/s] 
Map: 100%|██████████| 922/922 [00:00<00:00, 8625.39 examples/s]


In [None]:
exploded_dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'answer', 'question'],
        num_rows: 90520
    })
    test: Dataset({
        features: ['content', 'answer', 'question'],
        num_rows: 6421
    })
    validation: Dataset({
        features: ['content', 'answer', 'question'],
        num_rows: 10109
    })
})

In [7]:
def concatenate_text(examples):
    return {
        "qa_retrieval": examples["question"]
        + " \n "
        + examples["answer"]
    }

dataset = exploded_dataset['train'].map(concatenate_text).shuffle(seed = 42).select(range(10000))

Map: 100%|██████████| 90542/90542 [00:02<00:00, 34233.84 examples/s]


In [8]:
from transformers import AutoTokenizer, AutoModel

model_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

In [9]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [10]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding = True, truncation = True, return_tensors = "pt"
    )
    encoded_input = {k: v.to(device) for k,v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)


In [11]:
embedding = get_embeddings(dataset["qa_retrieval"][0])

In [12]:
embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x['qa_retrieval']).detach().cpu().numpy()[0]}
)


Map: 100%|██████████| 10000/10000 [01:37<00:00, 102.54 examples/s]


In [21]:
embeddings_dataset.add_faiss_index(column = "embeddings")

question = "What are the estimated costs?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

100%|██████████| 10/10 [00:00<00:00, 322.19it/s]


(1, 768)

In [22]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k = 2
)

In [25]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending = False, inplace = True)

for _, row in samples_df.iterrows():
    print(f"Content: {row.content}")
    print(f"Question: {row.question}")
    print(f"Answer: {row.answer}")
    print("=" * 50)
    print()


Content: ### Document Type:
This is an Estimate issued by Promotional Marketing, Inc.

### Key Details:
- **Recipient:** Mr. Dan Alcazar, R.J. Reynolds Tobacco Co., 401 North Main Street, Winston-Salem, NC 27102
- **Issuer:** Promotional Marketing, Inc., 152 West Huron Street, Chicago, Illinois
- **Date:** 8/23/90
- **Job Number:** 07560-C/E
- **Project:** EIP Boilerplate Base Newspaper Ads
- **Description:**
  - Creative development and mechanical art preparation for newspaper ads associated with the Exchange Initiative Program.
  - Efforts include computerized layout of 4 separate boilerplate designs to support 2 different size newspapers, and 2 ad sizes for each.
  - Mechanical art tasks include skyline assembly, type, stats, and shipping for 4 separate pieces.
- **Cost Details:**
  - Creative Development and Design: $2,440.00
  - Mechanical Art Preparation: $5,830.00
  - **Total Estimate:** $8,270.00
- **Estimated Completion Date:** Week of 9/3/90
- **Due Date for Components:** 9/0