In [4]:
!pip install langchain transformers datasets faiss-cpu sentence-transformers pypdf2 SentencePiece

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=98681dbe56245ebc9adece53ca0e383152fc2dd742029cf31284a4cf6b956678
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [5]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain import FAISS
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModel, AutoModel
from datasets import Dataset
import faiss

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
%cd /content/drive/MyDrive/dlt_results

/content/drive/MyDrive/dlt_results


## Read PDF and parse

In [9]:
pdf_file = "./APD_2019_report.pdf"
pdf_reader = PdfReader(pdf_file)
text = ""
for page in pdf_reader.pages[:20]:
  text += page.extract_text()

In [10]:
len(text)

68264

In [11]:
text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
    )
chunks = text_splitter.split_text(text)
print ("Number of chunks created",len(chunks))

Number of chunks created 166


## vectorise the chunks

In [12]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [13]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [14]:
data = {
    "text": chunks
}
# Create a Hugging Face dataset
custom_dataset = Dataset.from_dict(data)
# Map the "embeddings" column
embeddings_dataset = custom_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().numpy()[0]}
)

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

In [None]:
# import faiss

In [16]:
embeddings_dataset.add_faiss_index(column="embeddings", index_name="faiss_index")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 166
})

## Search relevant chunk with question

In [37]:
question = "What is the net income in 2019 Q2"
question_embedding = get_embeddings([question]).detach().numpy()
question_embedding.shape

(1, 768)

In [38]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "faiss_index", question_embedding, k=5
)

In [39]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)
samples_df

Unnamed: 0,text,embeddings,scores
4,"non-GAAP adjustments, which the Company does n...","[-0.24434524774551392, -0.2430412322282791, -0...",30.31814
2,"2016 Q1 Q2 Q3 Q4\nSales $1,866.3 $1,777.4 $1,9...","[-0.33531346917152405, -0.08847730606794357, -...",29.80736
3,"2016 Q1 Q2 Q3 Q4\nSales $1,866.3 $1,777.4 $1,9...","[-0.33531346917152405, -0.08847730606794357, -...",29.80736
1,each measure by consolidated sales for the res...,"[-0.38162297010421753, -0.21234361827373505, -...",27.183277
0,(C) Amounts are non-GAAP financial measures. ...,"[-0.17547699809074402, -0.31990543007850647, -...",25.842625


# Inference

In [40]:
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
import torch
TOKENIZER = T5Tokenizer.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("./finqa_finetune_t5.pth/")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
MODEL.eval();

In [42]:
input_ids

tensor([[363,  19,   8,  ...,   0,   0,   0]])

In [45]:
for row in samples_df.iterrows():
    context_i = "context: " + row[1]["text"]
    question_context_tokens = TOKENIZER(question,context_i, max_length=1024, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
    input_ids = torch.tensor(question_context_tokens["input_ids"], dtype=torch.long).view(1,-1)
    print(input_ids.shape)
    with torch.no_grad():
        generated_output = MODEL.generate(
            input_ids=input_ids,
            max_length=128,  # Set the desired maximum length for generated text
            num_beams=10,    # You can adjust the number of beams for beam search
            temperature=0.1  # You can adjust the temperature for sampling,
            )
        generated_text = TOKENIZER.decode([x for x in generated_output[0] if x!= -100], skip_special_tokens=True)
        print(generated_text)

torch.Size([1, 1024])




Step 1: Subtract 357.0 $433.5 from 518.7. This gives the result: 8.1
torch.Size([1, 1024])
Step 1: Subtract 372.0 from 366.3. This gives the result: 35.1
torch.Size([1, 1024])
Step 1: Subtract 372.0 from 366.3. This gives the result: 35.1
torch.Size([1, 1024])
Step 1: Subtract 518.7 from 357.0 $ 433.5 $3.5 433.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $3.5 33.5 $
torch.Size([1, 1024])
Step 1: Add 306.4 and 135.7. This gives the result: 104.1 ####### Step 2: Divide 104.1 by 16.3%. This gives the result: 104.1


In [44]:
print(samples_df['text'][0])

(C)  Amounts are non-GAAP financial measures. See reconciliation to the comparable GAAP measures within Item 7, Management’s Discussion and Analysis of Financial Condition and Results 
 of Operations, of the accompanying Annual Report on Form 10-K.
42863_Annual_Report_2019_Narrative.indd   5 12/4/19   12:52 PMIV
Air Products  |  2019 Annual Report2017 Q1 Q2 Q3 Q4
Sales $1,882.5 $1,980.1 $2,121.9 $2,203.1
Net income $  306.4 $2,135.7 $  104.1 $  475.0
Net income margin 16.3% 107.9% 4.9% 21.6%
