# Question-Answering Demo using Scottish Widows Public Documents

## Environment

In [None]:
import os

import pandas as pd
import numpy as np

import faiss


# Show up to 100 rows and 100 columns when DataFrames are displayed.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [None]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel, TextEmbeddingModel


In [None]:
# Ask the gcloud CLI for the configured project. The captured shell output is
# indexed below, and its first entry is kept as the project id.
PROJECT_ID = ! gcloud config get core/project
PROJECT_ID = PROJECT_ID[0]

# NOTE(review): REGION is only displayed here; no other cell visible in this
# notebook reads it -- confirm whether it should be passed to the Vertex AI SDK.
REGION = "europe-west2"

# Display both values as the cell output.
PROJECT_ID, REGION

In [None]:
# Export the project id as the PROJECT_ID environment variable (IPython %env magic).
%env PROJECT_ID=$PROJECT_ID

## Data

Document Source: Based on Scottish Widows' literature library search:
https://adviser.scottishwidows.co.uk/literature-library.html

Specifically for this demo, the *guides* are selected:
https://adviser.scottishwidows.co.uk/literature-library.html?n=1000&filter=swe:literaturelibrary/contenttype/guides

The PDF files are scraped and saved in a local parquet file.

In [None]:
# Local parquet file holding the scraped guide pages.
all_guides_file = "../data/scottish_widows_all_guides.pq"

# Load the pages into a DataFrame.
guides_df = pd.read_parquet(all_guides_file)

# Preview the first rows.
guides_df.head()

In [None]:
# Count the page_number entries recorded for each guide title.
guides_df.groupby(["title"])[["page_number"]].count()

### Remove the blank pages

In [None]:
# Keep only the rows whose page_text is not the empty string. The index is not
# reset here, so the remaining labels may have gaps after this step.
guides_df = guides_df.loc[guides_df["page_text"]!=""]

## Embedding using Google's `TextEmbedding` Model

**Approach 1: Using the natural pages as chunks**

In [None]:
# Show the text of the first remaining page. Positional access is used because
# the blank-page filter keeps the original labels, so label 0 may be gone.
guides_df["page_text"].iloc[0]

In [None]:
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

# get_embeddings returns a list of
# vertexai.language_models._language_models.TextEmbedding objects.
# The first five pages are selected by position (the index may have gaps after
# the blank-page filter) and passed as a plain list of strings.
embeddings = model.get_embeddings(
    guides_df["page_text"].iloc[0:5].tolist()
)  # maximum 5 instances per embedding request!

# Number of embeddings returned and the type of one of them.
len(embeddings), type(embeddings[0])

In [None]:
# Inspect each returned embedding: print its shape and its first 10 components.
for embedding in embeddings:
    vector = np.array(embedding.values)
    print(vector.shape)
    print(vector[:10])

In [None]:
# Collect the raw embedding values into a one-column DataFrame named "embedding".
pd.Series([embedding.values for embedding in embeddings], name="embedding").to_frame()

In [None]:
def get_embedding_google(se, chunk_size=5, model=None):
    """Vectorise a Series of texts with Google's pretrained TextEmbeddingModel.

    The texts are sent in consecutive batches of ``chunk_size`` because, by
    default, GCP can process a maximum of 5 items in one go.

    Args:
        se: pandas Series of strings to embed.
        chunk_size: Number of texts sent per request; must be at least 1.
        model: Optional object exposing ``get_embeddings(list_of_texts)``.
            When omitted, "textembedding-gecko@001" is loaded.

    Returns:
        A Series named "embedding" that shares ``se``'s index and holds one
        float32 numpy array per input text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    if model is None:
        model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

    vectors = []
    for start in range(0, len(se), chunk_size):
        # Hand the API a plain list of strings rather than a Series slice.
        batch = se.iloc[start:start + chunk_size].tolist()
        vectors.extend(
            np.array(embedding.values, dtype="float32")
            for embedding in model.get_embeddings(batch)
        )

    return pd.Series(vectors, name="embedding", index=se.index)


**To test the embedding function**

In [None]:
# one text item each time for the embedding
# A small sample of pages is embedded; it is taken from guides_df because no
# frame called `df` with a page_text column exists at this point of the notebook.
df1 = get_embedding_google(guides_df["page_text"].iloc[0:10], 1)
# Preview the result and the first 10 components of the first vector
# (positional access, since the index may not start at label 0).
df1.head(), df1.iloc[0][0:10]

In [None]:
# using the default chunk size of 5
# A small sample of pages is embedded; it is taken from guides_df because no
# frame called `df` with a page_text column exists at this point of the notebook.
df2 = get_embedding_google(guides_df["page_text"].iloc[0:10])
# Preview the result and the first 10 components of the first vector
# (positional access, since the index may not start at label 0).
df2.head(), df2.iloc[0][0:10]

**Note: when more than one text item is sent for embedding, the model returns slightly different embedding vectors. However, they are very similar when compared using the dot product!**

In [None]:
# Dot product between the two embeddings of the same text: one obtained with
# chunk size 1 (df1) and one with the default chunk size (df2). The Series are
# paired by position so the result does not depend on the index labels.
[np.dot(single, batched) for single, batched in zip(df1, df2)]

### Embedding the whole set

In [None]:
#%%timeit -n 1 -r 1 # how long does it take? about 1 min for 1000 rows
#get_embedding_google(guides_df["page_text"].iloc[0:100])

# Embed every page and attach the vectors as a new "embedding" column;
# guides_df itself is left unchanged because assign returns a new DataFrame.
guides_embedded_df = guides_df.assign(
    embedding=get_embedding_google(guides_df["page_text"])
)
# Inspect the last 30 rows.
guides_embedded_df.tail(30)

In [None]:
# Persist the pages together with their embeddings to a local parquet file.
guides_embedded_file = "../data/scottish_widows_all_guides_embedded_v2.pq"
guides_embedded_df.to_parquet(guides_embedded_file)

In [None]:
# Compare the shapes before/after embedding and count rows with a missing embedding.
guides_df.shape, guides_embedded_df.shape, guides_embedded_df.embedding.isna().sum()

## Vector DB using Faiss

In [None]:
# Reload the embedded pages from the parquet file written earlier in the notebook.
guides_embedded_file = "../data/scottish_widows_all_guides_embedded_v2.pq"
guides_embedded_df = pd.read_parquet(guides_embedded_file)

# Inspect the last rows.
guides_embedded_df.tail()

### Assemble the vector array

In [None]:
# Move the existing index into a regular column and call it "id".
# NOTE(review): the rename only takes effect if reset_index names the new
# column "index", i.e. the loaded index is unnamed -- confirm.
guides_embedded_df = guides_embedded_df.reset_index().rename(columns={"index": "id"}) 

# Stack the per-row embedding vectors into a single 2-D array (one row per page).
embedding_array = np.vstack( guides_embedded_df.embedding )

### Build the vector DB

In [None]:
# Instantiate a flat L2 index sized to the embedding dimension
vector_length = guides_embedded_df.embedding[0].size

index = faiss.IndexFlatL2(vector_length)

# Wrap the index in an IndexIDMap so vectors can be stored under our own ids
indexed = faiss.IndexIDMap(index)

# Add the vectors together with the values of the "id" column
indexed.add_with_ids(embedding_array, guides_embedded_df.id.values)

print(f"Number of vectors in the Faiss index: {indexed.ntotal}")

## Query

In [None]:
# Sanity check: use the stored embedding of one page as the query vector.
pick_page = 3

# Slice (rather than index) so the query stays a 2-D array with a single row.
em = embedding_array[pick_page:pick_page+1, :]
# Ask the index for the 3 nearest stored vectors and their ids.
distances, ids = indexed.search(em, k=3)
print(f'L2 distance: {distances[0]}\nIDs: {ids[0]}')

In [None]:
# Show the rows whose id was returned by the search. The boolean mask yields
# rows in DataFrame order, not in the order of ids[0].
guides_embedded_df[ guides_embedded_df.id.isin(ids[0])]

In [None]:
def vector_search_google(query, index, num_results=2, model=None):
    """Embed a text query and look up its nearest neighbours in an index.

    Args:
        query: Text to search for.
        index: Object exposing ``search(vectors, k=...)``, such as the Faiss
            index built in this notebook.
        num_results: Number of neighbours requested from the index.
        model: Optional object exposing ``get_embeddings(list_of_texts)``.
            When omitted, "textembedding-gecko@001" is loaded; passing a
            model avoids reloading it on every query.

    Returns:
        distances: distances between results and query, as returned by
            ``index.search``.
        ids: IDs of the matches, as returned by ``index.search``.
    """
    if model is None:
        model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

    # The query is embedded as a one-element batch.
    embedding = model.get_embeddings([query])[0]

    # The index is searched with a 2-D float32 array holding one row.
    query_vector = np.array(embedding.values, dtype="float32").reshape(1, -1)

    distances, ids = index.search(query_vector, k=num_results)

    return distances, ids


In [None]:
# Example question for the vector search; the triple-quoted literal keeps the
# newline before and after the sentence.
user_query = """
How does the Discounted Gift & Income Trust work?
"""


In [None]:
# Search the index for the 3 pages closest to the user's question.
ds, ids = vector_search_google(user_query, indexed, num_results=3)

# Report the result row of the single query (index 0 of each returned array).
print(f'Euclidean distance: {ds[0]}\nPage IDs: {ids[0]}')

In [None]:
# Fetch the guide pages whose id was returned by the search
# (rows come back in DataFrame order, not in the order of ids[0]).
guides_embedded_df[ guides_embedded_df["id"].isin(ids[0])]

## Answer the query based on the relevant pages

In [None]:
# Use the best-ranked page as context. The first entry of ids[0] is matched
# explicitly: an `isin` mask returns rows in DataFrame order, so taking its
# first row would not necessarily give the closest page.
context = guides_embedded_df.loc[
    guides_embedded_df["id"] == ids[0][0], "page_text"
].iloc[0]
question = """How does the Discounted Gift & Income Trust work?"""

# Prompt: instructions, then the retrieved page, then the user's question.
template = f"""You are an expert having a conversation with a user.
Given the following extracted parts of a long document and a question,
create a final answer. 
{context}

user: {question}
expert:
"""

# Generation settings passed to the text model.
parameters = {
    "temperature": 0.2,
    "max_output_tokens": 256,
    "top_p": .8,
    "top_k": 40,
}

# NOTE: this rebinds the notebook-level name `model` to the text-generation model.
model = TextGenerationModel.from_pretrained("text-bison@001")
response = model.predict(template, **parameters)

print(f"Response from Model: \n{response.text}")


In [None]:
def gen_text_google(
    input_text,
    temperature: float = 0.2,
    *,
    max_output_tokens: int = 256,
    top_p: float = 0.8,
    top_k: int = 40,
    model=None,
) -> None:
    """Send a prompt to Google's text generation model and print the reply.

    Args:
        input_text: Full prompt passed to the model.
        temperature: Value forwarded as the "temperature" parameter.
        max_output_tokens: Value forwarded as "max_output_tokens".
        top_p: Value forwarded as "top_p".
        top_k: Value forwarded as "top_k".
        model: Optional object exposing ``predict(prompt, **parameters)``.
            When omitted, "text-bison@001" is loaded.

    Returns:
        None. The response text is printed, not returned.
    """
    parameters = {
        "temperature": temperature,
        "max_output_tokens": max_output_tokens,
        "top_p": top_p,
        "top_k": top_k,
    }

    if model is None:
        model = TextGenerationModel.from_pretrained("text-bison@001")

    response = model.predict(input_text, **parameters)
    print(f"Response from Model: \n{response.text}")


In [None]:
question = """How does the Discounted Gift & Income Trust work?"""

# Retrieve the 3 pages closest to the question.
ds, ids = vector_search_google(question, indexed, num_results=3)

# Use the best-ranked page as context. The first entry of ids[0] is matched
# explicitly: an `isin` mask returns rows in DataFrame order, so taking its
# first row would not necessarily give the closest page.
context = guides_embedded_df.loc[
    guides_embedded_df["id"] == ids[0][0], "page_text"
].iloc[0]

# Wording inserted into the prompt to steer the form of the answer.
style = "a concise way"
#style = "details"

text = f"""You are an expert having a conversation with a user.
Given the following extracted parts of a long document and a question,
create a final answer in {style}. 
{context}

user: {question}
expert:
"""

gen_text_google(text)

## Scratch

In [None]:
# Create a Pandas DataFrame
# (toy data for trying out splitting; note it rebinds the name `df`)
data = {'name': ['John', 'Jane', 'Mike', 'Susan', 'Peter'],
        'age': [20, 25, 30, 35, 40]}
df = pd.DataFrame(data)

df

In [None]:
# Split the frame into 5 pieces; the result is only displayed, not stored.
np.array_split(df, 5)
#np.array_split(df, 3)
#df.shape

In [None]:
# Print the first five rows of each DataFrame
# The split is done inline because no `df_list` is defined in this notebook,
# and a separate loop name is used so that `df` is not overwritten.
for part in np.array_split(df, 5):
    print(part.head())

In [None]:
# Second toy frame: two string columns and two columns of random normal values.
df = pd.DataFrame( {
     'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
     'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
     'C' : np.random.randn(8), 
     'D' : np.random.randn(8)}
)

df

In [None]:
# Split the 8 rows into 3 pieces; the result is only displayed.
np.array_split(df, 3)


In [None]:
# NOTE(review): the `df` built in the scratch cells above has columns A-D only;
# these cells presumably expect a frame with "embedding"/"title" columns
# (such as the guides data) -- confirm which frame was bound to `df`.
# Number of pieces: length floor-divided by 5, plus one.
len(df.embedding)//5+1

In [None]:
# Print every piece obtained by splitting the column into that many parts.
_ =[print(s) for s in np.array_split(df.embedding, len(df.embedding)//5+1)]


In [None]:
df

In [None]:
n=5

# Generator that lazily yields consecutive n-row slices of the "title" column.
g = (df["title"].iloc[i:i+n] for i in range(0, len(df), n))



In [None]:
%%timeit -n 1 -r 5
# Time stacking the embedding column into one 2-D array (1 loop, 5 repeats).
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept
# in the notebook namespace, so `dd` in the cells below may be undefined --
# confirm.
#dd = pd.DataFrame.from_records(guides_embedded_df.embedding) # slow
dd = np.vstack( guides_embedded_df.embedding )

dd.shape

In [None]:
dd.shape

In [None]:
dd

In [None]:
# Tiny list used to try out list() on an existing list.
a = [1,2,3]
a


In [None]:
list(a)

In [None]:
# Step-by-step version of the query embedding and index lookup.
# NOTE: this rebinds the notebook-level name `model` to the embedding model.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

# Embed the query as a one-element batch and reshape it to a single-row float32 array.
query_vector = np.array(model.get_embeddings([user_query])[0].values, dtype="float32").reshape(1, -1)

query_vector.shape
# Nearest single neighbour: D receives the distances, I the ids.
D, I = indexed.search(query_vector, k=1)


In [None]:
type(query_vector)

In [None]:
# Re-wrapping as float32; query_vector was already built with dtype="float32" above.
np.array(query_vector, dtype="float32")