## Document Embedding Pipeline

In [None]:
import csv
import os

import pandas as pd
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, PodSpec
from tqdm import tqdm

In [None]:
# Load the chunked corpus into a DataFrame and preview the first rows.
# Later cells read the 'id' and 'text' columns from this table.
data = pd.read_csv(filepath_or_buffer='data_distilroberta_recursive_400_50.csv')
data.head(n=5)

In [None]:
# A local Hugging Face sentence-transformers model is used instead of OpenAI
# embeddings, which cost money.
# Alternatives that were tried: multi-qa-mpnet-base-dot-v1,
# sentence-transformers/all-MiniLM-L6-v2, all-mpnet-base-v2.
embedding_model = 'sentence-transformers/all-distilroberta-v1'

device = 'cuda:0'  # make sure you are on gpu
batch_size = 32

# Keyword arguments for loading the model and for each encode call.
model_kwargs = {'device': device}
encode_kwargs = {'device': device, 'batch_size': batch_size}

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:

# Embed every entry of the 'text' column in one call, then report how many
# vectors came back and the length of the first one.
embeddings = embed_model.embed_documents(data['text'])
doc_count = len(embeddings)
print("number of docs:", doc_count)
first_vector = embeddings[0]
print("dimension of docs:", len(first_vector))

In [None]:

# Persist the raw embedding vectors: one CSV row per document, one column per
# vector component. Requires the standard-library `csv` module to be imported.
csv_file = '00embeddings.csv'

# newline='' is required by the csv module so it controls line endings itself.
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    # writerows emits every embedding in a single call instead of a manual loop.
    writer.writerows(embeddings)

print("Embeddings saved to CSV file:", csv_file)

In [None]:
# Attach each row's vector as a new 'Embeddings' column, then write the whole
# table (without the DataFrame index) to disk.
csv_file = '0data_with_embeddings.csv'
data['Embeddings'] = embeddings
data.to_csv(path_or_buf=csv_file, index=False)

print("Data with embeddings saved to CSV file:", csv_file)

In [None]:
# Dump every document's id, full embedding vector and resource for inspection.
# Rows are matched to embeddings by position.
for i, emb in enumerate(embeddings):
    doc_id = data['id'].iloc[i]
    resource = data['resource'].iloc[i]
    print(f"ID: {doc_id}, Embedding: {emb}, source: {resource}")


In [None]:
# Connect to Pinecone; the API key is read from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
index_name = 'medical-articles-embeddings'

# The index dimension must equal the embedding size or upserts are rejected.
# Derive it from the vectors actually produced instead of hard-coding 384:
# the configured all-distilroberta-v1 model emits 768-dim vectors, and a
# literal silently goes stale whenever `embedding_model` is changed.
embedding_dim = len(embeddings[0])

# Create the index only when it does not exist yet, so this cell can be
# re-run without failing on an "already exists" error.
# NOTE: an existing index keeps its original dimension; delete it first if
# the embedding model was changed.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=embedding_dim,
        metric='cosine',
        spec=PodSpec(environment="gcp-starter"),
    )

# Open a handle to the index and show its current statistics.
index = pc.Index(index_name)
index.describe_index_stats()

In [None]:
# Embed the corpus in batches and upsert (id, vector, metadata) triples into
# the Pinecone index.
batch_size = 32

# Another cell of this notebook reads the column as data['resource'] while
# this cell originally read x['source']; only one of those can exist, so use
# whichever is present (falls back to 'source', which raises KeyError if
# neither column exists).
resource_col = 'resource' if 'resource' in data.columns else 'source'

for i in tqdm(range(0, len(data), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short.
    batch = data.iloc[i:i + batch_size]
    ids = [str(doc_id) for doc_id in batch['id']]
    texts = batch['text'].tolist()
    embeds = embed_model.embed_documents(texts)
    # Metadata stored alongside each vector; 'text' is what queries read back.
    metadata = [
        {'text': text, 'resource': resource}
        for text, resource in zip(texts, batch[resource_col])
    ]
    # Materialize the triples: upsert is documented to take a list of vectors,
    # not a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

In [None]:
# Describe the index again. This cell follows the upsert loop, so it
# presumably serves to confirm that the vectors were stored.
index.describe_index_stats()

## Question Answering Chain

In [None]:
# Inspect the similarity scores of the five best matches for a sample query.
query = 'who is Moog'

# Embed the query text, then ask the index for its nearest neighbours,
# including the stored metadata so the chunk text can be shown.
query_vector = embed_model.embed_query(query)
results = index.query(vector=query_vector, top_k=5, include_metadata=True)
for match in results['matches']:
    score = round(match['score'], 2)
    chunk_text = match['metadata']['text']
    print(f"{score}: {chunk_text}")

In [None]:
# Wrap the Pinecone index in a LangChain vector store so the top-N chunks can
# be retrieved by text query; 'text' is the metadata key holding each chunk.
# The class is imported under an alias: importing it as plain `Pinecone` would
# rebind the Pinecone client class this notebook uses to create `pc`, breaking
# that cell on re-run.
from langchain.vectorstores import Pinecone as PineconeVectorStore
vectorstore = PineconeVectorStore(index, embed_model.embed_query, 'text')

In [None]:
# Retrieve the chunks most relevant to a sample medical question.
query = 'what is the cause of CASK Disorders?'
top_k = 3  # number of most relevant chunks of text to return

vectorstore.similarity_search(query, k=top_k)