In [1]:
! pip install snowflake-connector-python
! pip install pinecone-client
! pip install python-dotenv
! pip install openai



In [3]:
from pinecone import Pinecone, PodSpec, ServerlessSpec
from dotenv import load_dotenv
import os
import snowflake.connector
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

# Load environment variables
load_dotenv(verbose=True, override=True)


True

In [4]:
os.getenv('pinecone_api_key')



'c181a9c7-04eb-4294-b9da-bdcb1ee4e502'

In [5]:
# Snowflake connection parameters
snowflake_user = os.getenv('snowflake_user')
snowflake_password = os.getenv('snowflake_password')
snowflake_account = os.getenv('snowflake_account')
database = os.getenv('snowflake_database')
schema = os.getenv('snowflake_schema')
table_name = os.getenv('table_name')

# Function to fetch data from Snowflake
def fetch_data_from_snowflake():
    ctx = snowflake.connector.connect(
        user=snowflake_user,
        password=snowflake_password,
        account=snowflake_account,
        database=database,
        schema=schema
    )
    cur = ctx.cursor()
    try:
        cur.execute(f"SELECT TITLE, CONTENT FROM {table_name}")
        rows = cur.fetchall()
        return [(row[0],row[1]) for row in rows] 
    finally:
        cur.close()
        ctx.close()

In [6]:
#Display connection results
cur = fetch_data_from_snowflake()
cur

[('A guide to clinical trials for cancer',
  'A guide to clinical trials for cancer\n     \n\n\nIf you have cancer, a clinical trial may be an option for you. A clinical trial is a study using people who agree to participate in new tests or treatments. Clinical trials help researchers know whether a new treatment works well and is safe. Trials are available for many cancers and all stages of cancer, not just advanced cancer. If you join a trial, you may get treatment that can help you. Plus, you will help others to learn more about your cancer as well as new tests or treatments. There are many things to consider before joining a trial. Learn about why you might want to enroll in a clinical trial and where to find one.What is a Clinical Trial for Cancer?\n\nClinical trials for cancer look at ways to:Prevent cancerScreen or test for cancerTreat or manage cancerReduce symptoms or side effects of cancer or cancer treatmentsA clinical trial will recruit many people to participate. During th

In [7]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv('pinecone_api_key'))
#index=pc.Index(host='https://perclias-t4qmacc.svc.gcp-starter.pinecone.io')
#index_name = 'perclias'  # Choose a unique name for your Pinecone index

# OpenAI API key
openai.api_key = os.getenv('openai_api_key')



In [8]:
pc.list_indexes()


{'indexes': [{'dimension': 1536,
              'host': 'perclias-t4qmacc.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'perclias',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [9]:
#Chunk the data
def split_text(text, chunk_size=512, chunk_overlap=96):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)
    return chunks

In [41]:
# Function to generate embeddings
def generate_embeddings(chunks):
    embeddings = []
    for chunk in tqdm(chunks):
        response = openai.Embedding.create(
            input=[chunk], 
            #model="text-embedding-3-large"
            model="text-embedding-3-small"  # Example: using a specific embedding model
        )
        embeddings.append(response['data'][0]['embedding'])
    return embeddings

# Function to remove non-ASCII characters
def remove_non_ascii(text):
    return ''.join(char for char in text if ord(char) < 128)

In [42]:
# Fetch data from Snowflake
data_pairs = fetch_data_from_snowflake()



# Split text into chunks

chunks = []
index=0
for title, content in tqdm(data_pairs):
    combined_text = title + " " + content
    chunks.extend(split_text(combined_text))



100%|██████████| 4464/4464 [00:02<00:00, 1656.32it/s]


Embedding, Pinecone Index, And Upload data



In [43]:
embeddings = generate_embeddings(chunks)

#Delete if the index exists
index_name = "perclias"
for index in pc.list_indexes():
    if index.name == index_name:
        pc.delete_index(index_name)
pc.create_index(name=index_name, dimension=len(embeddings[0]), spec=PodSpec(environment="gcp-starter"),namespace="perclias_ns")
index = pc.Index(name=index_name)

# Prepare data for insertion into Pinecone
data_to_insert = [{'id': remove_non_ascii(chunk), 'values':embedding} for chunk, embedding in tqdm(zip(chunks, embeddings))]



# Upload data in batches
for i in tqdm(data_to_insert):
    
    index.upsert(vectors=[i])


  1%|          | 475/63910 [02:01<4:30:46,  3.90it/s] 


KeyboardInterrupt: 

In [195]:
len(data_to_insert)



61893