In [39]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [21]:
import pinecone
import openai
import mysql.connector
import json

In [22]:
mysql_config = {
    'user': os.environ.get('DB_USER'),
    'password': os.environ.get('DB_PASSWORD'),
    'host': os.environ.get('DB_HOST'),
    'database': os.environ.get('DB_NAME')
}

conn = mysql.connector.connect(**mysql_config)
cursor = conn.cursor()

query = """
    SELECT * FROM `MercorUserProfile`
"""
cursor.execute(query)

# Fetch all results
results = cursor.fetchall()

# Close MySQL connection
cursor.close()
conn.close()

In [84]:
from pinecone import Pinecone
from tqdm import tqdm

from langchain.embeddings.openai import OpenAIEmbeddings

openai_api_key = os.environ.get('OPENAI_API_KEY') or 'OPENAI_API_KEY'
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)
INDEX_NAME = "mercor-user-profiles"

index = pc.Index(INDEX_NAME)

dataset = []

def return_as_dict(result):
    return {
        "id": result[0],
        "metadata": {
            "userId": result[0],
            "name": result[1],
            "email": result[2],
            "phone": result[3],
            "fullTimeSalary": result[4],
            "partTimeSalary": result[5],
            "workExperience": result[6],
            "education": result[7],
            "skills": result[8],
        },
        "values": result[9],
    }
    
for result in tqdm(results[5:]):
    result = list(result)
    result.append(embed.embed_query(
        "Skills: " + result[8] + " Education: " + result[7] + " Work Experience: " + result[6] + " Full Time Salary: " + str(result[4]) + " Part Time Salary: " + str(result[5])
    ))
    dataset.append(return_as_dict(result))
    
    
    
import pandas as pd
df = pd.DataFrame(dataset)
df.head()

100%|██████████| 195/195 [01:47<00:00,  1.81it/s]


Unnamed: 0,id,metadata,values
0,001575b9-6ec3-11ee-8bff-42010a400007,{'userId': '001575b9-6ec3-11ee-8bff-42010a4000...,"[0.015586048309392938, -0.002184512085919989, ..."
1,00158a5b-6eb5-11ee-8bff-42010a400007,{'userId': '00158a5b-6eb5-11ee-8bff-42010a4000...,"[0.018540523346221686, 0.008370301415322731, 0..."
2,001c5b6a-6ea4-11ee-8bff-42010a400007,{'userId': '001c5b6a-6ea4-11ee-8bff-42010a4000...,"[0.004046677004481127, -0.0029472607553426636,..."
3,001fc599-6ec1-11ee-8bff-42010a400007,{'userId': '001fc599-6ec1-11ee-8bff-42010a4000...,"[0.014763144913046677, 0.0009588470050068443, ..."
4,002177c9-6ea9-11ee-8bff-42010a400007,{'userId': '002177c9-6ea9-11ee-8bff-42010a4000...,"[0.009064803099095329, -0.0017160296351446354,..."


In [103]:
# UPSERT THE INDEX
index.upsert_from_dataframe(df, batch_size=1)

sending upsert requests:   0%|          | 0/20 [00:00<?, ?it/s]

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 07 Aug 2024 13:30:54 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '4', 'x-pinecone-request-id': '2503468890380648804', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 45131 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}


In [109]:
# QUERY THE INDEX
from langchain.vectorstores import Pinecone

text_field = "text"

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

query = "Give me list of candidates with part time salary less than 2000"
query_embedding = embed.embed_query(query)
print(query_embedding)

index.query(vector=query_embedding, top_k=3)

# vectorstore.similarity_search(
#     query,
#     k=3
# )

[0.002593112247738291, -0.008139044465362991, 0.008260735721914528, -0.007011604053958566, -0.0508815539635281, 0.02416658994416047, -0.03410238185423518, 0.009491972027725787, -0.03559131696286527, -0.017910192524448806, 0.019413445785439697, 0.02167548475310643, 0.0007869711348843807, -0.009606506070741956, 0.01275618014649395, 0.00775965161202507, 0.015605203756312268, -0.0036257044458082362, 0.007637959889812276, -0.0003241390309672864, -0.0240520559011443, 0.002843654535513649, -0.0029116590071391864, 0.017724075170208787, 0.008010194132631057, 0.007076029220324533, 0.028117999527058114, 0.004849782373930983, -0.013858566119573274, 0.010250757734401629, 0.028203899127997725, -0.018812144853572348, -0.022620385951377258, -0.009678089381965805, -0.031009973868668755, 0.01382277446396135, 0.007824076778391037, 0.032183943152971926, 0.04879131606038563, 0.007466159290949277, 0.02130325004462639, 0.019757046051843176, 0.007652276179528039, -0.010930801053673224, -0.034188279592529756, 

{'matches': [{'id': '00e2c3ab-6eba-11ee-8bff-42010a400007',
              'score': 0.797454298,
              'values': []},
             {'id': '013b246f-6ec0-11ee-8bff-42010a400007',
              'score': 0.79315114,
              'values': []},
             {'id': '01cc73ec-6ec8-11ee-8bff-42010a400007',
              'score': 0.791656315,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [85]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-4-turbo',
    temperature=0.4
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa.run(query)

ImportError: cannot import name 'asyncio_accepts_context' from 'langchain_core.runnables.utils' (/home/aman/Desktop/Mercor/dev/env/lib/python3.10/site-packages/langchain_core/runnables/utils.py)