In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [21]:
import pinecone
import openai
import mysql.connector
import json

In [22]:
mysql_config = {
    'user': os.environ.get('DB_USER'),
    'password': os.environ.get('DB_PASSWORD'),
    'host': os.environ.get('DB_HOST'),
    'database': os.environ.get('DB_NAME')
}

conn = mysql.connector.connect(**mysql_config)
cursor = conn.cursor()

query = """
    SELECT * FROM `MercorUserProfile`
"""
cursor.execute(query)

# Fetch all results
results = cursor.fetchall()

# Close MySQL connection
cursor.close()
conn.close()

In [4]:
from pinecone import Pinecone
from tqdm import tqdm

from langchain.embeddings.openai import OpenAIEmbeddings

openai_api_key = os.environ.get('OPENAI_API_KEY') or 'OPENAI_API_KEY'
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)
INDEX_NAME = "mercor-user-profiles"

index = pc.Index(INDEX_NAME)

dataset = []

def return_as_dict(result):
    return {
        "id": result[0],
        "metadata": {
            "userId": result[0],
            "name": result[1],
            "email": result[2],
            "phone": result[3],
            "fullTimeSalary": result[4],
            "partTimeSalary": result[5],
            "workExperience": result[6],
            "education": result[7],
            "skills": result[8],
        },
        "values": result[9],
    }
    
for result in tqdm(results[5:]):
    result = list(result)
    result.append(embed.embed_query(
        "Skills: " + result[8] + " Education: " + result[7] + " Work Experience: " + result[6] + " Full Time Salary: " + str(result[4]) + " Part Time Salary: " + str(result[5])
    ))
    dataset.append(return_as_dict(result))
    
    
    
import pandas as pd
df = pd.DataFrame(dataset)
df.head()

In [103]:
# UPSERT THE INDEX
index.upsert_from_dataframe(df, batch_size=20)

sending upsert requests:   0%|          | 0/20 [00:00<?, ?it/s]

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 07 Aug 2024 13:30:54 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '4', 'x-pinecone-request-id': '2503468890380648804', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 45131 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}


In [16]:
# QUERY THE INDEX
from langchain.vectorstores import Pinecone

text_field = "text"

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

query = "skills: python, java, c++ education: bachelors work experience: 5 years full time salary: 100000 part time salary: 50000"
query_embedding = embed.embed_query(query)
print(query_embedding)

index.query(vector=query_embedding, top_k=3)

# vectorstore.similarity_search(query, 4)



[0.013950139189388698, -0.019247660282660027, 0.011158753110677364, -0.014968893389067425, -0.022466923888920933, -0.0006914793583167358, -0.024640265684863518, -0.0009313110431734288, -0.03618614351014721, -0.026474023058020716, 0.008156824801597799, 0.00772894773970951, 0.004940957263082162, -0.002127498141010424, 0.0029645743578492973, -0.003153044010518409, 0.0030172099154413554, -0.033333632868633854, 0.00034892328894274696, -0.0006770470017219056, -0.023553594786892226, 0.007301071143482506, -0.009766455673405682, -0.003193794029493947, -0.016041980947380203, 0.008211158160231848, -0.013318511566961434, -0.01098896089928467, -0.005243187734637281, -0.012781968253466329, 0.026012187646986144, 0.0026097083287021207, -0.01874507578397249, -0.0010323374229219198, -0.023268343722740892, -0.01130137864275303, -0.0025299060917930373, 0.008251908179207387, 0.03490930771960188, 0.02696302514837231, 0.02895978259743166, 0.014574975607647988, 0.004988499417548241, -0.0037999527736461776, -0

{'matches': [{'id': '0c89abd8-6f0a-11ee-8bff-42010a400007',
              'score': 0.892633498,
              'values': []},
             {'id': '2a2ce1d3-bd44-4963-b9e1-94df15e57b02',
              'score': 0.890446186,
              'values': []},
             {'id': '3621ac36-bc38-4b92-b2ed-3bf3f07facf9',
              'score': 0.890169919,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-4-turbo',
    temperature=0.4
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa.run(query)

Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.


"I'm sorry, but I don't have access to specific data about candidates or their salaries. If you have a specific dataset or source you would like me to analyze, please provide more details."