In [193]:
import pandas as pd
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from tidb_vector.integrations import TiDBVectorClient
import pymysql
import ssl
import json

import streamlit as st

from llama_index.core import SimpleDirectoryReader, StorageContext, ServiceContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.tidbvector import TiDBVectorStore


In [199]:
# Earlier experiment, kept commented out for reference only: it extracted the
# text of the first page of a sample PDF through a `PdfReader` object.
# `PdfReader` is not among the names imported in this notebook's import cell,
# so the snippet would not run as-is.
# reader = PdfReader("sample.pdf")
# number_of_pages = len(reader.pages)
# page = reader.pages[0]
# text = page.extract_text()
# print(text)

# Relative path of the PDF resume that the cells below load, embed and store.
fileName = "./pdfs/Atharva Jadhav Resume with grades.pdf"

def readPDF(name):
    """Load the PDF at ``name`` and return the loader's split documents.

    Args:
        name: Path of the PDF file; handed unchanged to ``PyPDFLoader``.

    Returns:
        The value of ``PyPDFLoader(name).load_and_split()``. In the recorded
        run this is a list of ``Document`` objects whose metadata carries the
        source path and a page number.
    """
    return PyPDFLoader(name).load_and_split()

# Load the resume; `document` is reused by the chunking, embedding and insert
# cells further down.
document = readPDF(fileName)
# Show the full list of loaded documents (metadata + text) ...
print(document)
# ... and then only the raw text of the first one.
# NOTE(review): indexing [0] assumes the loader returned at least one document.
print(document[0].page_content)

[Document(metadata={'source': './pdfs/Atharva Jadhav Resume with grades.pdf', 'page': 0}, page_content="ATHARVA JADHAV\nPortfolio Website \nGithub\nLinkedIn\nGmail\nWhatsappLINKSTechnical Skills: Python, Pandas,\nSklearn, Nltk, Sqlite, Pymongo, Pyqt6,\nJavaScript, HTML, CSS, ReactJS,\nNext.js, Tailwind CSS, Node.js,\nExpress\nSoft Skills: Problem Solving, Fast\nLearner, Communication, TeamworkSKILLS &\nPROFICIENCIESA dedicated and versatile IT Engineer\nwith a strong foundation in machine\nlearning and data science, complemented\nby expertise in web development and\nsoftware development across mobile and\ndesktop platforms. Proficient in both\nSQL and NoSQL types of databases.\nEager to leverage technical skills and\npassion for emerging technologies.PROFESSIONAL\nSUMMARYSpam Message Detector\nT he Spam Message Detector is ML model for detection\nand filtering of spam messages in textual data.\nLink: Spam Message Detector Demo\nWhatsapp Chat Analyser\nWhatsA pp Chat Analyzer is a tool 

In [123]:
def chunk_data(doc,chunk_size,chunk_overlap):
    """Split already-loaded documents into smaller pieces.

    Args:
        doc: Documents to split; passed to ``split_documents`` as-is.
        chunk_size: Forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents(doc)`` on a splitter configured with
        the two values above.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(doc)

# `chuncks` is the list of Document pieces returned by chunk_data for the
# loaded resume (chunk size 2000, overlap 1200).
chuncks = chunk_data(doc=document, chunk_size=2000, chunk_overlap=1200)
chuncks

[Document(metadata={'source': './pdfs/Atharva Jadhav Resume with grades.pdf', 'page': 0}, page_content="ATHARVA JADHAV\nPortfolio Website \nGithub\nLinkedIn\nGmail\nWhatsappLINKSTechnical Skills: Python, Pandas,\nSklearn, Nltk, Sqlite, Pymongo, Pyqt6,\nJavaScript, HTML, CSS, ReactJS,\nNext.js, Tailwind CSS, Node.js,\nExpress\nSoft Skills: Problem Solving, Fast\nLearner, Communication, TeamworkSKILLS &\nPROFICIENCIESA dedicated and versatile IT Engineer\nwith a strong foundation in machine\nlearning and data science, complemented\nby expertise in web development and\nsoftware development across mobile and\ndesktop platforms. Proficient in both\nSQL and NoSQL types of databases.\nEager to leverage technical skills and\npassion for emerging technologies.PROFESSIONAL\nSUMMARYSpam Message Detector\nT he Spam Message Detector is ML model for detection\nand filtering of spam messages in textual data.\nLink: Spam Message Detector Demo\nWhatsapp Chat Analyser\nWhatsA pp Chat Analyzer is a tool 

In [200]:
# Embedding model shared by the embedding and query cells below.
model = SentenceTransformer('all-mpnet-base-v2')

# Embedding size reported by the model; later passed to the TiDB client as its
# vector dimension.
vector_dim = model.get_sentence_embedding_dimension()

def generateEmbeddings(model,chuncks):
    """Encode the text of every item in ``chuncks`` with ``model``.

    Args:
        model: Object exposing ``encode(text)``; it is called once per item.
        chuncks: Iterable of objects that have a ``page_content`` attribute.

    Returns:
        A list holding one ``model.encode(...)`` result per item, in the same
        order as the input. An empty input yields an empty list.
    """
    return [model.encode(piece.page_content) for piece in chuncks]

# Embed every loaded page.
# NOTE(review): this passes `document` (the pages from readPDF), not the
# `chuncks` list produced by chunk_data — confirm that is intended.
embeddings = generateEmbeddings(model=model, chuncks= document)
# Keep only the first result; from here on `embeddings` names a single vector
# rather than a list (the insert cell stores it next to document[0].page_content).
embeddings = embeddings[0]

# Number of components in that vector.
lenght = len(embeddings.tolist())
# Character length of the vector serialised as compact JSON (no whitespace
# after the separators).
json_legth = len(json.dumps(embeddings.tolist(),indent=None, separators=(',', ':')))

# The recorded run printed "768 16195".
print(lenght,json_legth)



768 16195


In [203]:
# Schema that the raw SQL in the cells below relies on (columns name, text and
# embedding). Presumably the table was created by hand with this statement
# before the notebook was run — confirm against the database.
# CREATE TABLE resumes (
#   id INT AUTO_INCREMENT PRIMARY KEY,
#   name VARCHAR(255),
#   text TEXT,
#   embedding VECTOR(768) COMMENT "hnsw(distance=cosine)"
# );

vector_store = TiDBVectorClient(
   # The table which will store the vector data.
   table_name='resumes',
   # The connection string to the TiDB cluster. Indexing os.environ (instead of
   # using .get) makes this cell fail immediately with
   # KeyError('TIDB_DATABASE_URL') when the variable is not set, rather than
   # handing None to the client and failing later with a less obvious error.
   connection_string=os.environ['TIDB_DATABASE_URL'],
   # The dimension of the vector generated by the embedding model.
   vector_dimension=vector_dim,
   # Determine whether to recreate the table if it already exists.
   drop_existing_table=False,
   distance_strategy="cosine",
)


In [None]:
# Attempt to build a LlamaIndex vector index on top of the TiDB store.
# NOTE(review): this cell has no execution count, and several things should be
# verified before running it:
#   * `vector_store` is the TiDBVectorClient created in the previous cell, not
#     the TiDBVectorStore class imported from llama_index in the import cell;
#   * `model` is the SentenceTransformer instance, passed here as `embed_model`;
#   * `from_documents` is called without any documents argument.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=model)
index = VectorStoreIndex.from_documents(storage_context=storage_context,service_context=service_context)

In [201]:
def insert_resume(name, text, embedding):
    """Insert one row into the ``resumes`` table through ``vector_store``.

    Args:
        name: Value bound to the ``name`` column.
        text: Value bound to the ``text`` column.
        embedding: Value bound to the ``embedding`` column; the calling cell
            passes the vector serialised as a JSON array string.

    Returns:
        Whatever ``vector_store.execute`` returns. In the recorded run this is
        a dict with ``success``, ``result`` and ``error`` keys.
    """
    statement = "INSERT INTO resumes (name, text, embedding) VALUES (:name, :text, :embedding)"
    bound_values = {"name": name, "text": text, "embedding": embedding}
    return vector_store.execute(statement, bound_values)


In [204]:
# Store the text of the first loaded page together with its embedding, the
# latter serialised as a JSON array string.
insert_resume(
    name="taha",
    text=document[0].page_content,
    embedding=json.dumps(embeddings.tolist()),
)


{'success': True, 'result': 1, 'error': None}

In [209]:
# Free-text search string to match against the stored resumes.
query = "PINT OS - MODEL OPERATING SY STEMJ an 2016"

# Embed the query with the same model and serialise the vector as a JSON array
# string so it can be bound as a SQL parameter.
query_embedding = json.dumps(model.encode(query).tolist())

# Fetch the single row whose stored embedding has the smallest cosine distance
# to the query embedding.
result = vector_store.execute(
    "SELECT * FROM resumes ORDER BY Vec_Cosine_Distance(embedding, :query_embedding) LIMIT 1",
    {"query_embedding": query_embedding},
)
result

{'success': True,
 'result': [(1, 'Raj', "F I R S T  L A S T\nNew Y ork, NY | P: +44 123456789 | first.last@resumew orded.com\nE D U C A T I O N\nRESUME W ORDED UNIVERSITY\nBoston, MA\nBac he ... (3076 characters truncated) ...  R esume W orded examinations\nA w ards:\nR W’ s\nT op 30 Under 30 (2011); W on R W’ s nationwide\ncase competition out of  500+ par ticipants (2013)", '[0.051488753,0.047326837,-0.028571593,0.018815191,0.036526896,-0.027625812,0.040093612,0.0024160955,-0.037976105,-0.0667891,0.045880403,-0.0078414995 ... (9272 characters truncated) ... 000043208096,0.022985335,-0.0006926676,0.020845126,0.02603537,0.014882525,0.001324786,0.03557316,-0.016727772,0.0071519166,-0.052198295,-0.027697802]')],
 'error': None}

In [212]:
# Name whose rows should be removed from the `resumes` table.
name  = "Raj"
# Parameterised delete: targets every row whose `name` column equals the value
# above (the recorded run reports a result of 1).
vector_store.execute("DELETE FROM resumes WHERE name= :name",{"name":name})

{'success': True, 'result': 1, 'error': None}