<a href="https://colab.research.google.com/github/Asal2/GenAI/blob/FAISS-branch/RAG_Pipeline_for_Job_Data_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing various Packages**

The packages installed are:

*   Langchain and Langgraph Packages
*   Faiss for vector store
*   FastAPI and Uvicorn for serving the API








In [None]:
!pip install -qU langchain-openai langchain langgraph faiss-cpu fastapi uvicorn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.7/216.7 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Importing Libraries**

In [None]:
from google.colab import drive
import os
import pickle
import numpy as np
from google.colab import userdata
import pandas as pd
import re
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import faiss
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List

# **Mounting the drive and setting the OpenAI key**

In [None]:

# Mount Google Drive at /content/drive; the dataset and the embeddings cache
# used below are read from and written to paths under this mount point.
drive.mount('/content/drive')
# Fetch the OpenAI key through google.colab's userdata instead of hardcoding it.
openai_api_key = userdata.get('OPENAI_API_KEY')

Mounted at /content/drive


In [None]:
# Provider switch read by get_llm(); "openai" is the only value it checks for.
llm_provider = "openai"

In [None]:
def get_llm(model=None, temperature=0):
  """Build a chat model client for the configured ``llm_provider``.

  Args:
    model: Model name to use; falls back to "gpt-4.1-nano" when None or empty.
    temperature: Sampling temperature forwarded to the client.

  Returns:
    A ``ChatOpenAI`` instance configured with ``openai_api_key``.

  Raises:
    ValueError: If ``llm_provider`` is not "openai". Without this check the
      function would fall through and return None, which only surfaces later
      as an AttributeError when the caller invokes the model.
  """
  if llm_provider != "openai":
    raise ValueError(
        f"Unsupported llm_provider: {llm_provider!r} (expected 'openai')"
    )
  return ChatOpenAI(
      api_key=openai_api_key,
      model=model or "gpt-4.1-nano",
      temperature=temperature,
  )


# Testing if the LLM model works

In [None]:
# Smoke test: send one prompt through the default model to confirm that the
# API key and client configuration work.
response = get_llm().invoke("what is AI")
# Bare last expression so the notebook displays the reply text.
response.content

'AI, or Artificial Intelligence, refers to the development of computer systems and software that can perform tasks typically requiring human intelligence. These tasks include learning from data (machine learning), understanding natural language (natural language processing), recognizing images or speech, problem-solving, and decision-making. AI aims to create machines that can simulate human cognitive functions, enabling automation and intelligent behavior across various applications such as virtual assistants, autonomous vehicles, medical diagnosis, and more.'

# **Defining the file path where embeddings will be stored**

In [None]:
# Directory on the mounted Drive that holds the cached chunks and embeddings.
StoreEmbeddingsPath = "/content/drive/MyDrive/dataset/embeddings"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(StoreEmbeddingsPath, exist_ok=True)
# Pickle file that stores the chunk list together with its embeddings.
EmbedFile = os.path.join(StoreEmbeddingsPath, "job_embeddings.pkl")

>  Cleaning the text by removing HTML tags and making it consistent by lowercasing it and collapsing whitespace



In [None]:
def clean_text(text):
  """Normalize a raw job-description cell into plain lowercase text.

  Steps: replace HTML tags with a space, lowercase, collapse every run of
  whitespace into a single space, and trim both ends.

  Args:
    text: The cell value. Missing values (None / NaN) are accepted, and any
      other non-string scalar is converted with ``str`` first.

  Returns:
    str: The cleaned text, or "" for a missing value.
  """
  if pd.isna(text):
    return ""
  # A non-string cell (e.g. a number) would make re.sub raise TypeError.
  text = str(text)
  # [^>]* also crosses line breaks, so a tag split over several lines is
  # removed too; "." in the previous pattern stopped at the first newline.
  text = re.sub(r"<[^>]*>", " ", text)
  text = text.lower()
  return re.sub(r"\s+", " ", text).strip()

> Checking whether the embedding file already exists. If it does, the stored chunks and embeddings are loaded; if not, new ones are generated and saved. This avoids the cost of re-running the whole pipeline and of repeatedly embedding the same content.



In [None]:
# Initializing OpenAI embedding model.
# This is created before the branch on purpose: the retriever defined later in
# the notebook embeds user queries with `embedding_model`, so the name must
# exist even when chunks and embeddings are loaded from the cache. Previously
# it was only defined in the `else` branch, so a fresh kernel with an existing
# cache file failed with NameError at query time.
embedding_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-small"  # or "text-embedding-3-large"
)

if os.path.exists(EmbedFile):
  print("---Loading the pre-comuted chucks and embedding files---")
  # NOTE: pickle.load can execute arbitrary code from the file. Only load a
  # cache file that this notebook wrote itself.
  with open(EmbedFile, "rb") as f:
    saved_data = pickle.load(f)
  all_chunks = saved_data["chunks"]
  embeddings = saved_data["embeddings"]
  print(f"Loaded {len(all_chunks)} chunks and {len(embeddings)} embeddings.")
else:
  print("---No precomputed file found. Generating new chunks and embeddings---")

  df = pd.read_csv("/content/drive/MyDrive/dataset/LF_Jobs.csv")
  # The last column of the CSV is taken as the description to clean.
  df["cleaned description"] = df.iloc[:,-1].apply(clean_text)

  r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=25,
    separators=["\n\n", "\n", " ", ""]
  )

  # Split every job description into chunks, copying the job's metadata onto
  # each chunk so a retrieved chunk can be traced back to its posting.
  all_chunks = []
  for _, row in df.iterrows():

    job_id = row["ID"]
    title = row["Job Title"]
    company = row["Company Name"]
    location = row["Job Location"]
    level = row["Job Level"]

    chunks = r_splitter.split_text(row["cleaned description"])

    for i, chunk in enumerate(chunks):
      all_chunks.append({
        "id": f"{job_id}_{i}",  # unique per chunk: job id plus chunk position
        "job_id": job_id,
        "title": title,
        "company": company,
        "location": location,
        "level": level,
        "text": chunk
      })

  print(f"total chunks: {len(all_chunks)} chunks")

  # Generating embeddings (one vector per chunk text, in the same order)
  texts = [c["text"] for c in all_chunks]
  embeddings = embedding_model.embed_documents(texts)

  # Save the generated chunks and embeddings into a file
  print(f"--Sucessfully generated {len(all_chunks)} chunks and {len(embeddings)} embeddings--")
  with open(EmbedFile, "wb") as f:
    saved_data = {"chunks": all_chunks, "embeddings": embeddings}
    pickle.dump(saved_data, f)
  print("Generated chunks and embeddings saved to file.")





---No precomputed file found. Generating new chunks and embeddings---
total chunks: 11543 chunks
--Sucessfully generated11543 chunks and 11543 embeddings--
Generated chunks and embeddings saved to file.


In [None]:
import pprint

# Re-read the cache from disk to confirm that what was persisted matches
# expectations.
# NOTE: pickle.load can execute arbitrary code from the file. Only load a cache
# file that this notebook wrote itself.
with open(EmbedFile, "rb") as f:
    saved_data = pickle.load(f)
print("--here is the print out--")

print("\n--- Inspecting the Data ---")
print(f"Total number of chunks: {len(saved_data['chunks'])}")
print(f"Total number of embeddings: {len(saved_data['embeddings'])}")
print("\nFirst chunk's metadata:")
# Index 0 is the first chunk; index 1 was printed under this label before.
pprint.pprint(saved_data['chunks'][0])

--here is the print out--

--- Inspecting the Data ---
Total number of chunks: 11543
Total number of embeddings: 11543

First chunk's metadata:
{'company': 'Merrill',
 'id': 'LF0001_1',
 'job_id': 'LF0001',
 'level': 'Mid Level',
 'location': 'New York, NY',
 'text': 'physical, emotional, and financial wellness, recognizing and '
         'rewarding performance, and how we make an impact in the communities '
         'we serve. bank of america is committed to an in-office culture with '
         'specific requirements for office-based attendance and which allows '
         'for an appropriate level of flexibility for our teammates and '
         'businesses based on role-specific considerations. at bank of '
         'america, you can build a successful career with opportunities to '
         'learn, grow, and make an impact. join us! job',
 'title': 'DIR, Equities Quant'}


> Building Vector Store. For this vector store we will use FAISS



In [None]:
# Stack the embedding vectors into a float32 array with one row per chunk;
# its second dimension is used below as the FAISS index dimension.
embedding_matrix = np.array(embeddings, dtype="float32")

def normalize(vecs):
  """Scale every row of a 2-D array to unit L2 length.

  Args:
    vecs: 2-D array-like with one vector per row.

  Returns:
    np.ndarray: Same shape as ``vecs``, each row divided by its L2 norm. A row
    whose norm is zero is returned unchanged (all zeros) rather than as NaN.
  """
  vecs = np.asarray(vecs)
  norms = np.linalg.norm(vecs, axis=1, keepdims=True)
  # Dividing a zero vector by its zero norm would produce NaN rows; replacing
  # the norm with 1 leaves such rows as zeros. Done in place to keep the dtype.
  norms[norms == 0] = 1.0
  return vecs / norms

# Normalize the float32 matrix rather than the raw Python list `embeddings`:
# dividing the list produced a float64 array, whereas the index should be
# fed float32 rows. With unit-length rows the inner product used by
# IndexFlatIP equals cosine similarity.
normalized_embeddings = normalize(embedding_matrix)



# 3. Create FAISS index
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(normalized_embeddings)
print(f"FAISS index has {index.ntotal} vectors")



FAISS index has 11543 vectors




> Building a Retriever that takes a query and returns the most relevant results



In [None]:
def get_openai_embedding(text):
  """Embed one query string and return the vector as a float32 NumPy array."""
  vector = embedding_model.embed_query(text)
  return np.array(vector, dtype="float32")


def retrieve_relevant_chunks(query, index, chunks_list, k=5):
  """
  Look up the k chunks whose embeddings score highest against a query.

  Args:
    query (str): The user's search query.
    index (faiss.Index): Index that is searched with the query vector.
    chunks_list (list): Chunk dictionaries; a position returned by the index
      is used directly as an offset into this list.
    k (int): Number of neighbours requested from the index.

  Returns:
    list: One ``{"score": float, "chunk": dict}`` entry per returned position
    that is not -1, in the order the index returned them.
  """
  # Embed the query as a single-row matrix and pass it through normalize()
  unit_query = normalize(get_openai_embedding(query).reshape(1, -1))

  # Only one query row is searched, so only row 0 of each result is read
  scores, positions = index.search(unit_query, k)

  # A position of -1 marks a slot FAISS could not fill; those are skipped
  return [
      {"score": float(score), "chunk": chunks_list[pos]}
      for score, pos in zip(scores[0], positions[0])
      if pos != -1
  ]

# --- Example: run the retriever on a sample query and print each hit ---
query = "remote software engineer jobs that mention Python" # alternative query: "Senior Python Data Engineer"
top_results = retrieve_relevant_chunks(query, index, all_chunks, k=5)

# Each result carries the similarity score and the chunk dictionary, whose
# metadata (job_id, title) identifies the posting the text came from.
print(f"--- Top {len(top_results)} results for query: '{query}' ---")
for result in top_results:
    print(f"\nScore: {result['score']:.4f}")
    print(f"Job ID: {result['chunk']['job_id']}")
    print(f"Title: {result['chunk']['title']}")
    print("--- Chunk Text ---")
    print(result['chunk']['text'])

--- Top 5 results for query: 'remote software engineer jobs that mention Python' ---

Score: 0.5148
Job ID: LF0192
Title: Senior Python Data Engineer
--- Chunk Text ---
we seek a skilled python data engineer to join our team and help us build and maintain our data pipelines. as a python data engineer, you will be responsible for designing, developing, and testing python-based data pipelines and performing data cleansing, transformation and quality assurance. unlock the potential of remote work in uzbekistan, giving you the flexibility to work from home or access our office in tashkent. #li-dni #python-vacancies-uz #top-vacancies-10-uz #top-vacancies-10-uz-dec

Score: 0.5050
Job ID: LF0036
Title: AI Researcher, 2025 Graduate U.S.
--- Chunk Text ---
country of employment at the time of hire and maintain ongoing work authorization during employment. desired skills and attributes demonstrate proficiency in python programming and ability to write clean, efficient and well-documented code. h



> Building a RAG Chain which combines query + retrieved chunks to generate enriched responses.




In [None]:
# Chat model used by the RAG chain. The model name is passed explicitly even
# though it matches get_llm's default, so it is easy to swap here.
llm = get_llm(model="gpt-4.1-nano")


# 1. Create a function to format the retrieved documents
def format_docs(docs):
  """Render retrieved results as the context string for the prompt.

  Each result becomes an optional metadata header line followed by the chunk
  text; results are separated by a "---" divider. The header is what lets the
  model answer with title, company, location and level. With only the chunk
  text in the context, those fields were never visible to it.

  Args:
    docs: Iterable of ``{"chunk": {...}}`` dictionaries. Each chunk must have
      a "text" key; "title", "company", "location" and "level" are optional.

  Returns:
    str: The formatted context ("" when ``docs`` is empty). A chunk without
    any usable metadata is rendered as its text alone.
  """
  fields = (
      ("Title", "title"),
      ("Company", "company"),
      ("Location", "location"),
      ("Level", "level"),
  )
  formatted = []
  for doc in docs:
    chunk = doc["chunk"]
    parts = []
    for label, key in fields:
      value = chunk.get(key)
      # Skip missing values; `value != value` is true only for NaN.
      if value is None or value == "" or value != value:
        continue
      parts.append(f"{label}: {value}")
    header = " | ".join(parts)
    formatted.append(f"{header}\n{chunk['text']}" if header else chunk["text"])
  return "\n\n---\n\n".join(formatted)




# 2. Create a prompt template
# The template tells the LLM to answer only from the retrieved context and to
# say so when the context is insufficient. It has two placeholders, {context}
# and {question}, which the chain fills in on every invocation.
prompt_template = """
Answer the user's question based only on the following context.
If there is the answer to user question then provide information about the question, Provide the title, location, company, level of the job and salary if available.
If the context doesn't contain enough information, state that you can't find a relevant answer.
Do not make up information.

Context:
{context}

Question:
{question}

Answer:
"""

# Wrap the raw template string in a chat prompt object that can be piped into the LLM.
prompt = ChatPromptTemplate.from_template(prompt_template)



# 3. Build the RAG Chain
# Pipeline: input dict -> {context, question} -> prompt -> LLM -> plain string.
rag_chain = (
    # Input is a dict with a "question" key. "context" is built by retrieving
    # chunks for that question (default k) and formatting them with
    # format_docs; "question" is passed through unchanged.
    {"context": lambda x: format_docs(retrieve_relevant_chunks(x["question"], index, all_chunks)), "question": lambda x: x["question"]}
    | prompt
    | llm
    | StrOutputParser()
)

# 4. Invoke the chain with a query
# The chain expects a dict with a "question" key and returns the answer as a string.
query = "Are there any software engineer jobs that mention Python and AWS?"
response = rag_chain.invoke({"question": query})

print("--- RAG Response ---")
print(response)

# # Example with another query
# query_2 = "What are the responsibilities for a data scientist at Quantum Solutions?"
# response_2 = rag_chain.invoke({"question": query_2})

# print("\n--- RAG Response 2 ---")
# print(response_2)

--- RAG Response ---
Yes, there are software engineer jobs that mention Python and AWS. The context indicates roles requiring solid experience in AWS services such as S3, EC2, Lambda, and IAM, as well as proficiency in writing object-oriented and/or functional programming code in Python (e.g., numpy, pandas, scipy, scikit-learn). The roles also involve containerizing and deploying code in AWS, automation and scripting tools, and knowledge of security standards. Specific details about the title, location, company, level of the job, and salary are not provided in the context.


# API Endpoints

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List

In [None]:
app = FastAPI()


class QueryRequest(BaseModel):
  """Request body for POST /api/query."""
  query: str  # free-text search query
  k: int = 5  # number of chunks to retrieve


@app.post("/api/query")
def query_jobs(request: QueryRequest):
  """Return the top-k chunks most similar to the request's query.

  Args:
    request: Parsed request body carrying the query text and ``k``.

  Returns:
    list: The ``{"score", "chunk"}`` dictionaries produced by
    ``retrieve_relevant_chunks``.
  """
  # The endpoint previously called `retrieve_faiss_openai`, which is not
  # defined anywhere in this notebook and raised NameError on every request.
  results = retrieve_relevant_chunks(request.query, index, all_chunks, k=request.k)
  return results