In [30]:
# !pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets langchain-community ragatouille langchain_huggingface

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
# Install required packages (run this cell if not already installed)
!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl datasets pypdf langchain-community langchain-huggingface ragatouille

In [35]:
import numpy as np
import pandas as pd

In [36]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset

In [42]:
import os
import pandas as pd
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from typing import List

##########################
# 1. Load Resume Dataset
##########################

def load_resume_dataset(data_dir: str, csv_path: str = None) -> List[dict]:
    """
    Load resume PDFs with metadata from directory structure and optional CSV.

    Every immediate subdirectory of ``data_dir`` is treated as one category.
    All PDFs found below it are loaded and tagged with ``category``,
    ``file_name``, ``id`` (the file name without extension) and
    ``source_type``. When a CSV is supplied, the first CSV row whose ``ID``
    matches that ``id`` is merged into the document metadata.

    Args:
        data_dir: Path to directory containing categorized resumes
        csv_path: Optional path to CSV with additional metadata. A missing
            or unreadable file, or one without an ``ID`` column, is
            reported and otherwise ignored.

    Returns:
        List of loaded document objects with enriched ``metadata``.
        NOTE(review): the recorded run shows more documents than PDF files
        per category (e.g. 107 files -> 202 documents), so the loader
        presumably yields one document per page rather than per resume.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    # Load CSV data if provided
    resume_df = None
    if csv_path and os.path.exists(csv_path):
        try:
            resume_df = pd.read_csv(csv_path)
            print(f"Loaded metadata for {len(resume_df)} resumes from CSV")
        except Exception as e:
            print(f"Warning: Could not load CSV file. Proceeding without metadata: {e}")

    # Index the CSV rows once, keyed by the ID rendered as text. File-name
    # stems are strings while the CSV "ID" column is numeric, so comparing
    # them directly never matched and no CSV metadata was ever merged.
    csv_by_id = {}
    if resume_df is not None:
        if "ID" in resume_df.columns:
            for record in resume_df.to_dict(orient="records"):
                # setdefault keeps the first row for a duplicated ID.
                csv_by_id.setdefault(str(record["ID"]).strip(), record)
        else:
            print("Warning: CSV has no 'ID' column. Proceeding without metadata.")

    documents = []

    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Data directory '{data_dir}' not found")

    # Process each category subdirectory (sorted so the order of the
    # returned documents does not depend on the file system).
    for category in sorted(os.listdir(data_dir)):
        category_path = os.path.join(data_dir, category)
        if not os.path.isdir(category_path):
            continue

        print(f"Loading resumes from category: {category}")

        try:
            # Load all PDFs in category directory
            loader = DirectoryLoader(
                category_path,
                glob="**/*.pdf",
                loader_cls=PyPDFLoader,
                show_progress=True
            )
            docs = loader.load()

            # Enhance metadata
            for doc in docs:
                filename = os.path.basename(doc.metadata["source"])
                resume_id = os.path.splitext(filename)[0]

                doc.metadata.update({
                    "category": category,
                    "file_name": filename,
                    "id": resume_id,
                    "source_type": "resume"
                })

                # Merge with CSV metadata if available
                csv_record = csv_by_id.get(resume_id)
                if csv_record is not None:
                    doc.metadata.update(csv_record)

            documents.extend(docs)
            print(f"  Successfully loaded {len(docs)} resumes")

        except Exception as e:
            # Best effort: a category that fails to load is reported and skipped.
            print(f"  Error loading {category}: {str(e)}")
            continue

    print(f"\nTotal resumes loaded: {len(documents)}")
    return documents

# Example usage: Kaggle input locations of the resume PDFs and the metadata CSV
DATA_DIR = "/kaggle/input/resume-dataset/data/data"
CSV_PATH = "/kaggle/input/resume-dataset/Resume/Resume.csv"

# Bind the result name up front; a failed load is reported and leaves the
# list empty instead of stopping the notebook.
all_documents = []
try:
    all_documents = load_resume_dataset(DATA_DIR, CSV_PATH)
except Exception as load_error:
    print(f"Failed to load dataset: {load_error}")

Loaded metadata for 2484 resumes from CSV
Loading resumes from category: DESIGNER


100%|██████████| 107/107 [00:24<00:00,  4.43it/s]


  Successfully loaded 202 resumes
Loading resumes from category: BPO


100%|██████████| 22/22 [00:05<00:00,  3.71it/s]


  Successfully loaded 47 resumes
Loading resumes from category: FINANCE


100%|██████████| 118/118 [00:28<00:00,  4.09it/s]


  Successfully loaded 238 resumes
Loading resumes from category: CONSTRUCTION


100%|██████████| 112/112 [00:27<00:00,  4.02it/s]


  Successfully loaded 228 resumes
Loading resumes from category: SALES


100%|██████████| 116/116 [00:24<00:00,  4.81it/s]


  Successfully loaded 205 resumes
Loading resumes from category: AUTOMOBILE


100%|██████████| 36/36 [00:08<00:00,  4.36it/s]


  Successfully loaded 72 resumes
Loading resumes from category: CONSULTANT


100%|██████████| 115/115 [00:30<00:00,  3.82it/s]


  Successfully loaded 236 resumes
Loading resumes from category: CHEF


100%|██████████| 118/118 [00:26<00:00,  4.53it/s]


  Successfully loaded 230 resumes
Loading resumes from category: APPAREL


100%|██████████| 97/97 [00:23<00:00,  4.18it/s]


  Successfully loaded 188 resumes
Loading resumes from category: AGRICULTURE


100%|██████████| 63/63 [00:16<00:00,  3.89it/s]


  Successfully loaded 132 resumes
Loading resumes from category: TEACHER


100%|██████████| 102/102 [00:22<00:00,  4.64it/s]


  Successfully loaded 185 resumes
Loading resumes from category: HR


100%|██████████| 110/110 [00:28<00:00,  3.91it/s]


  Successfully loaded 225 resumes
Loading resumes from category: DIGITAL-MEDIA


100%|██████████| 96/96 [00:22<00:00,  4.22it/s]


  Successfully loaded 180 resumes
Loading resumes from category: ACCOUNTANT


100%|██████████| 118/118 [00:28<00:00,  4.21it/s]


  Successfully loaded 241 resumes
Loading resumes from category: HEALTHCARE


100%|██████████| 115/115 [00:30<00:00,  3.75it/s]


  Successfully loaded 238 resumes
Loading resumes from category: INFORMATION-TECHNOLOGY


100%|██████████| 120/120 [00:32<00:00,  3.67it/s]


  Successfully loaded 247 resumes
Loading resumes from category: ADVOCATE


100%|██████████| 118/118 [00:29<00:00,  3.95it/s]


  Successfully loaded 244 resumes
Loading resumes from category: FITNESS


100%|██████████| 117/117 [00:24<00:00,  4.75it/s]


  Successfully loaded 216 resumes
Loading resumes from category: AVIATION


100%|██████████| 117/117 [00:27<00:00,  4.20it/s]


  Successfully loaded 219 resumes
Loading resumes from category: PUBLIC-RELATIONS


100%|██████████| 111/111 [00:28<00:00,  3.85it/s]


  Successfully loaded 237 resumes
Loading resumes from category: ENGINEERING


100%|██████████| 118/118 [00:28<00:00,  4.09it/s]


  Successfully loaded 227 resumes
Loading resumes from category: BUSINESS-DEVELOPMENT


100%|██████████| 120/120 [00:27<00:00,  4.34it/s]


  Successfully loaded 229 resumes
Loading resumes from category: BANKING


100%|██████████| 115/115 [00:27<00:00,  4.23it/s]


  Successfully loaded 216 resumes
Loading resumes from category: ARTS


100%|██████████| 103/103 [00:23<00:00,  4.36it/s]

  Successfully loaded 199 resumes

Total resumes loaded: 4881





In [43]:
# `dataset` was used here without being defined anywhere in the notebook, so a
# fresh "Restart & Run All" stopped at this cell with a NameError.
# NOTE(review): reconstructed from the recorded outputs (2484 rows, keys such
# as ID / Resume_str / Category, i.e. the rows of the resume CSV) - confirm
# this matches the cell that was removed.
dataset = Dataset.from_pandas(pd.read_csv(CSV_PATH))

print(dataset[0])  # First document in the dataset

{'ID': 16852973, 'Resume_str': "         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strateg

In [44]:
from langchain.docstore.document import Document as LangchainDocument

# Wrap every dataset row in a LangChain document: the resume text becomes the
# content and only the job category is carried along as metadata.
RAW_KNOWLEDGE_BASE = []
for row in tqdm(dataset):
    RAW_KNOWLEDGE_BASE.append(
        LangchainDocument(
            page_content=row["Resume_str"],
            metadata={"Category": row["Category"]},
        )
    )

  0%|          | 0/2484 [00:00<?, ?it/s]

In [45]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Hierarchical list of separators, tried in order from coarsest to finest.
# This list is taken from LangChain's MarkdownTextSplitter class; several
# entries are regular expressions (e.g. "\n#{1,6} " is a Markdown heading,
# "\n---+\n" a horizontal rule).
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # The maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # The number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
    # The separators above are regex patterns. Without this flag each one is
    # escaped and matched literally, so the pattern-based entries could never
    # match real text and were silently skipped.
    is_separator_regex=True,
)

# Split every raw document into chunks in a single call; documents are split
# independently of each other, so this is the same as splitting them one by one.
docs_processed = text_splitter.split_documents(RAW_KNOWLEDGE_BASE)

In [46]:
from sentence_transformers import SentenceTransformer

# Read the maximum sequence length (in tokens) straight from the embedding
# model, to compare it against the size of the chunks produced above.
max_seq_length = SentenceTransformer("thenlper/gte-small").max_seq_length
print(f"Model's maximum sequence length: {max_seq_length}")

from transformers import AutoTokenizer

# Tokenize every chunk with the embedding model's tokenizer and record how
# many token ids each one produces.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
lengths = []
for chunk in tqdm(docs_processed):
    token_ids = tokenizer.encode(chunk.page_content)
    lengths.append(len(token_ids))

Model's maximum sequence length: 512


  0%|          | 0/19489 [00:00<?, ?it/s]

In [48]:
# Hugging Face model id of the sentence-embedding model used to build the vector index.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [50]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model settings: run on the GPU and normalise the output vectors.
# NOTE(review): the recorded outputs show fresh CUDA start-up logs and a
# "Chunks" progress bar even for a single query, presumably because
# multi_process=True starts worker processes on every embedding call - confirm
# this is wanted.
embedding_settings = dict(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)
embedding_model = HuggingFaceEmbeddings(**embedding_settings)

# Embed every chunk and build the FAISS index used for retrieval.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

2025-05-20 09:05:04.230420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747731904.253267     261 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747731904.260276     261 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Chunks:   0%|          | 0/10 [00:00<?, ?it/s]

In [51]:
# Show the vector store object as a quick check that the index was created.
KNOWLEDGE_VECTOR_DATABASE

<langchain_community.vectorstores.faiss.FAISS at 0x7cd5af59b990>

In [52]:
# Number of chunks returned per query.
TOP_K = 5

# Expose the vector store through the retriever interface.
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever(search_kwargs={"k": TOP_K})

In [53]:
# Boolean-style search string describing the target candidate profile.
# NOTE(review): the retriever is backed by an embedding index, so this text is
# presumably embedded as a whole rather than parsed as AND/OR/NOT logic -
# confirm that is the intent.
candidate_query = """
  "Machine Learning Engineer" OR "Data Scientist" AND (
    ("Python" AND ("Scikit-learn" OR "TensorFlow" OR "PyTorch"))  
    ("SQL" AND ("Spark" OR "Hadoop" OR "ETL"))  
    ("AWS" OR "GCP" OR "Azure" OR "MLOps")  
    ("Tableau" OR "Power BI" OR "data visualization")  
    ("statistical analysis" OR "A/B testing")  
    ("NLP" OR "LLM" OR "GenAI" OR "recommendation systems")  
  )  
  NOT ("intern" OR "student")  
  Years: "3+ years"  
  Location: "Remote" OR "Mumbai" OR "Bangalore" OR "Pune"  
"""

docs = retriever.invoke(candidate_query)

2025-05-20 09:05:57.440724: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747731957.463302     281 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747731957.470239     281 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [54]:
# docs = retriever.invoke("""**Job Duties:**  
# - Develop and deploy **machine learning models** to solve business problems.  
# - Clean, process, and analyze **large datasets** to extract actionable insights.  
# - Collaborate with **cross-functional teams** (engineering, product, marketing) to implement data-driven solutions.  
# - Design and maintain **data pipelines** for efficient data collection and processing.  
# - Conduct **statistical analysis and A/B testing** to measure the impact of business decisions.  
# - Build **predictive models and recommendation systems** to enhance user experience.  
# - Create **data visualizations and dashboards** to communicate findings to stakeholders.  
# - Stay updated with the latest advancements in **AI/ML** and apply best practices.  
# - Automate data workflows to improve **efficiency and scalability**.  
# - Perform other **ad-hoc data-related tasks** as required.  

# **Required Skills & Qualifications:**  
# - **Technical Skills:**  
#   - Strong proficiency in **Python (Pandas, NumPy, Scikit-learn, TensorFlow/PyTorch)**.  
#   - Experience with **SQL** and big data tools (Spark, Hadoop).  
#   - Knowledge of **statistics, machine learning, and deep learning** techniques.  
#   - Familiarity with **data visualization tools** (Tableau, Power BI, Matplotlib/Seaborn).  
#   - Experience with **cloud platforms** (AWS, GCP, or Azure) and MLOps tools.  
# - **Soft Skills:**  
#   - Strong **problem-solving and analytical thinking**.  
#   - Ability to communicate complex findings to **non-technical stakeholders**.  
#   - Team collaboration and **project management** skills.  

# **Experience:**  
# - **3+ years** of hands-on experience in **data science, machine learning, or analytics**.  
# - Proven track record of **deploying ML models in production**.  
# - Experience with **end-to-end data pipelines** (ETL, feature engineering, model deployment).  
# - Prior work in **recommendation systems, NLP, or computer vision** is a plus.  

# **Location:**  
# - **Hybrid/Remote** (with occasional office visits if required).  
# - Preferred locations: **Mumbai, Bangalore, or Pune** (open to remote for strong candidates).  

# **Additional Preferences:**  
# - Experience with **GenAI/LLMs** (e.g., OpenAI, LangChain) is a **big plus**.  
# - Knowledge of **SAP HANA, R, or T-SQL** (for roles involving enterprise data).  
# """)

In [55]:
import pprint

In [56]:
# Pretty-print the retrieved chunks (content and metadata) for manual inspection.
pprint.pp(docs)

[Document(id='4bf4d434-373b-4b8e-993d-a5ce6f661ec5', metadata={'Category': 'AUTOMOBILE', 'start_index': 5994}, page_content="Datastage jobs for small business requirements.          Education and Training        BACHELOR OF TECHNOLOGY   2011     ANNA UNIVERSITY   －   City  ,   State  ,   INDIA            BUSINESS INTELLIGENCE TRAINING   2011     TATA CONSULTANCY SERVICES   －   City  ,   State  ,   INDIA     SEPTEMBER 2011 - DECEMBER 2011\xa0   This course provides an overview that gives business and information technology professionals the confidence to dive right into their business intelligence and data warehousing activities. Hands-On training provided on ETL tools Informatica/Datastage and\xa0data warehousing environment for 90 days.          DATASTAGE TRAINING   2012     GREENS TECHNOLOGIES   －   City  ,   State  ,   INDIA   This course is designed to introduce ETL developers to Datastage Development, Data Warehousing and Data Modeling training's with real-world ETL process implem