In [15]:
import streamlit as st
import pandas as pd
import json
import io
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import re
import openai
from pymongo import MongoClient
import PyPDF2
import docx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dataclasses import dataclass
import logging
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.units import inch

In [16]:
# Load environment variables (e.g. OPENAI_API_KEY, read further down) from a
# local .env file, if one is present, so credentials stay out of the notebook.
from dotenv import load_dotenv
load_dotenv()

True

In [17]:
# Configure logging: INFO-level root configuration plus the module-level
# `logger` that the classes below use for their info/error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [18]:
@dataclass
class ProcessedResume:
    """Structured view of one resume after parsing.

    In this notebook instances are built by
    ``RAGJobMatcher.process_resume_with_llm`` from the JSON returned by the
    chat model. Scalar keys missing from that JSON fall back to
    'Not specified' ('' for ``summary``) and list keys fall back to [].
    """
    name: str  # full name of the person
    email: str  # email address
    phone: str  # phone number
    location: str  # city/state or other location; compared against job locations
    summary: str  # professional summary or objective
    experience_years: str  # free-text total (or estimated) years of experience
    education: List[str]  # degrees / universities
    technical_skills: List[str]  # hard skills, tools, technologies; used for skill overlap
    soft_skills: List[str]  # e.g. communication, leadership, teamwork
    work_experience: List[str]  # job titles, companies, key achievements
    certifications: List[str]  # professional certifications and licenses
    keywords: List[str]  # terms intended to help job matching
    resume_text: str  # the raw resume text that was parsed
    processed_at: str  # ISO-8601 timestamp taken when the resume was parsed

In [19]:
@dataclass
class JobMatch:
    """One ranked job recommendation for a resume.

    Built by ``RAGJobMatcher.find_matching_jobs``; the descriptive fields are
    copied from the stored job document and the rest are computed per resume.
    """
    job_id: str  # job document's 'job_id' ('' when absent)
    title: str  # job title
    category: str  # job category
    company_type: str  # kind of employer, as stored on the job document
    location: str  # job location
    similarity_score: float  # TF-IDF cosine similarity between resume text and job text
    matching_skills: List[str]  # lower-cased technical skills present in both resume and job
    missing_skills: List[str]  # lower-cased job technical skills absent from the resume (at most 5)
    job_summary: str  # job summary text
    salary_range: str  # salary range, 'Not specified' when absent
    match_reasons: List[str]  # up to 4 human-readable reasons for the match


In [20]:
class ResumeProcessor:
    """Extract plain text from resume files (PDF, DOCX, TXT).

    Every extractor is best-effort: a failure is logged through the
    module-level ``logger`` and an empty string is returned, so callers can
    treat "" as "nothing could be extracted".
    """

    @staticmethod
    def extract_text_from_pdf(file) -> str:
        """Extract text from a PDF file.

        Args:
            file: Path or binary file-like object accepted by ``PyPDF2.PdfReader``.

        Returns:
            The text of every page, each followed by a newline, or "" on failure.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file)
            # Defensive: a page without a text layer may give a falsy result;
            # treat it as an empty page instead of letting it abort the whole
            # document with a TypeError on concatenation.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
            return ""

    @staticmethod
    def extract_text_from_docx(file) -> str:
        """Extract text from a DOCX file.

        Only body paragraphs are read (``Document.paragraphs``).

        Args:
            file: Path or binary file-like object accepted by ``docx.Document``.

        Returns:
            Paragraph texts, each followed by a newline, or "" on failure.
        """
        try:
            doc = docx.Document(file)
            return "".join(f"{paragraph.text}\n" for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"Error extracting DOCX text: {e}")
            return ""

    @staticmethod
    def extract_text_from_txt(file) -> str:
        """Extract text from a TXT file.

        Args:
            file: File-like object whose ``read()`` returns ``bytes`` (binary
                upload) or ``str`` (already-decoded text stream).

        Returns:
            The decoded text, or "" on failure. A leading UTF-8 byte-order
            mark is dropped; bytes that are not valid UTF-8 are replaced with
            U+FFFD rather than discarding the whole file.
        """
        try:
            raw = file.read()
            if isinstance(raw, str):
                # Text-mode stream: nothing to decode.
                return raw
            try:
                # 'utf-8-sig' decodes plain UTF-8 and also strips a BOM.
                return raw.decode('utf-8-sig')
            except UnicodeDecodeError:
                logger.warning("TXT file is not valid UTF-8; replacing undecodable bytes")
                return raw.decode('utf-8-sig', errors='replace')
        except Exception as e:
            logger.error(f"Error extracting TXT text: {e}")
            return ""

In [21]:
# In-memory resume used as a stand-in for an uploaded text file
sample_resume_text = """
John Doe
john.doe@email.com
(123) 456-7890
San Francisco, CA
Summary: Experienced software engineer with 5 years in Python and Java development.
Education: B.S. Computer Science, Stanford University
Technical Skills: Python, Java, SQL, Docker
Soft Skills: Communication, Teamwork
Work Experience: Software Engineer at Tech Corp, Developed web applications
Certifications: AWS Certified Developer
"""

# Helper for wrapping the sample text as a binary upload
from io import BytesIO

# TXT variant of the check (kept for reference, currently disabled):
#   txt_file = BytesIO(sample_resume_text.encode('utf-8'))
#   resume_text = ResumeProcessor.extract_text_from_txt(txt_file)
#   print("Extracted Resume Text:")
#   print(resume_text)

# PDF variant: needs a real PDF at this relative path.
with open("experiments/Husna_intern.pdf", "rb") as pdf_file:
    resume_text = ResumeProcessor.extract_text_from_pdf(pdf_file)

# The extractor has already consumed the file, so printing after the handle
# is closed shows the same text.
print(resume_text)

Fathima Husna
♂¶ap-¶arker-altSri Lanka /envel⌢pehusnasameen016@gmail.com ♂phone-alt750 825 934ὑ7@codenebulax6 /linkedin-infathima-husna/
/githubFathimaHusna
Summary
Highly motivated and detail-oriented AI enthusiast with hands-on experience as an AI Intern, spe-
cializing in developing and evaluating machine learning models for real-world applications. Skilled
in Python, data preprocessing, model training, and libraries such as Scikit-learn, TensorFlow,
and PyTorch. Demonstrated ability to work on NLP, computer vision, and predictive analytics
projects. Passionate about solving complex problems through AI-driven solutions and continu-
ously learning emerging technologies. Seeking to contribute to innovative AI projects in a dynamic
and growth-oriented environment
Education
University of Moratuwa
BSc(Hons) in Information TechnologyJan 2020 – June 2024
◦Coursework: Machine Learning, Artificial Neural Networks, Natural Language Processing, Digital Image
Processing, Big Data, Data Mining
E

In [22]:
class RAGJobMatcher:
    """RAG-based job matching system.

    Job descriptions are read from the ``job_descriptions`` MongoDB collection
    and vectorized with TF-IDF. A resume is turned into structured fields by
    an OpenAI chat model and then ranked against the jobs by cosine
    similarity. Failures are logged and reported through empty/None results
    rather than raised.
    """

    def __init__(self, openai_api_key: str, mongo_uri: str, database_name: str):
        """Initialize the RAG job matcher.

        Args:
            openai_api_key: API key passed to ``openai.OpenAI``.
            mongo_uri: Connection string passed to ``MongoClient``.
            database_name: Database holding the ``job_descriptions`` collection.
        """
        self.client = openai.OpenAI(api_key=openai_api_key)
        self.mongo_client = MongoClient(mongo_uri)
        self.db = self.mongo_client[database_name]
        self.collection = self.db.job_descriptions

        # Initialize TF-IDF vectorizer for similarity matching
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 3),
            lowercase=True
        )
        # Populated lazily by load_and_vectorize_jobs()
        self.job_vectors = None
        self.job_documents = []

    @staticmethod
    def _as_text(value: Any, default: str = "") -> str:
        """Coerce a scalar from LLM JSON / a DB document to a stripped ``str``.

        ``None`` and blank values become ``default``.
        """
        if value is None:
            return default
        text = str(value).strip()
        return text if text else default

    @staticmethod
    def _as_list(value: Any) -> List[str]:
        """Coerce a field from LLM JSON / a DB document to a list of non-empty strings.

        ``None`` becomes ``[]``; a bare scalar becomes a one-element list.
        """
        if value is None:
            return []
        if not isinstance(value, (list, tuple, set)):
            value = [value]
        items = []
        for item in value:
            if item is None:
                continue
            text = str(item).strip()
            if text:
                items.append(text)
        return items

    def _build_job_text(self, job: Dict[str, Any]) -> str:
        """Combine the matching-relevant fields of one job into a single string."""
        parts = [
            self._as_text(job.get('title')),
            self._as_text(job.get('category')),
            *self._as_list(job.get('technical_skills')),
            *self._as_list(job.get('soft_skills')),
            *self._as_list(job.get('responsibilities')),
            *self._as_list(job.get('keywords')),
            self._as_text(job.get('job_summary')),
        ]
        return ' '.join(part for part in parts if part)

    def load_and_vectorize_jobs(self):
        """Load jobs from MongoDB and create TF-IDF vectors.

        Returns:
            True when jobs were loaded and vectorized; False when the
            collection is empty or any error occurred (the error is logged
            and shown via ``st.error``).
        """
        try:
            # Fetch all jobs from MongoDB
            jobs = list(self.collection.find({}, {"_id": 0}))

            if not jobs:
                st.error("No jobs found in database. Please run Task 2 first.")
                return False

            # Create text documents for vectorization; fields that are
            # missing or null on a document are simply skipped.
            self.job_documents = [
                {'job_data': job, 'text': self._build_job_text(job)}
                for job in jobs
            ]

            # Create TF-IDF vectors
            job_texts = [doc['text'] for doc in self.job_documents]
            self.job_vectors = self.vectorizer.fit_transform(job_texts)

            logger.info(f"Loaded and vectorized {len(jobs)} jobs")
            return True

        except Exception as e:
            logger.error(f"Error loading jobs: {e}")
            st.error(f"Error loading jobs from database: {e}")
            return False

    def process_resume_with_llm(self, resume_text: str) -> Optional["ProcessedResume"]:
        """Process resume using LLM to extract structured information.

        Args:
            resume_text: Plain text of the resume.

        Returns:
            A ``ProcessedResume``, or None when the text is empty, the model
            call fails, or the reply is not a JSON object.
        """
        try:
            # Extraction helpers return "" on failure; do not spend an API
            # call (and invite invented data) on an empty resume.
            if not resume_text or not resume_text.strip():
                logger.error("Error processing resume with LLM: resume text is empty")
                return None

            prompt = f"""
            Analyze the following resume and extract structured information. 
            Return the information in JSON format with the following exact keys:

            Resume Text:
            {resume_text}

            Extract and return JSON with these keys:
            {{
                "name": "full name of the person",
                "email": "email address",
                "phone": "phone number",
                "location": "city, state or location",
                "summary": "professional summary or objective",
                "experience_years": "total years of experience or estimate",
                "education": ["degree", "university", "certifications"],
                "technical_skills": ["programming languages", "tools", "technologies"],
                "soft_skills": ["communication", "leadership", "teamwork"],
                "work_experience": ["job titles", "companies", "key achievements"],
                "certifications": ["professional certifications", "licenses"],
                "keywords": ["relevant keywords for job matching"]
            }}

            Guidelines:
            - Extract only information that is explicitly mentioned
            - For technical_skills, focus on hard skills, tools, and technologies
            - For keywords, include important terms that would help in job matching
            - If information is not available, use empty array [] or "Not specified"
            - Return only valid JSON, no additional text
            """

            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an expert resume parser. Extract structured information and return only valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1500,
                temperature=0.3
            )

            # Parse the response (content can be None on an empty reply)
            extracted_text = (response.choices[0].message.content or "").strip()

            # Clean JSON response: keep only the outermost {...} span so
            # surrounding prose or code fences do not break json.loads.
            start_idx = extracted_text.find('{')
            end_idx = extracted_text.rfind('}')
            if start_idx != -1 and end_idx != -1:
                extracted_text = extracted_text[start_idx:end_idx + 1]

            extracted_data = json.loads(extracted_text)
            if not isinstance(extracted_data, dict):
                raise ValueError("LLM response is not a JSON object")

            # Create ProcessedResume object. Values are coerced because the
            # model may return null, a number, or a bare string where a
            # string / list is expected, which would crash later matching.
            return ProcessedResume(
                name=self._as_text(extracted_data.get('name'), 'Not specified'),
                email=self._as_text(extracted_data.get('email'), 'Not specified'),
                phone=self._as_text(extracted_data.get('phone'), 'Not specified'),
                location=self._as_text(extracted_data.get('location'), 'Not specified'),
                summary=self._as_text(extracted_data.get('summary')),
                experience_years=self._as_text(extracted_data.get('experience_years'), 'Not specified'),
                education=self._as_list(extracted_data.get('education')),
                technical_skills=self._as_list(extracted_data.get('technical_skills')),
                soft_skills=self._as_list(extracted_data.get('soft_skills')),
                work_experience=self._as_list(extracted_data.get('work_experience')),
                certifications=self._as_list(extracted_data.get('certifications')),
                keywords=self._as_list(extracted_data.get('keywords')),
                resume_text=resume_text,
                processed_at=datetime.now().isoformat()
            )

        except Exception as e:
            logger.error(f"Error processing resume with LLM: {e}")
            return None

    def find_matching_jobs(self, processed_resume: "ProcessedResume", top_k: int = 10) -> List["JobMatch"]:
        """Find matching jobs using RAG approach.

        Args:
            processed_resume: Parsed resume to match.
            top_k: Maximum number of matches to return.

        Returns:
            Up to ``top_k`` ``JobMatch`` objects ordered by descending
            similarity; an empty list when ``top_k`` is not positive, no jobs
            could be loaded, or an error occurred.
        """
        try:
            if processed_resume is None:
                logger.error("Error finding matching jobs: no processed resume given")
                return []
            # A negative top_k would otherwise slice from the wrong end.
            if top_k <= 0:
                return []

            if self.job_vectors is None and not self.load_and_vectorize_jobs():
                return []

            technical_skills = self._as_list(processed_resume.technical_skills)

            # Create resume text for similarity matching
            resume_terms = [
                self._as_text(processed_resume.summary),
                *technical_skills,
                *self._as_list(processed_resume.soft_skills),
                *self._as_list(processed_resume.work_experience),
                *self._as_list(processed_resume.keywords),
            ]
            resume_text = ' '.join(term for term in resume_terms if term)

            # Vectorize resume
            resume_vector = self.vectorizer.transform([resume_text])

            # Calculate cosine similarity
            similarities = cosine_similarity(resume_vector, self.job_vectors).flatten()

            # Get top matches
            top_indices = np.argsort(similarities)[::-1][:top_k]

            # Loop-invariant: the resume's skills do not change per job.
            resume_skills = {skill.lower() for skill in technical_skills}

            matches = []
            for idx in top_indices:
                job_data = self.job_documents[idx]['job_data']
                # Plain float so the value matches the dataclass annotation.
                similarity_score = float(similarities[idx])

                # Calculate matching and missing skills. Sorting makes the
                # output (and the 5-item cut below) reproducible instead of
                # depending on set iteration order.
                job_skills = {
                    skill.lower() for skill in self._as_list(job_data.get('technical_skills'))
                }
                matching_skills = sorted(resume_skills & job_skills)
                missing_skills = sorted(job_skills - resume_skills)

                # Generate match reasons
                match_reasons = self._generate_match_reasons(
                    processed_resume, job_data, similarity_score, matching_skills
                )

                matches.append(JobMatch(
                    job_id=job_data.get('job_id', ''),
                    title=job_data.get('title', ''),
                    category=job_data.get('category', ''),
                    company_type=job_data.get('company_type', ''),
                    location=job_data.get('location', ''),
                    similarity_score=similarity_score,
                    matching_skills=matching_skills,
                    missing_skills=missing_skills[:5],  # At most 5 missing skills
                    job_summary=job_data.get('job_summary', ''),
                    salary_range=job_data.get('salary_range', 'Not specified'),
                    match_reasons=match_reasons
                ))

            return matches

        except Exception as e:
            logger.error(f"Error finding matching jobs: {e}")
            return []

    def _generate_match_reasons(self, resume: "ProcessedResume", job: Dict,
                              similarity_score: float, matching_skills: List[str]) -> List[str]:
        """Generate human-readable match reasons.

        Args:
            resume: Parsed resume (``keywords`` and ``location`` are read).
            job: Job document (``category`` and ``location`` are read).
            similarity_score: Cosine similarity for this resume/job pair.
            matching_skills: Skills shared by resume and job.

        Returns:
            At most 4 reasons, in the order: score, skills, category, location.
        """
        reasons = []

        # Similarity score reason
        if similarity_score > 0.3:
            reasons.append(f"High compatibility score ({similarity_score:.2%})")

        # Skill matches
        if matching_skills:
            skill_str = ', '.join(matching_skills[:3])
            reasons.append(f"Matching skills: {skill_str}")

        # Category match. Blank keywords are dropped first: "" is a substring
        # of every string and would make this reason fire for every job.
        job_category = self._as_text(job.get('category'))
        category_lower = job_category.lower()
        resume_keywords = [kw.lower() for kw in self._as_list(resume.keywords)]
        if category_lower and any(keyword in category_lower for keyword in resume_keywords):
            reasons.append(f"Relevant experience in {job_category}")

        # Location preference. An empty or placeholder resume location must
        # not count as a match (same empty-substring pitfall as above).
        resume_location = self._as_text(resume.location).lower()
        job_location = self._as_text(job.get('location')).lower()
        if resume_location and resume_location != 'not specified' and resume_location in job_location:
            reasons.append("Location preference match")

        return reasons[:4]  # Return top 4 reasons


        

In [25]:
import os
# Read the OpenAI key from the environment (see the load_dotenv() cell above);
# os.getenv returns None when the variable is not set.
api_key = os.getenv("OPENAI_API_KEY")

In [29]:

# Mock OpenAI API response
class MockOpenAIResponse:
    """Stand-in for a chat-completion response; exposes ``choices[0].message.content``."""

    def __init__(self):
        self.choices = [MockChoice()]

class MockChoice:
    """Single completion choice holding a ``message``."""

    def __init__(self):
        self.message = MockMessage()

class MockMessage:
    """Message whose ``content`` is a fixed JSON document describing a sample resume."""

    def __init__(self):
        self.content = json.dumps({
            "name": "John Doe",
            "email": "john.doe@email.com",
            "phone": "(123) 456-7890",
            "location": "San Francisco, CA",
            "summary": "Experienced software engineer with 5 years in Python and Java development.",
            "experience_years": "5 years",
            "education": ["B.S. Computer Science, Stanford University"],
            "technical_skills": ["Python", "Java", "SQL", "Docker"],
            "soft_skills": ["Communication", "Teamwork"],
            "work_experience": ["Software Engineer at Tech Corp, Developed web applications"],
            "certifications": ["AWS Certified Developer"],
            "keywords": ["software engineer", "Python", "Java", "web development"]
        })

# Mock OpenAI client (correct version)
class MockOpenAI:
    """Stand-in for the OpenAI client; only ``chat.completions.create`` is provided."""

    def __init__(self, api_key=None):
        # The key is accepted for signature compatibility and ignored.
        self.chat = MockChat()

class MockChat:
    """Namespace object exposing ``completions``."""

    def __init__(self):
        self.completions = MockCompletions()

class MockCompletions:
    """Returns the canned response regardless of the request."""

    def create(self, model=None, messages=None, max_tokens=None, temperature=None, **kwargs):
        """Return a ``MockOpenAIResponse``.

        All arguments are optional and ignored, so the mock does not break
        when the code under test adds or omits request options.
        """
        return MockOpenAIResponse()

# Mock MongoDB collection: fixture documents served by MockMongoCollection.find
sample_jobs = [
    {
        "job_id": "1",
        "title": "Senior Software Engineer",
        "category": "Software Development",
        "company_type": "Tech Startup",
        "location": "San Francisco, CA",
        "technical_skills": ["Python", "Java", "AWS"],
        "soft_skills": ["Leadership", "Communication"],
        "responsibilities": ["Develop web applications", "Lead team"],
        "keywords": ["software engineer", "Python", "AWS"],
        "job_summary": "Develop and maintain web applications using Python and AWS.",
        "salary_range": "$120,000 - $150,000"
    },
    {
        "job_id": "2",
        "title": "Data Scientist",
        "category": "Data Science",
        "company_type": "Enterprise",
        "location": "New York, NY",
        "technical_skills": ["Python", "R", "TensorFlow"],
        "soft_skills": ["Problem-solving", "Communication"],
        "responsibilities": ["Analyze data", "Build ML models"],
        "keywords": ["data scientist", "Python", "machine learning"],
        "job_summary": "Build and deploy machine learning models.",
        "salary_range": "$100,000 - $130,000"
    }
]

# Mock MongoDB client
class MockMongoCollection:
    """In-memory stand-in for the ``job_descriptions`` collection."""

    def find(self, query=None, projection=None):
        """Return every sample job; ``query`` and ``projection`` are ignored.

        Each call returns fresh copies (including the list-valued fields) so
        code under test cannot corrupt the shared ``sample_jobs`` fixture.
        """
        return [
            {
                key: list(value) if isinstance(value, list) else value
                for key, value in job.items()
            }
            for job in sample_jobs
        ]

class MockMongoDB:
    """Stand-in for the Mongo client: indexing by any database name returns the same mock DB."""

    def __init__(self, uri):
        # The URI is accepted for signature compatibility and ignored.
        self.db = MockDB()

    def __getitem__(self, name):
        return self.db

class MockDB:
    """Mock database exposing the ``job_descriptions`` collection."""

    def __init__(self):
        self.job_descriptions = MockMongoCollection()

# RAGJobMatcher variant that talks to the in-memory mocks
class TestRAGJobMatcher(RAGJobMatcher):
    """RAGJobMatcher wired to the mock OpenAI and MongoDB objects.

    The parent constructor is not called, so no real clients are created;
    the same attributes are populated by hand instead.
    """

    def __init__(self):
        # Matching state starts empty until jobs are loaded.
        self.job_documents = []
        self.job_vectors = None

        # Same TF-IDF settings as the parent class.
        tfidf_options = dict(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 3),
            lowercase=True,
        )
        self.vectorizer = TfidfVectorizer(**tfidf_options)

        # Mock storage and mock LLM client.
        self.mongo_client = MockMongoDB("dummy_uri")
        self.db = self.mongo_client["recruitment_platform"]
        self.collection = self.db.job_descriptions
        self.client = MockOpenAI("dummy_key")

# Resume fed to the (mocked) LLM parser
sample_resume_text = """
John Doe
john.doe@email.com
(123) 456-7890
San Francisco, CA
Summary: Experienced software engineer with 5 years in Python and Java development.
Education: B.S. Computer Science, Stanford University
Technical Skills: Python, Java, SQL, Docker
Soft Skills: Communication, Teamwork
Work Experience: Software Engineer at Tech Corp, Developed web applications
Certifications: AWS Certified Developer
"""

# Matcher backed by the mocks
matcher = TestRAGJobMatcher()

# Step 1: parse the resume into structured fields
processed_resume = matcher.process_resume_with_llm(sample_resume_text)
print("Processed Resume:", processed_resume, sep="\n")

# Step 2: load the sample jobs and fit the TF-IDF model
matcher.load_and_vectorize_jobs()

# Step 3: rank the jobs against the resume and show each match
job_matches = matcher.find_matching_jobs(processed_resume, top_k=2)
print("\nJob Matches:")
for match in job_matches:
    print(
        f"Title: {match.title}, Score: {match.similarity_score:.2%}",
        f"Matching Skills: {match.matching_skills}",
        f"Missing Skills: {match.missing_skills}",
        f"Match Reasons: {match.match_reasons}",
        "---",
        sep="\n",
    )

INFO:__main__:Loaded and vectorized 2 jobs


Processed Resume:
ProcessedResume(name='John Doe', email='john.doe@email.com', phone='(123) 456-7890', location='San Francisco, CA', summary='Experienced software engineer with 5 years in Python and Java development.', experience_years='5 years', education=['B.S. Computer Science, Stanford University'], technical_skills=['Python', 'Java', 'SQL', 'Docker'], soft_skills=['Communication', 'Teamwork'], work_experience=['Software Engineer at Tech Corp, Developed web applications'], certifications=['AWS Certified Developer'], keywords=['software engineer', 'Python', 'Java', 'web development'], resume_text='\nJohn Doe\njohn.doe@email.com\n(123) 456-7890\nSan Francisco, CA\nSummary: Experienced software engineer with 5 years in Python and Java development.\nEducation: B.S. Computer Science, Stanford University\nTechnical Skills: Python, Java, SQL, Docker\nSoft Skills: Communication, Teamwork\nWork Experience: Software Engineer at Tech Corp, Developed web applications\nCertifications: AWS Certi

In [30]:
class ReportGenerator:
    """Generate PDF and CSV reports of job matches."""

    # Column order of the CSV report; also the header written when there are
    # no matches at all.
    CSV_COLUMNS = [
        'Job Title',
        'Category',
        'Company Type',
        'Location',
        'Compatibility Score',
        'Salary Range',
        'Matching Skills',
        'Missing Skills',
        'Match Reasons',
        'Job Summary',
    ]

    @staticmethod
    def generate_pdf_report(matches: List["JobMatch"], resume_name: str) -> bytes:
        """Generate PDF report of job matches.

        Args:
            matches: Job matches to list, in display order.
            resume_name: Candidate name shown in the report title.

        Returns:
            The rendered PDF document as bytes.
        """
        # Function-scope import keeps this cell self-contained.
        from xml.sax.saxutils import escape

        def esc(value) -> str:
            # Paragraph text is parsed as XML-like markup, so characters such
            # as & and < coming from job/resume data (e.g. "R&D", "C++ <3")
            # must be escaped or the paragraph fails to parse.
            return escape("" if value is None else str(value))

        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []

        # Title
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=18,
            spaceAfter=30,
            textColor=colors.darkblue
        )
        story.append(Paragraph(f"Job Recommendations for {esc(resume_name)}", title_style))
        story.append(Spacer(1, 20))

        # Summary
        summary_text = f"Generated on: {datetime.now().strftime('%B %d, %Y')}<br/>Total Matches: {len(matches)}"
        story.append(Paragraph(summary_text, styles['Normal']))
        story.append(Spacer(1, 20))

        # Job matches
        for i, match in enumerate(matches, 1):
            # Job header
            job_header = f"{i}. {esc(match.title)} - {esc(match.company_type)}"
            story.append(Paragraph(job_header, styles['Heading2']))

            # Job details
            details = f"""
            <b>Category:</b> {esc(match.category)}<br/>
            <b>Location:</b> {esc(match.location)}<br/>
            <b>Compatibility:</b> {match.similarity_score:.1%}<br/>
            <b>Salary Range:</b> {esc(match.salary_range)}<br/>
            """
            story.append(Paragraph(details, styles['Normal']))

            # Match reasons
            if match.match_reasons:
                reasons_text = "<b>Why this matches:</b><br/>" + "<br/>".join(
                    f"• {esc(reason)}" for reason in match.match_reasons
                )
                story.append(Paragraph(reasons_text, styles['Normal']))

            # Skills (first 5 only, to keep each entry compact)
            if match.matching_skills:
                shown_skills = ', '.join(esc(skill) for skill in match.matching_skills[:5])
                skills_text = f"<b>Matching Skills:</b> {shown_skills}"
                story.append(Paragraph(skills_text, styles['Normal']))

            story.append(Spacer(1, 15))

        doc.build(story)
        return buffer.getvalue()

    @staticmethod
    def generate_csv_report(matches: List["JobMatch"]) -> str:
        """Generate CSV report of job matches.

        Args:
            matches: Job matches to export, one row each.

        Returns:
            CSV text with a header row (present even when ``matches`` is
            empty). Job summaries longer than 200 characters are truncated
            and suffixed with '...'.
        """
        rows = []
        for match in matches:
            job_summary = match.job_summary or ''
            if len(job_summary) > 200:
                job_summary = job_summary[:200] + '...'

            rows.append({
                'Job Title': match.title,
                'Category': match.category,
                'Company Type': match.company_type,
                'Location': match.location,
                'Compatibility Score': f"{match.similarity_score:.1%}",
                'Salary Range': match.salary_range,
                'Matching Skills': '; '.join(match.matching_skills or []),
                'Missing Skills': '; '.join(match.missing_skills or []),
                'Match Reasons': '; '.join(match.match_reasons or []),
                'Job Summary': job_summary
            })

        # Explicit columns keep the header stable for an empty result set.
        df = pd.DataFrame(rows, columns=ReportGenerator.CSV_COLUMNS)
        return df.to_csv(index=False)



In [31]:
# Test ReportGenerator: render both report formats for the matches found above
pdf_data = ReportGenerator.generate_pdf_report(job_matches, "John Doe")
with open("job_matches.pdf", "wb") as f:
    f.write(pdf_data)
print("PDF report generated: job_matches.pdf")

csv_data = ReportGenerator.generate_csv_report(job_matches)
# newline="" stops text mode from translating the line endings that are
# already in the CSV string; an explicit UTF-8 encoding avoids failures on
# platforms whose default encoding cannot represent every character.
with open("job_matches.csv", "w", newline="", encoding="utf-8") as f:
    f.write(csv_data)
print("CSV report generated: job_matches.csv")

PDF report generated: job_matches.pdf
CSV report generated: job_matches.csv


In [None]:
# for testing the MongoDB connection
from pymongo import MongoClient
from pymongo.server_api import ServerApi
import certifi
from dotenv import load_dotenv
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
mongo_uri = os.getenv("MONGO_URI")
database_name = os.getenv("DATABASE_NAME", "recruitment_platform")

# Log only whether each secret is present, never its value.
logger.info(f"OPENAI_API_KEY: {'set' if openai_api_key else 'not set'}")
logger.info(f"MONGO_URI: {'set' if mongo_uri else 'not set'}")
logger.info(f"DATABASE_NAME: {database_name}")

if not openai_api_key or not mongo_uri:
    logger.error("Missing required credentials.")
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel itself to exit, whereas an exception only fails this cell.
    raise RuntimeError("Missing required credentials: set OPENAI_API_KEY and MONGO_URI.")

# Build the client inside the try block so that a malformed URI is reported
# through the same error path as a failed ping.
client = None
try:
    client = MongoClient(mongo_uri, server_api=ServerApi('1'), tls=True, tlsCAFile=certifi.where())
    client.admin.command('ping')
    logger.info("Successfully connected to MongoDB!")
    databases = client.list_database_names()
    logger.info(f"Databases: {databases}")
except Exception as e:
    logger.error(f"Error connecting to MongoDB: {e}")
finally:
    # Only close a client that was actually created.
    if client is not None:
        client.close()

INFO:__main__:OPENAI_API_KEY: set
INFO:__main__:MONGO_URI: set
INFO:__main__:DATABASE_NAME: recruitment_platform
INFO:__main__:Successfully connected to MongoDB!
INFO:__main__:Databases: ['recruitment_platform', 'admin', 'local']
