In [4]:
import json
import os
import re
from datetime import datetime

import PyPDF2
from fuzzywuzzy import fuzz
from google.colab import files, userdata
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from langdetect import detect
from openai import OpenAI

# Initialize the OpenAI client with the API key stored in Colab's Secrets panel.
# The secret is looked up under the name 'OpenAi', so the help message below
# must quote that exact name.
try:
    api_key = userdata.get('OpenAi')
    if not api_key:
        raise ValueError("API key not found in Colab secrets.")
    client = OpenAI(api_key=api_key)
except Exception as e:
    print(f"Error accessing OpenAI API key: {e}")
    print("Please store your API key in Colab's Secrets as 'OpenAi'.")
    # Raise SystemExit directly instead of calling the interactive exit()
    # helper: it stops the rest of this cell/script from running without
    # depending on a helper that only exists in interactive sessions.
    raise SystemExit(1)

def read_pdf(file_path):
    """Return the concatenated text of all pages in the PDF at *file_path*.

    Pages whose extraction yields nothing are skipped; every kept page is
    followed by a newline. On any failure the error is printed and None is
    returned.
    """
    try:
        with open(file_path, 'rb') as pdf_handle:
            pdf = PyPDF2.PdfReader(pdf_handle)
            page_texts = (page.extract_text() for page in pdf.pages)
            # Consumed inside the `with` so the file is still open.
            return "".join(chunk + "\n" for chunk in page_texts if chunk)
    except Exception as err:
        print(f"Error reading PDF: {err}")
        return None

def read_txt(file_path):
    """Return the full contents of a UTF-8 text file.

    On any failure the error is printed and None is returned.
    """
    contents = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as err:
        print(f"Error reading text file: {err}")
    return contents

def detect_and_transliterate(text):
    """Detect the language of *text* and romanize it when it is Hindi.

    Returns a ``(text, language_code)`` tuple. Text detected as Hindi ('hi')
    is converted from Devanagari to the ITRANS Latin scheme; any other text is
    returned unchanged. If detection or transliteration raises, the error is
    printed and the original text is returned with the language code 'en'.
    """
    try:
        detected = detect(text)
        print(f"Detected language: {detected}")
        if detected != 'hi':
            return text, detected
        # Hindi: convert Devanagari script to Latin (ITRANS).
        latin_text = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
        return latin_text, detected
    except Exception as err:
        print(f"Language detection/transliteration error: {err}")
        # Fall back to treating the text as English.
        return text, 'en'

def _parse_llm_json(raw_reply):
    """Return the JSON object embedded in the model's reply as a dict.

    The reply may be wrapped in a Markdown code fence or surrounded by stray
    text, so only the span from the first '{' to the last '}' is parsed.
    Raises ValueError (json.JSONDecodeError is a subclass) when no JSON
    object can be decoded.
    """
    start = raw_reply.find('{')
    end = raw_reply.rfind('}')
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")
    # NOTE: the model output is untrusted text. It is parsed with json.loads,
    # never eval(), so it cannot execute code and JSON literals such as
    # null/true/false are handled correctly.
    return json.loads(raw_reply[start:end + 1])


def _string_list(value):
    """Coerce a decoded JSON field to a list of non-empty strings."""
    if not isinstance(value, list):
        return []
    cleaned = (str(item).strip() for item in value if item is not None)
    return [item for item in cleaned if item]


def _match_required_skills(candidate_skills, required_skills):
    """Keep the candidate skills that fuzzy-match (ratio > 80) a required skill."""
    wanted = [rs.lower() for rs in required_skills]
    return [
        skill for skill in candidate_skills
        if any(fuzz.ratio(skill.lower(), rs) > 80 for rs in wanted)
    ]


def _extract_with_regex(processed_text, required_skills, details):
    """Fill *details* in place using regex heuristics (fallback when the LLM path fails)."""
    # Name: first of the top five lines that looks like "First Last"
    # (English or transliterated Hindi).
    name_pattern = r'^[A-Z][a-z]+(?: [A-Z]\.)? [A-Z][a-z]+$|^[a-zA-Z]+ [a-zA-Z]+$'
    for line in processed_text.split('\n')[:5]:
        line = line.strip()
        if re.match(name_pattern, line):
            details["name"] = line
            break

    # Skills: text after a "SKILLS"/"kaushal" heading up to the next ALL-CAPS
    # heading, blank line or end of text. Only the heading is matched
    # case-insensitively; the [A-Z]{2,} lookahead must stay case-sensitive or
    # any line starting with two letters would end the section.
    skills_section = re.search(
        r'(?i:SKILLS|kaushal)[ \t]*:?[ \t]*\n([\s\S]*?)(?=\n[A-Z]{2,}|\n{2,}|$)',
        processed_text,
    )
    if skills_section:
        # group(1) is the section body (the heading itself is not captured).
        pieces = re.split(r'\n\s*[-•]?\s*|\s*,\s*', skills_section.group(1))
        skill_candidates = [p.strip().lstrip('-•').strip() for p in pieces]
        skill_candidates = [s for s in skill_candidates if s]
        details["all_skills"] = skill_candidates
        details["skills"] = _match_required_skills(skill_candidates, required_skills)

    # Education: a line is only treated as education when it contains a degree
    # keyword (English or ITRANS-transliterated Hindi). The rest of the line is
    # the institution/field, optionally ending in a year or year range.
    education_pattern = (
        r"(?im)\b(Bachelor(?:'?s)?|Master(?:'?s)?|PhD|B\.S\.|M\.S\.|Ph\.D\.|Associate(?:'?s)?"
        r"|bI\.es\.|baichelor|maastar|pi\. eich\. DI\.)"
        r"[ \t]*([^\n]*?)[ \t]*"
        r"(?:(\d{4})[ \t]*(?:[-–—][ \t]*(?:\d{4}|Present))?)?[ \t]*$"
    )
    details["education"] = [
        " ".join(part for part in (degree, inst.strip(" \t,"), year) if part)
        for degree, inst, year in re.findall(education_pattern, processed_text)
    ]

    # Experience: "<start> <sep> <end> [-:] <description>" on one line, where
    # <sep> is a dash or the word "to"/"until". Open-ended periods are
    # normalized to "Present" so every entry follows 'YYYY-YYYY|Present: text'.
    experience_pattern = (
        r'\b(\d{4})\s*(?:[-–—]|to|until)\s*(\d{4}|present|current)\b'
        r'\s*[-:]\s*([^\n]*)'
    )
    entries = []
    for start, end, desc in re.findall(experience_pattern, processed_text, re.IGNORECASE):
        if not end.isdigit():
            end = "Present"
        entries.append(f"{start}-{end}: {desc.strip()}")
    details["experience"] = entries


def extract_resume_details(text, required_skills):
    """Extract resume details with multilingual support (English and Hindi).

    The text is language-detected (Hindi is transliterated to Latin script),
    email and phone are found with regexes, and name/skills/education/
    experience are requested from the OpenAI chat API. If the API call or the
    parsing of its reply fails, regex heuristics are used instead.

    Args:
        text: Raw resume text.
        required_skills: Skill names the job requires; used for fuzzy matching.

    Returns:
        A dict with keys "name", "email", "phone", "skills" (matched against
        *required_skills*), "all_skills", "education", "experience" and
        "language".
    """
    details = {
        "name": None,
        "email": None,
        "phone": None,
        "skills": [],
        "education": [],
        "experience": [],
        "all_skills": [],
        "language": None
    }

    # Detect language and transliterate if needed
    processed_text, lang = detect_and_transliterate(text)
    details["language"] = lang

    # Regex for email and phone (language-agnostic)
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    phone_pattern = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    email_match = re.search(email_pattern, processed_text, re.IGNORECASE)
    phone_match = re.search(phone_pattern, processed_text)
    details["email"] = email_match.group(0) if email_match else None
    details["phone"] = phone_match.group(0) if phone_match else None

    # OpenAI API call using gpt-4o-mini
    try:
        prompt = f"""
        Extract the following details from this resume text and return in JSON format. The resume may be in English or transliterated Hindi (originally in Devanagari script, now in Latin script using ITRANS scheme). Transliterate any Hindi terms back to English-like format if needed for consistency.
        - Name (full name of the candidate, typically at the top)
        - Skills (list all skills found, and highlight those matching: {', '.join(required_skills)})
        - Education (list degrees, institutions, and years, e.g., 'B.S. Computer Science, MIT, 2020'; for Hindi, recognize terms like 'B.S.' or 'Bachelor' even if transliterated, e.g., 'bI.es.' or 'baichelor')
        - Work Experience (list job titles, companies, and time periods strictly in the format 'YYYY-YYYY: Job Title, Company', e.g., '2020-2022: Data Analyst, Google', even if the resume uses different separators like 'to' or 'until')
        Resume text (truncated to first 4000 characters): {processed_text[:4000]}
        Output format:
        ```json
        {{
            "name": "string",
            "skills": ["string"],
            "education": ["string"],
            "experience": ["string"]
        }}
        ```
        """
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a resume parser extracting structured data from English or transliterated Hindi resumes."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=1000,
            temperature=0.2
        )
        llm_data = _parse_llm_json(response.choices[0].message.content)
        all_skills = _string_list(llm_data.get("skills"))
        details["name"] = llm_data.get("name") or details["name"]
        details["skills"] = _match_required_skills(all_skills, required_skills)
        details["all_skills"] = all_skills
        details["education"] = _string_list(llm_data.get("education"))
        details["experience"] = _string_list(llm_data.get("experience"))
    except Exception as e:
        # Any API or parsing failure falls back to the regex heuristics.
        print(f"OpenAI API error: {e}")
        _extract_with_regex(processed_text, required_skills, details)

    print("Extracted experience entries:", details["experience"])
    return details

# "<start year> <separator> <end year | present | current>", where the separator
# is a dash variant or the word "to"/"until". The words must be an alternation:
# inside a character class ("[-–—to]") they would only ever match one letter.
_EXPERIENCE_PERIOD_RE = re.compile(
    r'(\d{4})\s*(?:[-–—]|to|until)\s*(\d{4}|present|current)\b',
    re.IGNORECASE,
)


def calculate_experience_years(experience_list):
    """Calculate total years of experience from experience entries.

    Each entry is expected to look like 'YYYY-YYYY: Job Title, Company'; only
    the part before the first ':' is parsed. An end of "Present"/"Current"
    (any case) counts up to the current calendar year. Entries that cannot be
    parsed are reported and skipped, and periods of zero or negative length
    contribute nothing. Overlapping periods are summed independently.

    Args:
        experience_list: Iterable of experience strings (None is treated as
            empty).

    Returns:
        The summed number of whole years as an int.
    """
    total_years = 0
    current_year = datetime.now().year
    for exp in experience_list or []:
        if not isinstance(exp, str):
            print(f"Error processing experience entry '{exp}': expected a string")
            continue
        period = exp.split(":")[0].strip()
        match = _EXPERIENCE_PERIOD_RE.match(period)
        if not match:
            print(f"Skipping invalid experience period: {period}")
            continue
        start_text, end_text = match.groups()
        start = int(start_text)
        # A non-numeric end is "present"/"current": the job is ongoing.
        end = int(end_text) if end_text.isdigit() else current_year
        years = end - start
        if years > 0:
            total_years += years
    return total_years

# Degree level -> substrings that identify it in a lower-cased education entry.
# All needles are lower case because they are compared against lower-cased
# text; the transliterated (ITRANS) spellings are included in that form.
_DEGREE_ALIASES = {
    "bachelor": ("bachelor", "b.s.", "bi.es.", "baichelor"),
    "master": ("master", "m.s.", "maastar"),
    "phd": ("phd", "ph.d.", "pi. eich. di."),
}

# Fields of study recognised in both the job requirement and the resume.
_STUDY_FIELDS = ("computer science", "statistics", "mathematics")


def check_qualification(details, job_attrs):
    """Compute a confidence score for resume-to-job matching.

    Three sub-scores (each 0-100) are combined with weights 0.4 (skills),
    0.3 (education) and 0.3 (experience); the candidate qualifies when the
    weighted score is above 50. A breakdown is printed.

    Args:
        details: Dict with at least "skills", "education" and "experience"
            lists, as produced by extract_resume_details.
        job_attrs: Dict with "skills" (list), "qualifications" (str) and
            "experience" (str of the form "min-max" years).

    Returns:
        Tuple ``(qualifies, confidence_score, education_pass, experience_pass)``.
    """
    # Skills match score (0-100%). Capped because several extracted skills can
    # fuzzy-match the same required skill.
    required_skills = job_attrs["skills"]
    matched_skills = details["skills"]
    if required_skills:
        skills_score = min(100, (len(matched_skills) / len(required_skills)) * 100)
    else:
        skills_score = 0
    skills_pass = skills_score > 50

    # Education match score (0-100%): best score over all education entries.
    required_degree = job_attrs["qualifications"].lower()
    accepted_aliases = [
        alias
        for level, aliases in _DEGREE_ALIASES.items()
        if level in required_degree
        for alias in aliases
    ]
    accepted_fields = [field for field in _STUDY_FIELDS if field in required_degree]
    education_score = 0
    for edu in details["education"]:
        edu_lower = edu.lower()
        if not any(alias in edu_lower for alias in accepted_aliases):
            continue
        if any(field in edu_lower for field in accepted_fields):
            education_score = 100  # Full match: right level and field
            break
        # Partial match (right degree level, wrong field); keep looking in
        # case a later entry is a full match.
        education_score = max(education_score, 50)
    # A partial match still counts as a pass for qualification purposes.
    education_pass = education_score > 0

    # Experience match score (0-100%)
    required_exp = job_attrs["experience"]
    min_years, max_years = map(int, required_exp.split("-"))
    candidate_exp_years = calculate_experience_years(details["experience"])
    experience_pass = candidate_exp_years >= min_years
    if candidate_exp_years < min_years:
        experience_score = (candidate_exp_years / min_years) * 100
    elif candidate_exp_years <= max_years:
        experience_score = 100
    else:
        # Beyond the range the score decays relative to the range midpoint.
        ideal_years = (min_years + max_years) / 2
        experience_score = min(100, 100 * (ideal_years / candidate_exp_years))
    experience_score = max(0, min(100, experience_score))

    # Overall confidence score (weighted)
    confidence_score = (skills_score * 0.4) + (education_score * 0.3) + (experience_score * 0.3)
    qualifies = confidence_score > 50

    # Detailed output
    print("\nConfidence Score Breakdown:")
    print(f"Skills Score: {skills_score:.1f}% ({len(matched_skills)}/{len(required_skills)} matched) - {'Pass' if skills_pass else 'Fail'} (Requires >50%)")
    print(f"Education Score: {education_score:.1f}% - {'Pass' if education_pass else 'Fail'} (Required: {required_degree}, Found: {', '.join(details['education']) if details['education'] else 'None'})")
    print(f"Experience Score: {experience_score:.1f}% ({candidate_exp_years} years) - {'Pass' if experience_pass else 'Fail'} (Requires at least {min_years} years)")
    print(f"Overall Confidence Score: {confidence_score:.1f}% - {'Pass' if qualifies else 'Fail'} (Requires >50%)")

    return qualifies, confidence_score, education_pass, experience_pass

# Job catalogue keyed by the menu number the applicant types.
_JOB_ROLES = {
    "1": {
        "role": "Data Analyst",
        "qualifications": "Bachelor's degree in Statistics, Mathematics, or Computer Science",
        "skills": ["Python", "SQL", "Data Visualization", "Statistical Analysis"],
        "experience": "2-4"
    },
    "2": {
        "role": "Developer",
        "qualifications": "Bachelor's degree in Computer Science or related field",
        "skills": ["Python", "Java", "JavaScript", "Git", "Agile"],
        "experience": "3-5"
    },
    "3": {
        "role": "Tester",
        "qualifications": "Bachelor's degree in Computer Science or equivalent",
        "skills": ["Selenium", "JUnit", "Test Automation", "Bug Tracking"],
        "experience": "1-3"
    }
}


def _ask_choice(prompt, valid_answers, retry_message):
    """Prompt until the trimmed, lower-cased answer is one of *valid_answers*."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in valid_answers:
            return answer
        print(retry_message)


def _print_profile(details):
    """Print the contact, skills, education and experience lines of a candidate."""
    print(f"Email: {details['email'] or 'Not found'}")
    print(f"Phone: {details['phone'] or 'Not found'}")
    print(f"Skills (matched): {', '.join(details['skills']) if details['skills'] else 'None found'}")
    print(f"Skills (all): {', '.join(details['all_skills']) if details['all_skills'] else 'None found'}")
    print(f"Education: {', '.join(details['education']) if details['education'] else 'Not found'}")
    print(f"Experience: {', '.join(details['experience']) if details['experience'] else 'Not found'}")


def _confirm_contact_method(details):
    """Ask for the preferred contact channel, collecting missing contact data."""
    described = {"email": "an email address", "phone": "a phone number"}
    field_name = {"email": "email address", "phone": "phone number"}

    preferred = _ask_choice(
        "\nHow would you like us to contact you? (email/phone): ",
        ("email", "phone"),
        "Please enter 'email' or 'phone'.",
    )
    if details[preferred]:
        print(f"We will contact you via {preferred} at {details[preferred]}.")
        return

    other = "phone" if preferred == "email" else "email"
    print(f"We couldn't find {described[preferred]} in your resume.")
    alt_contact = _ask_choice(
        f"Would you like to provide {described[preferred]} now, or should we contact you via {other} instead? (provide/{other}): ",
        ("provide", other),
        f"Please enter 'provide' or '{other}'.",
    )
    if alt_contact == other and details[other]:
        print(f"We will contact you via {other} at {details[other]} instead.")
        return
    if alt_contact == other:
        # The alternative channel was requested but is missing as well.
        print(f"We couldn't find {described[other]} either. Please provide a contact method to proceed.")
    details[preferred] = input(f"Please enter your {field_name[preferred]}: ")
    print(f"We will contact you via {preferred} at {details[preferred]}.")


def _rejection_reasons(details, selected_job, confidence_score, education_pass, experience_pass):
    """Return the list of human-readable reasons a candidate did not qualify."""
    required_skills = selected_job['skills']
    # Computed here because check_qualification does not return the skills
    # score; same formula (matched / required, capped at 100%).
    if required_skills:
        skills_score = min(100, (len(details['skills']) / len(required_skills)) * 100)
    else:
        skills_score = 0

    reasons = []
    if skills_score <= 50:
        reasons.append(f"Skills match ({skills_score:.1f}%) is not above 50%.")
    if not education_pass:
        reasons.append("Education does not match the required qualifications.")
    if not experience_pass:
        reasons.append("Experience does not meet the required range.")
    if not reasons:
        # Every individual check passed but the weighted total was too low.
        reasons.append(f"Overall confidence score ({confidence_score:.1f}%) is not above 50%.")
    return reasons


def job_chatbot():
    """Run the interactive job-application flow in a Colab notebook.

    Lets the user pick a role, upload a resume (PDF or .txt), extracts the
    candidate's details, scores them against the role and reports the result.
    The uploaded file is written to the working directory and is always
    deleted again before the function returns.
    """
    print("Hi! I want to apply for a job.")
    see_roles = _ask_choice(
        "Would you like to see available job roles? (yes/no): ",
        ("yes", "no"),
        "Please enter 'yes' or 'no'.",
    )
    if see_roles == "no":
        print("Thank you for your time. Goodbye!")
        return

    print("\nAvailable job roles:")
    for number, job in _JOB_ROLES.items():
        print(f"{number}. {job['role']}")

    job_choice = _ask_choice(
        "\nPlease select a job role (1-3): ",
        tuple(_JOB_ROLES),
        "Invalid choice. Please select 1, 2, or 3.",
    )
    selected_job = _JOB_ROLES[job_choice]
    print(f"\nDetails for {selected_job['role']}:")
    print(f"Qualifications: {selected_job['qualifications']}")
    print(f"Required Skills: {', '.join(selected_job['skills'])}")
    print(f"Years of Experience: {selected_job['experience']} years")

    apply_choice = _ask_choice(
        f"\nWould you like to apply for the {selected_job['role']} position? (yes/no): ",
        ("yes", "no"),
        "Please enter 'yes' or 'no'.",
    )
    if apply_choice == "no":
        print("Thank you for exploring job opportunities with us!")
        return

    print("\nPlease upload your resume (PDF or text file).")
    uploaded = files.upload()
    if not uploaded:
        print("No file selected. Application cancelled.")
        return

    # Only the first uploaded file is considered.
    file_name = next(iter(uploaded))
    file_ext = os.path.splitext(file_name)[1].lower()
    readers = {".pdf": read_pdf, ".txt": read_txt}

    with open(file_name, 'wb') as f:
        f.write(uploaded[file_name])

    try:
        reader = readers.get(file_ext)
        if reader is None:
            print("Error: Please upload a valid PDF or text file.")
            return

        resume_text = reader(file_name)
        if not resume_text:
            print("Error: Could not extract text from the resume.")
            return

        details = extract_resume_details(resume_text, selected_job['skills'])
        if not details["name"]:
            print("Error: Could not extract a valid name from the resume. Please ensure your name is included at the top.")
            return
        if not details["email"]:
            print("Warning: No email address found in the resume.")
        if not details["phone"]:
            print("Warning: No phone number found in the resume.")
        if not details["skills"]:
            print(f"Warning: None of the required skills ({', '.join(selected_job['skills'])}) were found in the resume.")
        if not details["education"]:
            print("Warning: No education details found in the resume.")
        if not details["experience"]:
            print("Warning: No experience details found in the resume.")

        print(f"\nApplication submitted for {selected_job['role']}!")
        print(f"Resume Language: {'Hindi' if details['language'] == 'hi' else 'English'}")
        print(f"Candidate Name: {details['name']}")
        _print_profile(details)
        print(f"Uploaded file: {file_name}")

        # Check qualification with confidence scoring
        qualifies, confidence_score, education_pass, experience_pass = check_qualification(details, selected_job)
        if qualifies:
            print("\nCongratulations! You have qualified for the role.")
            print("\nCandidate Details:")
            print(f"Name: {details['name']}")
            _print_profile(details)
            print(f"Confidence Score: {confidence_score:.1f}%")
            _confirm_contact_method(details)
        else:
            print("\nSorry, you have not qualified for the role.")
            reasons = _rejection_reasons(
                details, selected_job, confidence_score, education_pass, experience_pass
            )
            print("Reasons: " + " ".join(reasons))
    finally:
        # Always remove the temporary copy, including on early returns and
        # unexpected errors.
        if os.path.exists(file_name):
            os.remove(file_name)

    print("Thank you for applying! We'll review your application and get back to you.")

# Run the chatbot only when this file is executed directly (a notebook cell or
# script runs with __name__ == "__main__"), so importing it for reuse does not
# start the interactive session.
if __name__ == "__main__":
    job_chatbot()

Hi! I want to apply for a job.
Would you like to see available job roles? (yes/no): yes

Available job roles:
1. Data Analyst
2. Developer
3. Tester

Please select a job role (1-3): 1

Details for Data Analyst:
Qualifications: Bachelor's degree in Statistics, Mathematics, or Computer Science
Required Skills: Python, SQL, Data Visualization, Statistical Analysis
Years of Experience: 2-4 years

Would you like to apply for the Data Analyst position? (yes/no): yes

Please upload your resume (PDF or text file).


Saving data-analyst2 - Template 18.pdf to data-analyst2 - Template 18 (2).pdf
Detected language: en
Extracted experience entries: ['2021-Present: Data Analyst, Resume Worded', '2019-2021: Data Specialist, Polyhire', '2018-2019: Research Assistant, Growthsi', '2017-2018: Database Coordinator, ABC Company', '2016-2017: Business Analyst, XYZ Company', '2014-2015: Analytics Consultant (Internship), ABC']

Application submitted for Data Analyst!
Resume Language: English
Candidate Name: First Last
Email: rst.last@gmail.com
Phone: +44 1234567890
Skills (matched): Data Visualization, Python, SQL
Skills (all): Data Warehousing, Machine Learning, Survey Data Collection, Data Modeling, Data Mining, SQL Server Reporting, Predictive Analytics, Google BigQuery, Data Visualization, Python, Power BI, SQL, Tableau
Education: B.S. Applied Statistics, University of New York, 2014
Experience: 2021-Present: Data Analyst, Resume Worded, 2019-2021: Data Specialist, Polyhire, 2018-2019: Research Assistant, Gr