In [7]:
import re
import pandas as pd
from collections import Counter
import docx
import skills_extraction as skills_extraction

In [8]:
# Load the structured job-description dataset and preview its first rows.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable. The preview also shows an "Unnamed: 0" column, which suggests
# the CSV was saved with its index -- confirm before relying on it.
JD_DATA_PATH = r'D:/ML_Projects/Job_Reccomendation_System/src/data/jd_structured_data.csv'
jd_df = pd.read_csv(JD_DATA_PATH)
jd_df.head()

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD
0,Big Data Specialist,3.3,Astra International,Tebing Tinggi,Bandung,1001 to 5000 employees,2000,Company - Corporation (C Corp),Automotive,Transportation,"Blue Bird Group, Garuda Indonesia",2273,34075,We are looking for a Big Data Specialist skill...
1,Web Developer,4.7,Alfamart,Parepare,Medan,1001 to 5000 employees,1995,Company - Non-profit,Retail,Retail,"Matahari Department Store, Indomaret",359,6156,We need a Web Developer proficient in JavaScri...
2,Business Intelligence Analyst,3.1,Traveloka,Depok,Surabaya,10001+ employees,2011,Company - Government,E-commerce,Technology,"Tokopedia, Bukalapak, GoJek",1230,18397,We need a Business Intelligence Analyst with s...
3,JavaScript Developer,3.7,Bank Central Asia (BCA),Tanjungpinang,Yogyakarta,1001 to 5000 employees,1999,Company - S Corporation (S Corp),Banking,Finance,"Bank Negara Indonesia (BNI), Bank Mandiri, Ban...",84,1220,Hiring a JavaScript Developer proficient in An...
4,Interaction Designer,3.9,Tokopedia,Jakarta,Jakarta,5001 to 10000 employees,2004,Company - Joint Venture,E-commerce,Technology,"GoJek, Bukalapak, Traveloka",4917,73314,We are looking for an Interaction Designer ski...


In [9]:
def extract_text_from_docx(file_path):
    """Return the text of a DOCX file, one line per paragraph.

    Opens ``file_path`` with ``docx.Document`` and joins the ``text`` of every
    entry in ``doc.paragraphs`` with newline characters. Only what
    ``doc.paragraphs`` exposes is included.
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

In [10]:
def extract_keywords_from_position(position, jd_df):
    """Return the list of skills recorded for a job title.

    Looks up the rows of ``jd_df`` whose ``'Job Title'`` equals ``position``
    and splits the first non-missing ``'Skills'`` value on commas.

    Args:
        position: Job title to look up (exact, case-sensitive match).
        jd_df: DataFrame with ``'Job Title'`` and ``'Skills'`` columns, where
            ``'Skills'`` holds a comma-separated string.

    Returns:
        A list of stripped, non-empty skill strings, or ``[]`` when no row
        matches or every matching row has a missing ``'Skills'`` value.

    Raises:
        KeyError: if ``jd_df`` lacks the ``'Job Title'`` or ``'Skills'`` column.
    """
    # NOTE(review): confirm the frame passed in actually carries a 'Skills'
    # column; it is required here and a missing column raises KeyError.
    matching_skills = jd_df.loc[jd_df['Job Title'] == position, 'Skills'].dropna()

    # Test emptiness explicitly: the truth value of an array/Series is
    # ambiguous as soon as several rows share the same job title.
    if matching_skills.empty:
        return []

    first_entry = str(matching_skills.iloc[0])
    stripped = (skill.strip() for skill in first_entry.split(','))
    # Drop empty tokens produced by stray or trailing commas.
    return [skill for skill in stripped if skill]

In [4]:
def review_cv(file_path, job_position, jd_df):
    """Grade a resume on section coverage and on the skills a job requires.

    The resume text is read from a PDF (via
    ``skills_extraction.extract_text_from_pdf``) or a DOCX (via
    ``extract_text_from_docx``). One point is awarded for each standardized
    section whose heading (or any alias of it) occurs in the text, and one
    point for each distinct required keyword that occurs in the text. The
    grade is the percentage of available points earned, so it always lies in
    the range 0-100.

    Args:
        file_path: Path to the resume; must end in ``.pdf`` or ``.docx``
            (the extension is matched case-insensitively).
        job_position: Job title passed to ``extract_keywords_from_position``.
        jd_df: Job-description DataFrame passed to
            ``extract_keywords_from_position``.

    Returns:
        ``(standardized_sections, grade)`` on success, where
        ``standardized_sections`` maps each standardized section name to 1
        (present) or 0 (absent) and ``grade`` is a float percentage.
        ``(None, message)`` when the file format is unsupported or any
        exception is raised while processing; ``message`` is the error text.
    """
    try:
        # Extract text from the PDF or DOCX resume. Compare a lower-cased copy
        # of the path so upper-case extensions such as ".PDF" are accepted.
        lowered_path = file_path.lower()
        if lowered_path.endswith('.pdf'):
            resume_text = skills_extraction.extract_text_from_pdf(file_path)
        elif lowered_path.endswith('.docx'):
            resume_text = extract_text_from_docx(file_path)
        else:
            return None, "Unsupported file format"

        # Required keywords for the job position, de-duplicated
        # case-insensitively so a repeated skill is not scored twice.
        required_keywords = []
        seen_keywords = set()
        for keyword in extract_keywords_from_position(job_position, jd_df):
            cleaned = keyword.strip()
            normalized = cleaned.lower()
            if normalized and normalized not in seen_keywords:
                seen_keywords.add(normalized)
                required_keywords.append(cleaned)

        # Each keyword is worth at most one point. Matching is
        # case-insensitive and on whole terms (not whitespace tokens), so
        # "python," and multi-word skills like "Machine Learning" count.
        keyword_score = sum(
            1
            for keyword in required_keywords
            if re.search(
                r'(?<!\w)' + re.escape(keyword) + r'(?!\w)',
                resume_text,
                re.IGNORECASE,
            )
        )

        # Standardize section names: several headings may map to one section.
        section_mapping = {
            "About": "About me",
            "Education": "Educations",
            "Professional Experience": "Working Experience",
            "Organization Experience": "Organizational Experience",
            "Committee Experience": "Organizational Experience",
            "Projects": "Project",
            "Skill": "Skills",
            "Key Competencies": "Skills",
            "Courses": "Course"
        }
        standardized_sections = {}
        for heading, standard_name in section_mapping.items():
            found = re.search(re.escape(heading), resume_text, re.IGNORECASE) is not None
            # Keep a section marked present once any alias matched; a later
            # alias that is absent must not reset it to 0.
            already_found = standardized_sections.get(standard_name, 0)
            standardized_sections[standard_name] = 1 if (found or already_found) else 0

        # Overall score. The maximum counts distinct standardized sections
        # (not aliases), so a resume with everything present scores 100.
        total_score = sum(standardized_sections.values()) + keyword_score
        max_score = len(standardized_sections) + len(required_keywords)
        grade = (total_score / max_score) * 100

        return standardized_sections, grade

    except Exception as e:
        # Best-effort contract kept for callers: report the failure as text.
        return None, str(e)