<a href="https://www.kaggle.com/code/baharehsamadi/text-classification-jobdescription?scriptVersionId=132561300" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import json
import pandas as pd
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Read the positions data from the CSV file
positions_df = pd.read_csv('/kaggle/input/positionscsv/positions.csv')

# Load the keywords from the JSONL pattern file (one JSON object per line).
# Only records labelled 'COREPHD' contribute; from each of their pattern
# entries the value stored under the 'LOWER' key is collected.
keywords = []
with open(
    '/kaggle/input/jz-skill-patternsfren/jz_skill_patternsFrEn.jsonl',
    encoding='utf-8',  # explicit: do not depend on the platform default encoding
) as json_file:
    for line in json_file:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON; skip it instead of crashing in json.loads.
        if not line.strip():
            continue
        data = json.loads(line)
        # .get() so that a record without a 'label' key is simply ignored.
        if data.get('label') != 'COREPHD':
            continue
        for pattern in data.get('pattern', []):
            keyword = pattern.get('LOWER')
            if keyword:
                keywords.append(keyword)

# Function to extract keywords from job description
def extract_keywords(job_description, keywords):
    """Return the keywords that occur in a job description.

    The description is lower-cased and every keyword is tested with a plain
    substring check, so a keyword also matches when it appears inside a
    longer word. Keywords are compared as given (they are not lower-cased
    here).

    Args:
        job_description: Text of the job description.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the matching keywords, in the order they appear in
        ``keywords``. A keyword listed twice is reported twice.
    """
    # The spaCy pipeline used to be run here only to read ``doc.text`` back,
    # which is the input string verbatim; matching on the string directly
    # gives the same result without the tokenizer/tagger/parser cost.
    text = job_description.lower()
    return [keyword for keyword in keywords if keyword in text]

# Denominator used to convert the number of matched keywords into a percentage.
# NOTE(review): presumably the size of the reference keyword set -- confirm,
# and consider deriving it from the loaded keywords instead of hard-coding it.
REFERENCE_KEYWORD_COUNT = 31

# Placeholder text stored for positions that have no real description.
MISSING_DESCRIPTION = 'No job description found to import from MindMatch.'

# Create a list to store the job information
job_info = []

# Iterate over the positions
for _, row in positions_df.iterrows():
    job_id = row['_id']
    job_name = row['name']
    job_description = row['description']
    job_education = row['education']

    # Skip the job if the description is empty or missing
    if pd.isnull(job_description) or job_description == MISSING_DESCRIPTION:
        continue

    # Extract the keywords from the job description
    found_keywords = extract_keywords(job_description, keywords)

    # Calculate the score: share of the reference keyword count, in percent
    score = len(found_keywords)
    match = round(score / REFERENCE_KEYWORD_COUNT * 100, 1)

    # Write the keywords to a per-job text file in the working directory.
    # The encoding is explicit because job names and keywords can contain
    # accented characters.
    with open(f'{job_id}_phds.txt', 'w', encoding='utf-8') as txt_file:
        txt_file.write(f'Job Name: {job_name}\n')
        txt_file.writelines(keyword + '\n' for keyword in found_keywords)

    # Print the match percentage
    print(f"Job ID: {job_id}, Job_name: {job_name}, Match: {match}%")

    # Add job information to the list
    job_info.append({
        'Job ID': job_id,
        'Name': job_name,
        'Education': job_education,
        'Match': match,
    })

# Create a DataFrame from the job information
job_info_df = pd.DataFrame(job_info)

# Path of the Excel report. Previously this variable named a different file
# and was never used; it now holds the file that is actually written
# (relative to the current working directory, as before).
excel_file_path = 'job_infoFREN.xlsx'

# Write the DataFrame to an Excel file
job_info_df.to_excel(excel_file_path, index=False)



Job ID: 336f328467f101, Job_name: test, Match: 0.0%
Job ID: 8e7d7a00fb6701, Job_name: rest, Match: 12.9%
Job ID: 9b791cefe43401, Job_name: Ingénieur de production – batteries H/F, Match: 41.9%
Job ID: 2076aad842aa01, Job_name: Coordonateur·trice de Recherche en Neuroimagerie, Match: 38.7%
Job ID: 4debb849b4d801, Job_name: Deep Learning Researcher, Match: 12.9%
Job ID: 0b11abd640cb01, Job_name: Senior Machine Learning Scientist H/F (CDI), Match: 19.4%
Job ID: db31d6ba76dd01, Job_name: Chercheur·euse en Machine Learning pour l'image H/F (CDI), Match: 12.9%
Job ID: 4b09219bb22d01, Job_name: Lead Scientist - Artificial Intelligence, Match: 6.5%
Job ID: ad3da83a8d0701, Job_name: Ingénieur Deep Learning pour l'image H/F, Match: 29.0%
Job ID: 2365523ef82901, Job_name: Expert·e en agrégation de données (CDI) H/F, Match: 12.9%
Job ID: c6e8472f964401, Job_name: Développeur·euse Full Stack (CDI) H/F, Match: 19.4%
Job ID: 6faf3fedf54201, Job_name: Chef·fe de projet Microsoft BI H/F (CDI), Match: 6