In [1]:
# Without tf-idf and without considering weight for duplications
!pip install jsonlines
!pip install pandas
!pip install spacy
!pip install langdetect

import jsonlines
import pandas as pd
import spacy
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# langdetect samples randomly, so the same text can be classified differently
# between runs; pinning the seed keeps the set of retained rows reproducible.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the langdetect language code for ``text``, or None.

    None is returned for values langdetect cannot classify: non-string cells
    (e.g. NaN for a missing description), blank strings, and text for which
    langdetect raises LangDetectException.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Read the positions data from the CSV file
positions_df = pd.read_csv('/kaggle/input/positionscsv/positions.csv')

# Drop the row if the education is empty
positions_df = positions_df.dropna(subset=["education"])

# Drop the row if the job description is not in English. Rows whose language
# cannot be determined compare unequal to 'en' and are dropped as well,
# instead of aborting the whole run with an exception.
is_english = positions_df['description'].apply(_detect_language) == 'en'
positions_df = positions_df[is_english].reset_index(drop=True)

# Save the modified DataFrame to a CSV file
positions_df.to_csv('modified_positions.csv', index=False)

# Load the keywords from the JSON file: every non-empty 'LOWER' value found
# in the token specs of the entries labelled COREPHD, in file order.
with jsonlines.open('/kaggle/input/jz-skill-patternsmaincore/jz_skill_patternsMain.jsonl') as pattern_reader:
    keywords = [
        token_spec.get('LOWER')
        for entry in pattern_reader
        if entry['label'] == 'COREPHD'
        for token_spec in entry.get('pattern', [])
        if token_spec.get('LOWER')
    ]

# Function to extract keywords from job description
def extract_keywords(job_description, keywords):
    """Return the keywords that occur in a job description.

    Matching is a case-insensitive substring test against the description,
    so a keyword also matches when it appears inside a longer word.

    Args:
        job_description: Text of the job posting (a ``str``).
        keywords: Iterable of keyword strings. They are compared against the
            lower-cased description, so they are expected to be lower-case.

    Returns:
        A list with the matching keywords in the order they appear in
        ``keywords``; a keyword listed several times in ``keywords`` is
        reported once per listing.
    """
    # A spaCy Doc's .text reproduces its input verbatim, so parsing the
    # description just to read .text back adds cost without changing the
    # result; match against the lower-cased string directly.
    description_text = job_description.lower()
    return [keyword for keyword in keywords if keyword in description_text]

# Denominator of the match percentage.
# NOTE(review): presumably the number of COREPHD keywords being searched for;
# confirm against the pattern file, since it is not derived from `keywords`.
TOTAL_KEYWORDS = 31

# Description value that is treated the same as a missing description
MISSING_DESCRIPTION = 'No job description found to import from MindMatch.'

# Create a list to store the job information (one dict per scored job)
job_info = []

# Iterate over the positions
for _, row in positions_df.iterrows():
    job_id = row['_id']
    job_name = row['name']
    job_description = row['description']
    job_education = row['education']

    # Skip the job if the description is empty or missing
    if pd.isnull(job_description) or job_description == MISSING_DESCRIPTION:
        continue

    # Extract the keywords from the job description
    found_keywords = extract_keywords(job_description, keywords)

    # Calculate the score: share of the expected keywords that were found,
    # as a percentage rounded to one decimal
    score = len(found_keywords)
    match = round(score / TOTAL_KEYWORDS * 100, 1)

    # Write the keywords to a text file. The encoding is explicit so that
    # non-ASCII job names are written identically regardless of the
    # platform's default locale.
    with open(f'{job_id}_phds.txt', 'w', encoding='utf-8') as txt_file:
        txt_file.write(f'Job Name: {job_name}\n')
        txt_file.writelines(f'{keyword}\n' for keyword in found_keywords)

    # Count the number of whitespace-separated words in the job description
    num_words = len(job_description.split())

    # Print the match percentage
    print(f"Job ID: {job_id}, Job_name: {job_name}, Match: {match}%, Num Words: {num_words}")

    # Add job information to the list
    job_info.append({
        'Job ID': job_id,
        'Name': job_name,
        'Education': job_education,
        'Match': match,
        'Num Words': num_words,
        'Keywords': ', '.join(found_keywords),
    })

# Create a DataFrame from the job information
job_info_df = pd.DataFrame(job_info)

# Specify the path and filename for the Excel file
excel_file_path = '/kaggle/working/ResultsNoWeightNotfidf.xlsx'

# Write the DataFrame to an Excel file
job_info_df.to_excel(excel_file_path, index=False)


Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l- \ done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993241 sha256=045ce4bc207b73b084d1c5beac309a7654d737682eaa5ecde067a050dca1143c
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Job ID: 8e7d7a00fb6701, Job_name: rest, Match: 6.5%, Num Words: 191
Job ID: 4b09219bb22d01, Job_name: Lead Scientist - Artificial Intelligence, Match: 3.2%, Num Words: 155
Job ID: 0564017952b801, Job_name: Mechatronics and Software Development Scientist H/F, Match: 3.2%, Num Words: 222
Job ID: c97c41c72de801, Job_name: Lead Software Developer H/F, Match: 6.5%, Num Words: 228
Job ID: 0286a158f05f01, Job_name: Researcher in Advanced Education, Match: 6.5%, Num Words: 247
Job ID: d2840269c20b01, Job_name: ANTIBODY ENGINEERING (SENIOR) SCIENTIST, Match: 3.2%, Num Words: 173
Job ID: a901ad66059901, Job_name: Biostatistician, Consulting Services (CDI) F/M, Match: 9.7%, Num Words: 289
Job ID: 7059e205499201, Job_name: Strategy and Management Consultant (CDI) F/M, Match: 16.1%, Num Words: 460
Job ID: 97a62c38f66801, Job_name: Strategy and Management Consultant (CDI) F/M, Match: 16.1%, Num Words: 460
Job ID: f0789d7c052501, Job_name: Advanced Analytics Specialist (CDI) F/M, Match: 9.7%, Num Wor