In [1]:
# With tf-idf

!pip install jsonlines
!pip install pandas
!pip install spacy
!pip install langdetect

import jsonlines
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect

# Extra langdetect names needed only by the language-filter step below.
from langdetect import DetectorFactory, LangDetectException

# Read the positions data from the CSV file
positions_df = pd.read_csv('/kaggle/input/positionscsv/positions.csv')

# Drop the row if the education is empty
positions_df = positions_df.dropna(subset=["education"]).reset_index(drop=True)

# langdetect's algorithm is non-deterministic unless a seed is set; pin it so
# the same rows are kept on every run.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code `detect` reports for *text*, or None.

    None is returned when *text* is not a string (e.g. NaN from an empty CSV
    cell), is blank, or makes `detect` raise LangDetectException. This way a
    single unusable description drops its own row instead of aborting the
    whole filtering step.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Drop the row if the job description is not in English.
# A boolean mask is used instead of a temporary 'language' column so that a
# column of that name already present in the CSV is neither overwritten nor
# dropped.
is_english = positions_df['description'].apply(_detect_language) == 'en'
positions_df = positions_df[is_english].reset_index(drop=True)

# Save the modified DataFrame to a CSV file
positions_df.to_csv('modified_positions.csv', index=False)

# Load the keywords from the JSON Lines file.
# Only records labelled 'COREPHD' are used; from each of their pattern entries
# the value stored under 'LOWER' is collected.
keywords = []
seen_keywords = set()  # de-duplicates while `keywords` keeps first-seen order
with jsonlines.open('/kaggle/input/jz-skill-patternsmaincore/jz_skill_patternsMain.jsonl') as json_file:
    for data in json_file:
        # .get() so a record without a 'label' key is skipped, not a KeyError
        if data.get('label') != 'COREPHD':
            continue
        for pattern in data.get('pattern') or []:
            # Entries that are not dicts (e.g. the characters of a plain
            # string pattern) have no 'LOWER' key to read; skip them.
            if not isinstance(pattern, dict):
                continue
            keyword = pattern.get('LOWER')
            # Keep only non-empty strings, each one once: the list is later
            # passed as a TfidfVectorizer vocabulary, which rejects repeated
            # terms.
            if isinstance(keyword, str) and keyword and keyword not in seen_keywords:
                seen_keywords.add(keyword)
                keywords.append(keyword)

# Function to extract the keywords that occur in a job description
def extract_keywords(job_description, keywords):
    """Return the keywords that occur as whole tokens in *job_description*.

    The description is lowercased once and each keyword is compared against it
    exactly as given, so keywords are expected to be lowercase already. A
    keyword counts as found only when it is not directly preceded or followed
    by a word character: "ai" is found in "AI and ML" but not in "maintain".

    Args:
        job_description: Text of the job posting. Anything that is not a
            non-empty string yields an empty result.
        keywords: Iterable of keyword strings. Non-string and empty entries
            are ignored; duplicates are reported once.

    Returns:
        A list of the matching keywords, in the order they first appear in
        *keywords*.
    """
    import re  # function-scope import keeps this block self-contained

    if not isinstance(job_description, str) or not job_description:
        return []

    text = job_description.lower()
    # dict.fromkeys de-duplicates while preserving first-seen order.
    candidates = dict.fromkeys(
        keyword for keyword in keywords if isinstance(keyword, str) and keyword
    )

    # No vectorizer is fitted here: the result depends only on which keywords
    # occur in the text, so fitting one per call would be discarded work.
    found_keywords = []
    for keyword in candidates:
        # Cheap substring test first; the regex only confirms token boundaries.
        if keyword not in text:
            continue
        if re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text):
            found_keywords.append(keyword)
    return found_keywords

# Create a list to store the job information
job_info = []

# Iterate over the positions: for each usable description, find the keywords,
# compute a match percentage, write a per-job text file and record a summary row.
for _, row in positions_df.iterrows():
    job_id = row['_id']
    job_name = row['name']
    job_description = row['description']
    job_education = row['education']

    # Skip the job if the description is empty or missing
    if pd.isnull(job_description) or job_description == 'No job description found to import from MindMatch.':
        continue

    # Extract the keywords from the job description
    found_keywords = extract_keywords(job_description, keywords)

    # Calculate the score: distinct keywords found per 100 whitespace-separated words
    num_words = len(job_description.split())
    score = len(found_keywords)
    # A description made only of whitespace has zero words; report 0.0
    # instead of raising ZeroDivisionError.
    match = round(score / num_words * 100, 1) if num_words else 0.0

    # Write the keywords to a text file.
    # The encoding is explicit so non-ASCII job names or keywords do not
    # depend on the platform's default encoding.
    with open(f'{job_id}_phds.txt', 'w', encoding='utf-8') as txt_file:
        txt_file.write(f'Job Name: {job_name}\n')
        txt_file.writelines(f'{keyword}\n' for keyword in found_keywords)

    # Print the match percentage
    print(f"Job ID: {job_id}, Job_name: {job_name}, Match: {match}%, Num Words: {num_words}")

    # Add job information to the list
    job_info.append({
        'Job ID': job_id,
        'Name': job_name,
        'Education': job_education,
        'Match': match,
        'Num Words': num_words,
        'Keywords': ', '.join(found_keywords),
    })

# Create a DataFrame from the job information
job_info_df = pd.DataFrame(job_info)

# Specify the path and filename for the Excel file
excel_file_path = '/kaggle/working/tf_idf.xlsx'

# Write the DataFrame to an Excel file
job_info_df.to_excel(excel_file_path, index=False)


Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0
