In [None]:
import ast
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import ollama
import pandas as pd
import requests
import tqdm

In [None]:
# Load the resume dataset; later cells read its 'Category' and 'Resume_str' columns.
resumes = pd.read_csv(filepath_or_buffer='datasets/Resume.csv')

In [None]:
# Load the job-postings sample; later cells read its 'jobdescription' column.
jobs = pd.read_csv(filepath_or_buffer='datasets/dice_com-job_us_sample.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
resumes.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
jobs.head()

In [None]:
# Number of resumes per category, most frequent first.
resumes.loc[:, 'Category'].value_counts()

In [None]:
# Show only the resumes labelled INFORMATION-TECHNOLOGY.
resumes.loc[resumes['Category'].eq('INFORMATION-TECHNOLOGY')]

In [None]:
def get_response(pompt, model="mistral:instruct"):
    """Send a prompt to an Ollama model and return the raw generate result.

    Args:
        pompt: Prompt text forwarded to ``ollama.generate``.
        model: Name of the model to query; defaults to ``"mistral:instruct"``.

    Returns:
        Whatever ``ollama.generate`` returns, unmodified.
    """
    # NOTE(review): 'pompt' looks like a typo for 'prompt'; the name is kept
    # because renaming it would change the keyword interface for callers.
    return ollama.generate(prompt=pompt, model=model)

In [None]:
def get_prompt(description, doc_type):
    """Build the skill-extraction prompt for a resume or a job description.

    Args:
        description: Full text of the document to extract skills from.
        doc_type: Either ``'resume'`` or ``'job'``; interpolated into the prompt.

    Returns:
        The prompt string, ending with ``description``.

    Raises:
        ValueError: If ``doc_type`` is neither ``'resume'`` nor ``'job'``.
    """
    if doc_type not in ('resume', 'job'):
        # Fail loudly: an error string returned from here would be forwarded
        # to the model as if it were a real prompt.
        raise ValueError(f"Invalid type: {doc_type!r} (expected 'resume' or 'job')")

    # Ask for exactly one output format (a JSON array); the earlier wording
    # requested both a comma-separated list and a JSON array.
    prompt = f"""Extract the skills from the following {doc_type} and return them as a JSON array of strings.
Do not include education and certifications.
Do not return the same skill more than once even if it's mentioned multiple times.
Do not include years of experience or seniority levels, only the skill names.
Return only the JSON array of strings, with no extra text.

    {description}"""

    return prompt

In [None]:
def get_skills(document, doc_type):
    """Ask the model for the skills in a document and parse its reply.

    Args:
        document: Text of the resume or job description.
        doc_type: ``'resume'`` or ``'job'``; passed through to ``get_prompt``.

    Returns:
        The parsed model reply (a list of skill names when the model follows
        the prompt).

    Raises:
        ValueError, SyntaxError: If the reply cannot be parsed either as JSON
            or as a Python literal.
    """
    prompt = get_prompt(document, doc_type)
    raw = get_response(prompt)['response'].strip()

    # Models often wrap the array in a code fence or add a sentence around it;
    # keep only the outermost [...] span when one is present.
    start, end = raw.find('['), raw.rfind(']')
    if start != -1 and end > start:
        raw = raw[start:end + 1]

    try:
        # The prompt requests JSON, so parse it as JSON first.
        skills = json.loads(raw)
    except json.JSONDecodeError:
        # Fall back to Python literal syntax (e.g. single-quoted strings).
        skills = ast.literal_eval(raw)
    return skills

In [None]:
def get_embedding(text):
    """Request an embedding for ``text`` from the ``nomic-embed-text`` model.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``ollama.embeddings`` returns, unmodified.
    """
    return ollama.embeddings(prompt=text, model="nomic-embed-text")

In [None]:
# Inspect the raw text of one sample resume (positional row 50).
resumes['Resume_str'].iloc[50]

In [None]:
# Try skill extraction on the same sample resume.
get_skills(resumes['Resume_str'].iloc[50], 'resume')

In [None]:
# Try skill extraction on one sample job description (positional row 50).
get_skills(jobs['jobdescription'].iloc[50], 'job')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
resumes.head()

In [None]:
# Placeholder column; the extraction cells below fill it row by row via `resumes.at[...]`.
resumes['extracted_skills'] = None

In [None]:
# Sequential skill extraction: one model call per resume.
# NOTE: the ThreadPoolExecutor cell further down does the same work
# concurrently; running both repeats every model call.
failed_rows = []  # index labels whose model reply could not be parsed
for index, row in tqdm.tqdm(resumes.iterrows(), total=len(resumes)):
    try:
        skills = get_skills(row['Resume_str'], 'resume')
    except (ValueError, SyntaxError):
        # Unparseable reply: leave the placeholder value and keep going
        # rather than aborting the whole run on one bad response.
        failed_rows.append(index)
        continue
    resumes.at[index, 'extracted_skills'] = skills

In [None]:
def process_row(row):
    """Extract skills for one resume row; used as a thread-pool task.

    Args:
        row: A row of ``resumes`` (as yielded by ``iterrows``) that has a
            ``'Resume_str'`` entry.

    Returns:
        ``(row.name, skills)``: the row's index label and the parsed skills,
        so the caller can write the result back to the matching row.
    """
    # Reuse get_skills instead of repeating the prompt/generate/parse steps.
    return row.name, get_skills(row['Resume_str'], 'resume')

In [None]:
# Concurrent skill extraction: one process_row task per resume.
results = {}          # index label -> parsed skills
failed_futures = {}   # index label -> parse error raised by the task
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers as needed
    # Submit all tasks, remembering which row each future belongs to
    future_to_index = {
        executor.submit(process_row, row): index
        for index, row in resumes.iterrows()
    }

    # Process completed tasks with progress bar
    for future in tqdm.tqdm(as_completed(future_to_index), total=len(future_to_index)):
        index = future_to_index[future]
        try:
            _, skills = future.result()
        except (ValueError, SyntaxError) as exc:
            # Unparseable model reply: record it and continue with the
            # remaining rows instead of aborting the whole run.
            failed_futures[index] = exc
            continue
        results[index] = skills
        resumes.at[index, 'extracted_skills'] = skills

In [None]:
# `results` maps row index -> skills. Align it on the frame's index, and skip
# the assignment when it is empty so an already-filled column is not wiped.
if results:
    resumes['extracted_skills'] = pd.Series(results, dtype=object).reindex(resumes.index)

In [None]:
# Load the ESCO skills table; later cells read its 'preferredLabel' column.
esco = pd.read_csv(filepath_or_buffer='datasets/skills_en.csv')

In [None]:
# Preview a few labels instead of printing the entire column as a list.
esco['preferredLabel'].head(10)

In [None]:
# Plain Python list of the preferred skill labels, used as embedding input.
esco_skills = esco['preferredLabel'].to_list()

In [None]:
# Show the first ten skill labels as a sanity check.
esco_skills[:10]

In [None]:
# Embed all ESCO skill labels with the nomic-embed-text model.
# NOTE(review): every label is sent in one call; consider batching if the
# request proves too large or slow.
response = ollama.embed(input=esco_skills, model="nomic-embed-text")

In [None]:
# Store the returned embeddings next to the skills.
# NOTE(review): assumes `response['embeddings']` has one entry per row of `esco`, in the same order as `esco_skills` — confirm.
esco['embeddings'] = response['embeddings']

In [None]:
# Persist the ESCO skills together with their embeddings.
# Create the output directory first: to_parquet does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('processed', exist_ok=True)
esco.to_parquet('processed/esco_skills.parquet', index=False)