In [16]:
import pandas as pd
import re
import spacy
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
# Load spaCy's small English pipeline with the NER component disabled; every
# other component (including the dependency parser) stays enabled.
# NOTE(review): no noun-chunk usage is visible in this notebook. If none exists
# elsewhere, the parser could presumably be disabled as well to speed up the
# per-posting nlp() calls; confirm before changing.
nlp = spacy.load('en_core_web_sm', disable=['ner'])  # Keep parser for noun chunks


In [12]:
# Lemmas that preprocess_text discards in addition to spaCy stop words.
# Kept alphabetical with one initial letter per row so duplicates are easy to spot.
# NOTE(review): fused entries such as 'costsave', 'problemsolve' and 'finetune'
# presumably match hyphenated words whose punctuation is deleted before
# tokenization; verify against preprocess_text before editing them.
generic_terms = {
    'accessibility', 'advantage', 'algorithm', 'analyst', 'analytic', 'analyze',
    'assess', 'availability',
    'big', 'build', 'business',
    'chain', 'clean', 'collaborate', 'competitive', 'competitor', 'costsave',
    'data', 'datum', 'deep', 'design', 'develop',
    'ensure', 'etl', 'evaluation', 'extract',
    'finetune',
    'generate',
    'hadoop',
    'identify', 'include', 'industry', 'integration', 'inventory',
    'knowledge',
    'large', 'learning', 'level', 'load', 'logistic',
    'machine', 'maintain', 'management', 'market', 'model', 'monitor',
    'offering', 'opportunity', 'optimize',
    'parameter', 'performance', 'pipeline', 'planning', 'positioning',
    'prediction', 'preprocess', 'problemsolve', 'process', 'product',
    'programming', 'provide',
    'recommendation', 'report', 'reporting', 'research',
    'scientist', 'shipment', 'spark', 'strategy', 'supply',
    'technology', 'track', 'training', 'transform', 'transportation', 'trend',
    'warehouse', 'warehousing',
}

In [3]:
# Load the sampled, cleaned job-postings table. Later cells read its 'skills',
# 'Responsibilities' and 'Job Title' columns.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv('/home/jax/CVreviewArabian/data/preprocessed/sampled_cleaned.csv')

In [4]:
# Concatenate skills and responsibilities into one text field per posting.
# Missing values are replaced with '' first: in pandas, NaN + str yields NaN,
# so a posting missing either field would otherwise lose the other field's
# text as well and leave a non-string value in 'combined_text'.
df['combined_text'] = (
    df['skills'].fillna('') + ' ' + df['Responsibilities'].fillna('')
)


In [13]:
def preprocess_text(text):
    """Normalize raw text into a list of content-bearing lemmas.

    The text is lowercased and every character that is neither a word
    character nor whitespace is deleted (not replaced by a space, so
    punctuation-joined words fuse together). The result is then run through
    the spaCy pipeline ``nlp``.

    Args:
        text: Raw text. Any non-string value (e.g. None, or the float NaN
            pandas uses for a missing cell) is treated as empty text.

    Returns:
        list[str]: Lemmas, in document order, of the tokens that are
        alphabetic, are not stop words, and whose lemma is not in
        ``generic_terms``. Duplicates are kept so callers can count
        frequencies.
    """
    # Missing cells arrive as None/NaN; calling .lower() on them would raise.
    if not isinstance(text, str):
        return []

    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Nothing left to analyze: skip the (comparatively expensive) pipeline call.
    if not cleaned.strip():
        return []

    doc = nlp(cleaned)
    return [
        token.lemma_
        for token in doc
        if token.is_alpha
        and not token.is_stop
        and token.lemma_ not in generic_terms
    ]

In [None]:
# Pool the lemmas of every posting under its job title, so postings that share
# a title contribute to one combined token list.
job_tokens = defaultdict(list)
tqdm.pandas()
title_text_pairs = zip(df['Job Title'], df['combined_text'])
for title, text in tqdm(title_text_pairs, total=len(df), desc="Collecting tokens per job"):
    job_tokens[title] += preprocess_text(text)


Collecting tokens per job: 100%|██████████| 161594/161594 [15:48<00:00, 170.31it/s]


NameError: name 'Counter' is not defined

In [17]:
# For each job title keep its 40 most frequent lemmas, most frequent first,
# as that job's keyword list.
skill_map = {
    title: [lemma for lemma, _count in Counter(lemmas).most_common(40)]
    for title, lemmas in job_tokens.items()
}

In [19]:
# Display the whole job-title -> keyword-list mapping built in the previous cell.
# NOTE(review): this prints every job title; a small slice would keep the output readable.
(skill_map)

{'Speech Therapist': ['speech',
  'therapy',
  'communication',
  'disorder',
  'language',
  'skill',
  'assessment',
  'adult',
  'swallow',
  'speechlanguage',
  'plan',
  'child',
  'development',
  'technique',
  'individual',
  'articulation',
  'treat',
  'rehabilitation',
  'focus',
  'challenge',
  'address',
  'aphasia',
  'difficulty',
  'customize',
  'need',
  'pediatric',
  'individualized',
  'treatment',
  'parent',
  'educator',
  'pathology',
  'diagnose',
  'wide',
  'range',
  'age',
  'group',
  'use',
  'tool',
  'therapeutic',
  'help'],
 'Architectural Designer': ['interior',
  'architect',
  'space',
  'color',
  'collaboration',
  'cad',
  'software',
  'architectural',
  'building',
  'code',
  'theory',
  'material',
  'selection',
  'eg',
  'sketchup',
  'client',
  'budget',
  'focus',
  'select',
  'finish',
  'furnishing',
  'scheme',
  'create',
  'concept',
  'mood',
  'board',
  'contractor',
  'drafting',
  'autocad',
  'modeling',
  'blueprint',
  '

In [20]:
# Function to extract keywords from CV.
# CV text goes through the same preprocess_text defined earlier in this
# notebook, so CVs and job postings are normalized identically (the function
# is intentionally not redefined here).
def extract_cv_keywords(text):
    """Return the unique keyword lemmas found in a CV.

    Args:
        text: Raw CV text.

    Returns:
        list[str]: Each lemma produced by ``preprocess_text`` exactly once,
        in order of first appearance.
    """
    tokens = preprocess_text(text)
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result is deterministic across runs (list(set(...)) order is not).
    return list(dict.fromkeys(tokens))

In [21]:
def jaccard_similarity(set1, set2):
    """Jaccard index of two sets: size of the intersection over size of the union.

    Returns 0 when both sets are empty (the union has no elements).
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

# Function to find the best-matching job titles and their missing keywords
def find_top_jobs_and_missing_keywords(text_cv, top_n=3):
    """Rank job titles by keyword overlap with a CV.

    Every job title in ``skill_map`` is scored by the Jaccard similarity
    between the CV's keyword set and that job's keyword set.

    Args:
        text_cv: Raw CV text.
        top_n: How many best-matching job titles to return (default 3).

    Returns:
        list[dict]: At most ``top_n`` entries, highest similarity first
        (ties keep ``skill_map`` order). Each entry has:
            'job_title': the job title,
            'similarity': Jaccard similarity rounded to 3 decimals,
            'missing_keywords': the job's keywords absent from the CV, in the
                same order as the job's list in ``skill_map``.
    """
    cv_keywords = set(extract_cv_keywords(text_cv))

    scored = []
    for job_title, job_keywords in skill_map.items():
        similarity = jaccard_similarity(cv_keywords, set(job_keywords))
        # Walk the job's own list instead of taking a set difference so the
        # result order is deterministic and follows the list's ranking.
        # dict.fromkeys guards against repeated entries.
        missing_keywords = [
            keyword
            for keyword in dict.fromkeys(job_keywords)
            if keyword not in cv_keywords
        ]
        scored.append((job_title, similarity, missing_keywords))

    # list.sort is stable, so equal scores keep skill_map insertion order.
    scored.sort(key=lambda entry: entry[1], reverse=True)

    return [
        {
            'job_title': job_title,
            'similarity': round(similarity, 3),
            'missing_keywords': missing_keywords,
        }
        for job_title, similarity, missing_keywords in scored[:top_n]
    ]

In [30]:
# Try the matcher on a one-line sample CV.
text_cv = """
im machine learning and ai engineering
"""
top_jobs = find_top_jobs_and_missing_keywords(text_cv)

# Print results: one three-line summary per match, followed by a blank line.
for match in top_jobs:
    missing = match['missing_keywords']
    missing_text = ', '.join(missing) if missing else 'None'
    print(
        f"Job Title: {match['job_title']}",
        f"Similarity: {match['similarity']}",
        f"Missing Keywords: {missing_text}",
        "",
        sep="\n",
    )

Job Title: Process Engineer
Similarity: 0.03
Missing Keywords: lean, industrial, control, reaction, waste, operation, skill, safety, protocol, technique, sigma, laboratory, chemical, focus, system, improve, streamline, environmental, reduce, methodology, workflow, quality, optimization, implement, regulation, productivity, manufacturing, specialize, analysis, compliance

Job Title: Electrical Engineer
Similarity: 0.024
Missing Keywords: power, instrumentation, network, validation, embed, energy, component, circuit, plc, solution, grid, application, implement, testing, transmission, flow, layout, crossfunctional, reliability, efficiency, analysis, pcb, control, team, safety, system, troubleshoot, electrical, hardware, automation, development, electronic, perform, distribution, robotic, regulation, renewable, conduct, industrial

Job Title: Electrical Designer
Similarity: 0.024
Missing Keywords: plan, project, proficiency, manufacturability, software, energy, component, circuit, lighting