In [20]:
import pandas as pd
import re
import spacy
from collections import defaultdict
from tqdm import tqdm
import json
# Load spaCy model (use 'en_core_web_sm' for efficiency; replace with 'en_core_web_lg' if needed)


In [10]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])  # Keep parser for noun chunks


In [3]:
df = pd.read_csv('/home/jax/CVreviewArabian/data/preprocessed/sampled_cleaned.csv')

In [12]:
df['combined_text'] = df['skills'] + ' ' + df['Responsibilities']

# Preprocessing function: clean, tokenize, remove stopwords, lemmatize using spaCy
def preprocess_text(text):
    # Clean text: lowercase and remove special characters
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Process with spaCy
    doc = nlp(text)
    # Tokenize, remove stopwords, and lemmatize
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return doc, tokens

In [None]:
df['Job Title'].tolist()

In [13]:
def extract_skills(text):
    # Preprocess the text
    doc, tokens = preprocess_text(text)
    skills = set()  # Use set to avoid duplicates
    
    # Try to parse skills column directly if it’s comma-separated
    skills_match = re.match(r'^(.*?)(?:\s*Description:|\s*Responsibilities:|$)', text, re.IGNORECASE | re.DOTALL)
    if skills_match:
        skills_text = skills_match.group(1).strip()
        if ',' in skills_text:
            # Split comma-separated skills and clean
            potential_skills = [skill.strip().lower() for skill in skills_text.split(',')]
            for skill in potential_skills:
                # Process each skill with spaCy to lemmatize
                skill_doc = nlp(skill)
                skill_tokens = [token.lemma_ for token in skill_doc if token.is_alpha and not token.is_stop]
                if skill_tokens:
                    skills.add(' '.join(skill_tokens))
    
    # Extract noun phrases as potential skills from the entire text
    for chunk in doc.noun_chunks:
        # Lemmatize and clean noun phrase
        chunk_text = ' '.join(token.lemma_ for token in chunk if token.is_alpha and not token.is_stop)
        if chunk_text and len(chunk_text.split()) <= 3:  # Limit to short phrases (likely skills)
            skills.add(chunk_text)
    
    # Extract single tokens that look like skills (e.g., programming languages)
    for token in tokens:
        if len(token) > 2:  # Avoid short, generic tokens
            skills.add(token)
    
    return list(skills)

# Apply preprocessing and skill extraction with progress bar
tqdm.pandas()
df['extracted_skills'] = df['combined_text'].progress_apply(extract_skills)


100%|██████████| 161594/161594 [40:50<00:00, 65.95it/s] 


In [14]:
df['extracted_skills'].iloc[3]

['upselle',
 'manage',
 'revenue',
 'sale',
 'forecast',
 'ensure',
 'sale account',
 'develop',
 'opportunity',
 'revenue growth',
 'identify',
 'negotiation',
 'closing',
 'management',
 'skill',
 'growth',
 'closing skill',
 'crossselle',
 'account',
 'strategy',
 'account strategy']

In [16]:
# Map skills to each job_title
skill_map = defaultdict(set)
for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Mapping skills to job titles"):
    job = row['Job Title']
    for skill in row['extracted_skills']:
        skill_map[job].add(skill)

# Convert sets to lists for the final mapping
skill_map = {job: list(skills) for job, skills in skill_map.items()}

Mapping skills to job titles: 100%|██████████| 161594/161594 [00:08<00:00, 18439.69it/s]


In [17]:
print(skill_map)

{'Speech Therapist': ['communication challenge', 'educator', 'plan', 'treat', 'language', 'language therapy', 'speech disorder aphasia', 'parent', 'therapeutic', 'diagnose', 'adult speech therapy speech language assessment rehabilitation technique swallow disorder communication skill focus speech language therapy adult communication challenge address speech disorder', 'rehabilitation', 'aphasia', 'range', 'planning', 'pediatric speech therapy speech language assessment individualized treatment plan child development communication skill assess treat speech language disorder child provide therapy articulation', 'individualized treatment plan', 'articulation', 'therapy plan', 'pediatric', 'language development', 'development', 'customize', 'disorder', 'challenge', 'therapy', 'focus', 'communication', 'individual need', 'ability', 'adult', 'communication skill', 'communication ability', 'communication skill collaborate parent educator', 'age group', 'wide range', 'use', 'assess', 'collabor

In [19]:
skill_map['Data Analyst']

['modeling',
 'python',
 'hadoop',
 'large',
 'analytical',
 'datum quality tool',
 'eg informatica',
 'experiment',
 'develop',
 'like',
 'skill',
 'data quality assessment',
 'issue',
 'data',
 'provide',
 'power',
 'business',
 'predictive model',
 'establish',
 'datum analysis visualization tool',
 'informatica',
 'clean',
 'datum',
 'analysis',
 'ensure',
 'decisionmake',
 'intelligence',
 'predictive',
 'datum quality',
 'problemsolving',
 'learn',
 'principle',
 'attention',
 'knowledge',
 'data quality standard',
 'big data technology',
 'data quality assessment improvement datum profiling validation knowledge datum quality tool',
 'detail',
 'quality',
 'steward',
 'identify',
 'technique',
 'validation',
 'technology',
 'data experiment',
 'trend',
 'strong',
 'assessment',
 'machine',
 'profiling',
 'governance',
 'algorithm',
 'collaborate',
 'business intelligence concept',
 'improvement datum profiling',
 'concept',
 'power bi sql database query business intelligence conc

In [21]:
json_file = 'skillsModel2.json'
with open(json_file, 'w') as f:
        json.dump(skill_map, f, indent=4)

In [22]:
def extract_cv_skills(text):
    skills = set()
    doc, tokens = preprocess_text(text)
    
    # Try to parse comma-separated skills if present
    if ',' in text:
        potential_skills = [skill.strip().lower() for skill in text.split(',')]
        for skill in potential_skills:
            skill_doc = nlp(skill)
            skill_tokens = [token.lemma_ for token in skill_doc if token.is_alpha and not token.is_stop]
            if skill_tokens:
                skills.add(' '.join(skill_tokens))
    
    # Extract noun phrases as potential skills
    for chunk in doc.noun_chunks:
        chunk_text = ' '.join(token.lemma_ for token in chunk if token.is_alpha and not token.is_stop)
        if chunk_text and len(chunk_text.split()) <= 3:
            skills.add(chunk_text)
    
    # Extract single tokens
    for token in tokens:
        if len(token) > 2:
            skills.add(token)
    
    return list(skills)


In [23]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union > 0 else 0

# Function to find top 3 job titles and missing skills
def find_top_jobs_and_missing_skills(text_cv):
    # Extract skills from CV
    cv_skills = set(extract_cv_skills(text_cv))
    
    # Calculate similarity for each job
    job_similarities = []
    for job_title, job_skills in skill_map.items():
        job_skills_set = set(job_skills)
        similarity = jaccard_similarity(cv_skills, job_skills_set)
        # Find missing skills (skills in job but not in CV)
        missing_skills = list(job_skills_set - cv_skills)
        job_similarities.append((job_title, similarity, missing_skills))
    
    # Sort by similarity (descending) and get top 3
    job_similarities.sort(key=lambda x: x[1], reverse=True)
    top_3_jobs = job_similarities[:3]
    
    # Prepare result
    result = []
    for job_title, similarity, missing_skills in top_3_jobs:
        result.append({
            'job_title': job_title,
            'similarity': round(similarity, 3),
            'missing_skills': missing_skills
        })
    
    return result


In [24]:
text_cv = "Python, SQL, data analysis, experience in coding and testing"
top_jobs = find_top_jobs_and_missing_skills(text_cv)


In [25]:
for job in top_jobs:
    print(f"Job Title: {job['job_title']}")
    print(f"Similarity: {job['similarity']}")
    print(f"Missing Skills: {', '.join(job['missing_skills']) if job['missing_skills'] else 'None'}")
    print()

Job Title: Market Analyst
Similarity: 0.069
Missing Skills: product offering, business, market, analyze, product, industry trend, positioning, competitive, competitor, advantage, knowledge, industry, offering, recommendation, assess, research, trend, strategy, competitive advantage, provide

Job Title: Data Scientist
Similarity: 0.06
Missing Skills: evaluation, load process, evaluate, analytic, datum pipeline, prediction preprocess, programming, build, hadoop, clean, load big data technology, reporting, process, warehouse, machine learning, datum accessibility, ensure, deep, datum integration etl, datum integration etl extract, scientist, develop, spark database management datum warehousing design, model, preprocesse, maintain, parameter, preprocess, accessibility, maintain datum pipeline etl extract, load, machine learning model, availability, design, data scientist, database, pipeline, big, transform, analyst, management, technology, prediction, finetune, integration, python programm