In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm


In [2]:
df2 = pd.read_csv('/home/jax/CVreviewArabian/data/preprocessed/sampled_cleaned.csv')
df2.shape

(161594, 18)

In [3]:
df2.columns

Index(['Unnamed: 0', 'Experience', 'Qualifications', 'Salary Range',
       'location', 'Country', 'latitude', 'longitude', 'Work Type',
       'Company Size', 'Preference', 'Job Title', 'Role', 'Job Portal',
       'Job Description', 'Benefits', 'skills', 'Responsibilities'],
      dtype='object')

In [4]:
import pandas as pd
import spacy
import re
from collections import Counter
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [5]:
TESTING_SKILLS = [
    'selenium', 'junit', 'testng', 'postman', 'jmeter', 'cypress', 'appium', 
    'loadrunner', 'qa', 'quality assurance', 'test automation', 'manual testing', 
    'regression testing', 'functional testing', 'performance testing', 'usability testing', 
    'api testing', 'cross-browser testing', 'mobile testing'
]
GENERAL_SKILLS = [
    'python', 'java', 'javascript', 'sql', 'git', 'jenkins', 'docker', 'agile', 
    'scrum', 'problem solving', 'communication', 'jira', 'bugzilla','c++', 
    'software development', 'html', 'css'
]
SKILLS_LIST = TESTING_SKILLS + GENERAL_SKILLS

# Role keywords for filtering relevant jobs
TESTER_KEYWORDS = ['tester', 'qa', 'quality assurance', 'test engineer', 'software tester']


In [6]:
cv_text = """
SUMMARY
Computing and Data Science undergraduate (Expected Graduation: 2026) specializing in Machine Learning and Data Analysis.
Possesses hands-on experience developing predictive models, performing complex data preprocessing, feature engineering,
and visualizing datasets through academic projects and extensive self-study. Proven ability to leverage Python, Scikit-learn,
TensorFlow, and SQL to derive actionable insights and build data-driven solutions. Eager to apply analytical and technical skills
to a challenging Junior Machine Learning Engineer role.
EDUCATION
B.Sc. in Computing and Data Science | Alexandria University, Alexandria, Egypt Expected Graduation: 2026 | GPA: 2.95
●​●​●​●​Completed rigorous coursework covering algorithms, data structures, statistics, database management, and software
engineering principles.
Developed 4+ significant data science and machine learning projects, focusing on data analysis, model implementation,
evaluation, and visualization.
Applied software development best practices, achieving 85%+ project code coverage through comprehensive unit
testing (e.g., using Python's unittest framework).
Gained experience with distributed systems concepts relevant to scalable ML applications.
SELF-STUDY & KEY LEARNINGS
●​●​●​Core Texts: Deep-dived into "Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow" (Géron) and "Pattern
Recognition and Machine Learning" (Bishop).
Foundational Knowledge: Solidified understanding through study of 10+ resources covering Linear Algebra, Calculus,
Probability & Statistics, and essential Deep Learning concepts.
Online Courses: Completed 12+ courses across platforms like Coursera/Udemy covering Python, Data Science
methodologies, AWS fundamentals, and MLOps principles (mention specific relevant course names if possible, e.g.,
"Machine Learning by Andrew Ng").
PROJECTS
Credit Card Fraud Detection Model (2024)
●​
●​
●​
●​
Developed a Support Vector Machine (SVM) model to predict credit card fraud, achieving 94% accuracy.
Reduced feature engineering time by 20% through optimized data preprocessing with Pandas and NumPy.
Improved model performance by 10% through hyperparameter tuning with GridSearchCV.
Presented model insights with clear visualizations (Matplotlib, Seaborn), reducing report analysis time by 15%.
Bookstore Sales Data Analysis & Visualization | University Project (2024 – 2025)
●​●​Analyzed a dataset of 10,000+ book records and sales transactions using Python (Pandas) and SQLite to uncover
purchasing patterns and customer segments.
Implemented Recency-Frequency-Monetary (RFM) analysis to score and segment customers, identifying high-value
●​●​●​●​patrons and potential churn risks.
Optimized SQL queries and Pandas data manipulation techniques, achieving a 30% reduction in data processing and
analysis time.
Developed interactive data visualizations (bar charts, scatter plots) using Matplotlib and Seaborn to illustrate sales
trends, top-performing genres, and RFM segment distributions.
Created a custom GUI application using Tkinter to present key findings and analytical insights, improving data
accessibility for non-technical users by 20%.
Generated reports that informed potential sales strategies, contributing to a projected 15% increase in targeted
marketing effectiveness based on data insights.
Smart Gate IoT System (Data Handling Focus) | University Project (2023 – 2024)
●​●​●​Contributed to an IoT project involving automated vehicle detection and data collection using sensors and servos.
Implemented data handling logic to process and store sensor readings (1000+ entries per day) reliably using Python and
Firebase cloud storage.
Assisted in integrating mobile camera detection for real-time data input, focusing on data validation and error handling
aspects.
TECHNICAL SKILLS
●​●​●​●​●​●​Machine Learning: Scikit-learn, TensorFlow (Keras), Model Development & Evaluation, Feature Engineering, Data
Preprocessing, Hyperparameter Tuning, Classification, Regression, Clustering Basics, NLP Concepts
Python Libraries: Pandas, NumPy, Matplotlib, Seaborn, Flask
Data Analysis & Visualization: Statistical Analysis, Data Modeling, Power BI, Excel, Google Sheets
Databases: SQL (MySQL, PostgreSQL), NoSQL (MongoDB)
Software Development: Git, GitHub, REST API Development, Unit Testing
Foundations: Statistics & Probability, Linear Algebra, Calculus
LANGUAGES
●​ Arabic: Native
●​ English: Advanced
"""
preprocessed_file = '/home/jax/CVreviewArabian/model/preprocessed_jobs.pkl'

In [7]:
def extract_skills_from_docs(docs, skills_list):
        """
        Extracts skills from a list of spaCy docs.
        """
        skills_per_doc = []
        for doc in docs:
            processed_tokens = [
                token.lemma_
                for token in doc
                if not token.is_stop and not token.is_punct and token.is_alpha
            ]
            text_for_regex = ' '.join(processed_tokens)
            found_skills = set()
            for skill in skills_list:
                pattern = r"\b" + re.escape(skill) + r"\b"
                if re.search(pattern, text_for_regex):
                    found_skills.add(skill)
            skills_per_doc.append(found_skills)
        return skills_per_doc

In [8]:
if os.path.exists(preprocessed_file):
    print(f"Loading preprocessed job data from {preprocessed_file}...")
    df2 = pd.read_pickle(preprocessed_file)
else:
    df2['combined_text'] = df2['Job Description'] + ' ' + df2['skills'] + ' ' + df2['Responsibilities']

        # Batch process body texts with progress
        # Note: For multiprocessing, tqdm on pipe might not work perfectly; consider setting n_process=1 for accurate progress if needed.
        # Alternatively, keep n_process>1 for speed, and add print statements or use multiprocessing-aware progress if advanced setup.
    body_texts = df2['combined_text'].str.lower().tolist()
    print("Processing body texts with spaCy...")
    body_docs = list(tqdm(nlp.pipe(body_texts, batch_size=100, n_process=4), total=len(body_texts), desc="spaCy processing body texts"))  # Progress on generator consumption
    df2['body_skills'] = extract_skills_from_docs(body_docs, SKILLS_LIST)

    # Batch process title texts (shorter, but still batched for consistency)
    title_texts = df2['Job Title'].str.lower().tolist()
    print("Processing title texts with spaCy...")
    title_docs = list(tqdm(nlp.pipe(title_texts, batch_size=100, n_process=4), total=len(title_texts), desc="spaCy processing title texts"))
    df2['title_skills'] = extract_skills_from_docs(title_docs, SKILLS_LIST)

    # 3. Combine skills from title and body with progress
    tqdm.pandas(desc="Combining skills")
    df2['processed_skills'] = df2.progress_apply(lambda row: row['title_skills'] | row['body_skills'], axis=1)

    # Save the preprocessed DataFrame for future runs
    df2.to_pickle(preprocessed_file)
    print(f"Preprocessed job data saved to {preprocessed_file}.")

Loading preprocessed job data from /home/jax/CVreviewArabian/model/preprocessed_jobs.pkl...


In [9]:
def preprocess_and_extract_skills(text, skills_list):
    """
    Preprocesses text and extracts a set of skills from it.
    - text: The raw text to process (from a job description or CV).
    - skills_list: The master list of skills to search for.
    """
    # Create a spaCy doc object
    doc = nlp(text.lower())

    # 1. Lemmatize tokens and remove stop words and punctuation
    processed_tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]

    # 2. Extract skills using regex for all skills (simplified to always use regex)
    found_skills = set()
    text_for_regex = ' '.join(processed_tokens)

    for skill in skills_list:
        pattern = r"\b" + re.escape(skill) + r"\b"
        if re.search(pattern, text_for_regex):
            found_skills.add(skill)

    return found_skills

In [10]:
candidate_skills = preprocess_and_extract_skills(cv_text, SKILLS_LIST)

print("--- Skills extracted from CV ---")
print(candidate_skills)
print("\n" + "="*50 + "\n")

--- Skills extracted from CV ---
{'git', 'sql', 'software development', 'python'}




In [11]:
def calculate_jaccard_similarity(set1, set2):
    """Calculates the Jaccard similarity between two sets."""
    if not set1 and not set2:
        return 0.0

    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0

# Compute similarity scores (fast vectorized apply on sets) with progress
tqdm.pandas(desc="Calculating similarity scores")
df2['similarity_score'] = df2['processed_skills'].progress_apply(
    lambda job_skills: calculate_jaccard_similarity(candidate_skills, job_skills)
)

ranked_jobs = df2.sort_values(by='similarity_score', ascending=False)

print("--- Top 3 Job Matches ---")
print(ranked_jobs[['Job Title', 'similarity_score']].head(3))
print("\n" + "="*50 + "\n")

Calculating similarity scores: 100%|██████████| 161594/161594 [00:00<00:00, 869973.19it/s]


--- Top 3 Job Matches ---
                      Job Title  similarity_score
1510034  Database Administrator               0.4
1142854  Database Administrator               0.4
575833   Database Administrator               0.4




In [12]:
def calculate_weighted_similarity(cv_skills, job_skills, testing_skills, general_skills):
    """
    Calculates a weighted similarity score, prioritizing testing skills for Software Tester roles.
    Testing skills get 2x weight; general skills get 1x weight.
    """
    if not cv_skills and not job_skills:
        return 0.0

    # Calculate intersections for testing and general skills
    testing_intersection = len(cv_skills.intersection(job_skills & set(testing_skills)))
    general_intersection = len(cv_skills.intersection(job_skills & set(general_skills)))
    
    # Apply weights: 2 for testing skills, 1 for general skills
    weighted_intersection = (2 * testing_intersection) + general_intersection
    
    # Union of all skills
    union = len(cv_skills.union(job_skills))
    
    return weighted_intersection / union if union != 0 else 0.0

# Filter jobs by tester-related keywords in Job Title or Role
df2['is_tester_role'] = df2['Job Title'].str.lower().apply(
    lambda x: any(keyword in x for keyword in TESTER_KEYWORDS)
) | df2['Role'].str.lower().apply(
    lambda x: any(keyword in x for keyword in TESTER_KEYWORDS)
)

# Only process tester-related jobs to reduce computation
tester_jobs = df2[df2['is_tester_role']].copy()

# Compute similarity scores with progress
tqdm.pandas(desc="Calculating similarity scores for tester roles")
tester_jobs['similarity_score'] = tester_jobs['processed_skills'].progress_apply(
    lambda job_skills: calculate_weighted_similarity(candidate_skills, job_skills, TESTING_SKILLS, GENERAL_SKILLS)
)

# Sort by similarity score
ranked_jobs = tester_jobs.sort_values(by='similarity_score', ascending=False)

print("--- Top 3 Job Matches for Software Tester ---")
print(ranked_jobs[['Job Title', 'similarity_score']].head(3))
print("\n" + "="*50 + "\n")

Calculating similarity scores for tester roles: 100%|██████████| 4911/4911 [00:00<00:00, 348178.28it/s]

--- Top 3 Job Matches for Software Tester ---
          Job Title  similarity_score
461143   QA Analyst              0.25
1455722  QA Analyst              0.25
1095929  QA Analyst              0.25







In [13]:
if not ranked_jobs.empty:
    best_match = ranked_jobs.iloc[0]
    best_match_skills = best_match['processed_skills']

    missing_skills = best_match_skills - candidate_skills

    print(f"--- Skill Gap Analysis for the Top Job: '{best_match['Job Title']}' ---")
    print(f"Required Skills for this Job: {best_match_skills}")
    print(f"Candidate's Skills: {candidate_skills}")

    if missing_skills:
        print(f"\nMissing Skills to learn for this role: {missing_skills}")
    else:
        print("\nCongratulations! The candidate possesses all the required skills for this role.")
else:
    print("No matching jobs found.")

--- Skill Gap Analysis for the Top Job: 'QA Analyst' ---
Required Skills for this Job: {'python'}
Candidate's Skills: {'git', 'sql', 'software development', 'python'}

Congratulations! The candidate possesses all the required skills for this role.


In [14]:
df2['Job Title'].value_counts()

Job Title
UX/UI Designer                  4855
Digital Marketing Specialist    2797
Software Engineer               2763
Network Engineer                2439
Software Tester                 2095
                                ... 
QA Engineer                      344
Personal Assistant               344
Procurement Coordinator          342
Key Account Manager              341
Inventory Analyst                334
Name: count, Length: 147, dtype: int64

In [15]:
df2['processed_skills'].iloc[30]

{'communication',
 'critical thinking',
 'data analysis',
 'power bi',
 'python',
 'tableau'}

In [16]:
def convert_sets_to_lists(dictionary):
    return {key: list(value) for key, value in dictionary.items()}

# Function to convert lists back to sets after loading JSON
def convert_lists_to_sets(dictionary):
    return {key: set(value) for key, value in dictionary.items()}

In [17]:
import json
json_file = "skillsModel1.json"
#load json
with open(json_file, 'r') as f:
        job_skills_dict = convert_lists_to_sets(json.load(f))

In [18]:
# Create dictionary mapping Job Title to processed_skills
job_skills_dict = dict(zip(df2['Job Title'], df2['processed_skills']))
print("Created job skills dictionary with", len(job_skills_dict), "unique job titles.")
with open(json_file, 'w') as f:
        json.dump(convert_sets_to_lists(job_skills_dict), f, indent=4)


Created job skills dictionary with 147 unique job titles.


In [19]:
job_skills_dict

{'Speech Therapist': {'communication'},
 'Architectural Designer': set(),
 'Electrical Engineer': set(),
 'Account Manager': {'communication'},
 'Purchasing Agent': set(),
 'Customer Service Manager': {'leadership'},
 'SEM Specialist': set(),
 'Nurse Manager': {'communication'},
 'Legal Counsel': set(),
 'Legal Secretary': set(),
 'Electrical Designer': set(),
 'Physician Assistant': set(),
 'UX/UI Designer': {'adobe xd', 'sketch'},
 'Mechanical Engineer': {'system design'},
 'Procurement Manager': {'communication'},
 'Substance Abuse Counselor': set(),
 'Front-End Developer': {'css', 'html', 'javascript'},
 'Environmental Engineer': set(),
 'Database Administrator': set(),
 'Marketing Analyst': {'communication',
  'data visualization',
  'power bi',
  'python',
  'sql',
  'tableau'},
 'Data Entry Clerk': set(),
 'Research Analyst': {'communication', 'critical thinking'},
 'Quality Assurance Analyst': set(),
 'Civil Engineer': set(),
 'Systems Administrator': {'mysql', 'sql'},
 'Softwa

In [20]:
DATA_SCIENCE_SKILLS = [
    'python', 'r', 'sql', 'pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn',
    'machine learning', 'data analysis', 'data visualization', 'power bi', 'data mining',
    'statistical analysis', 'data modeling', 'linear algebra', 'probability', 'cloud computing',
    'data science', 'mysql'
]
GENERAL_SKILLS = [
    'java', 'c++', 'git', 'agile', 'scrum', 'problem solving', 'communication',
    'software development', 'database', 'excel', 'google sheets'
]
SKILLS_LIST = DATA_SCIENCE_SKILLS + GENERAL_SKILLS

# Role keywords for filtering data science-related jobs
DATA_SCIENCE_KEYWORDS = ['data scientist', 'data analyst', 'machine learning engineer', 'data engineer']

In [21]:
def preprocess_and_extract_skills(text, skills_list):
    """
    Preprocesses text and extracts a set of skills from it.
    """
    doc = nlp(text.lower())
    processed_tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    text_for_regex = ' '.join(processed_tokens)
    found_skills = set()
    for skill in skills_list:
        pattern = r"\b" + re.escape(skill) + r"\b"
        if re.search(pattern, text_for_regex):
            found_skills.add(skill)
    return found_skills

candidate_skills = preprocess_and_extract_skills(cv_text, SKILLS_LIST)

print("--- Skills extracted from CV ---")
print(candidate_skills)
print("\n" + "="*50 + "\n")


--- Skills extracted from CV ---
{'linear algebra', 'mysql', 'matplotlib', 'seaborn', 'data science', 'git', 'statistical analysis', 'probability', 'software development', 'database', 'python', 'power bi', 'excel', 'machine learning', 'numpy', 'sql', 'data analysis'}




In [22]:
def calculate_weighted_similarity(cv_skills, job_skills, data_science_skills, general_skills):
    """
    Calculates a weighted similarity score, prioritizing data science skills.
    """
    if not cv_skills or not job_skills:
        return 0.0
    data_science_intersection = len(cv_skills.intersection(job_skills & set(data_science_skills)))
    general_intersection = len(cv_skills.intersection(job_skills & set(general_skills)))
    weighted_intersection = (2 * data_science_intersection) + general_intersection
    union = len(cv_skills.union(job_skills))
    return weighted_intersection / union if union != 0 else 0.0

# Create a DataFrame from job_skills_dict for similarity scoring
job_similarity = []
for job_title, job_skills in tqdm(job_skills_dict.items(), desc="Calculating similarity scores"):
    # Only consider data science-related jobs
    if any(keyword in job_title.lower() for keyword in DATA_SCIENCE_KEYWORDS):
        similarity = calculate_weighted_similarity(candidate_skills, job_skills, DATA_SCIENCE_SKILLS, GENERAL_SKILLS)
        job_similarity.append({'Job Title': job_title, 'similarity_score': similarity, 'job_skills': job_skills})

# Convert to DataFrame and sort
similarity_df = pd.DataFrame(job_similarity)
ranked_jobs = similarity_df.sort_values(by='similarity_score', ascending=False)

print("--- Top 3 Job Matches for Data Scientist ---")
print(ranked_jobs[['Job Title', 'similarity_score']].head(3))
print("\n" + "="*50 + "\n")


Calculating similarity scores: 100%|██████████| 147/147 [00:00<00:00, 495231.07it/s]

--- Top 3 Job Matches for Data Scientist ---
        Job Title  similarity_score
1  Data Scientist          0.222222
0   Data Engineer          0.117647
2    Data Analyst          0.090909







In [23]:
if not ranked_jobs.empty:
    best_match = ranked_jobs.iloc[0]
    best_match_skills = best_match['job_skills']

    missing_skills = best_match_skills - candidate_skills

    print(f"--- Skill Gap Analysis for the Top Job: '{best_match['Job Title']}' ---")
    print(f"Required Skills for this Job: {best_match_skills}")
    print(f"Candidate's Skills: {candidate_skills}")

    if missing_skills:
        print(f"\nMissing Skills to learn for this role: {missing_skills}")
    else:
        print("\nCongratulations! The candidate possesses all the required skills for this role.")
else:
    print("No matching data science jobs found.")

--- Skill Gap Analysis for the Top Job: 'Data Scientist' ---
Required Skills for this Job: {'machine learning', 'deep learning', 'python'}
Candidate's Skills: {'linear algebra', 'mysql', 'matplotlib', 'seaborn', 'data science', 'git', 'statistical analysis', 'probability', 'software development', 'database', 'python', 'power bi', 'excel', 'machine learning', 'numpy', 'sql', 'data analysis'}

Missing Skills to learn for this role: {'deep learning'}


In [24]:
df3 = pd.read_pickle('/home/jax/CVreviewArabian/model/preprocessed_jobs.pkl')

In [25]:
df3.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'latitude', 'longitude', 'Work Type', 'Company Size', 'Preference',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'combined_text', 'body_skills',
       'title_skills', 'processed_skills'],
      dtype='object')