In [7]:
# =========================================
# Skill Gap Analyzer
# 07_skillgap_cleaning.ipynb
# Cell 1: Load Data & Basic Text Cleaning
# =========================================

import pandas as pd
import re

# Load datasets
job_df = pd.read_csv("monster_com-job_sample.csv")
resume_df = pd.read_csv("UpdatedResumeDataSet.csv")

print("Initial Shapes:")
print("Job data:", job_df.shape)
print("Resume data:", resume_df.shape)

# Keep only required columns
job_df = job_df[["job_title", "job_description"]].dropna()
resume_df = resume_df[["Category", "Resume"]].dropna()

print("\nAfter column selection:")
print("Job data:", job_df.shape)
print("Resume data:", resume_df.shape)

# Basic text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Apply cleaning
job_df["clean_description"] = job_df["job_description"].apply(clean_text)
resume_df["clean_resume"] = resume_df["Resume"].apply(clean_text)

print("\nSample cleaned job text:\n")
print(job_df["clean_description"].iloc[0][:500])

print("\nSample cleaned resume text:\n")
print(resume_df["clean_resume"].iloc[0][:500])



Initial Shapes:
Job data: (22000, 14)
Resume data: (962, 2)

After column selection:
Job data: (22000, 2)
Resume data: (962, 2)

Sample cleaned job text:

teamsoft is seeing an it support specialist to join our client in madison wi the ideal candidate must have at least years of experience in the field they need to be familiar with a variety of the field s concepts practices and procedures as this position relies on extensive experience and judgment to plan and accomplish goals required skills call tracking software phone based technical support problem documentation and communication remote desktop management tools respond to customer requests ge

Sample cleaned resume text:

skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm na ve bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic m

In [8]:
# =========================================
# Cell 2: Stopwords Removal
# =========================================

import nltk
from nltk.corpus import stopwords

# Download stopwords (first time only)
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# Remove stopwords function
def remove_stopwords(text):
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply stopwords removal
job_df["clean_description_nostop"] = job_df["clean_description"].apply(remove_stopwords)
resume_df["clean_resume_nostop"] = resume_df["clean_resume"].apply(remove_stopwords)

print("Sample job text after stopword removal:\n")
print(job_df["clean_description_nostop"].iloc[0][:500])

print("\nSample resume text after stopword removal:\n")
print(resume_df["clean_resume_nostop"].iloc[0][:500])



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Sample job text after stopword removal:

teamsoft seeing support specialist join client madison wi ideal candidate must least years experience field need familiar variety field concepts practices procedures position relies extensive experience judgment plan accomplish goals required skills call tracking software phone based technical support problem documentation communication remote desktop management tools respond customer requests general understanding landesk microsoft office suitefind teamsoft madison area technology leader consul

Sample resume text after stopword removal:

skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm na bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch

In [9]:
# =========================================
# Cell 3: Skill Keyword Extraction
# =========================================

# Common technical skill vocabulary (expandable later)
skill_keywords = [
    "python", "java", "sql", "javascript", "html", "css",
    "machine learning", "deep learning", "data analysis",
    "pandas", "numpy", "scikit", "sklearn", "tensorflow",
    "keras", "docker", "git", "linux", "aws", "azure",
    "nlp", "computer vision", "tableau", "power bi",
    "excel", "spark", "hadoop"
]

# Function to extract skills from text
def extract_skills(text, skill_list):
    found = set()
    for skill in skill_list:
        if skill in text:
            found.add(skill)
    return list(found)

# Apply extraction
job_df["job_skills"] = job_df["clean_description_nostop"].apply(
    lambda x: extract_skills(x, skill_keywords)
)

resume_df["resume_skills"] = resume_df["clean_resume_nostop"].apply(
    lambda x: extract_skills(x, skill_keywords)
)

print("Sample job extracted skills:")
print(job_df["job_skills"].iloc[0])

print("\nSample resume extracted skills:")
print(resume_df["resume_skills"].iloc[0])


Sample job extracted skills:
[]

Sample resume extracted skills:
['machine learning', 'numpy', 'sql', 'tableau', 'git', 'pandas', 'css', 'java', 'python', 'html', 'deep learning', 'scikit', 'computer vision', 'docker', 'javascript']


In [10]:
# =========================================
# Cell 4: Improved Skill Extraction for Jobs
# =========================================

# Expanded & normalized skill vocabulary
skill_keywords = [
    # programming
    "python", "java", "c++", "c#", "sql", "javascript",
    # data / ml
    "machine learning", "deep learning", "data analysis",
    "data science", "nlp", "computer vision",
    # libraries
    "pandas", "numpy", "scikit", "sklearn",
    # tools
    "git", "docker", "linux", "aws", "azure",
    "tableau", "power bi", "excel",
    # it / support
    "microsoft office", "remote desktop",
    "technical support", "networking", "troubleshooting"
]

def extract_skills_loose(text, skill_list):
    found = set()
    for skill in skill_list:
        # match partial phrases as well
        if any(word in text for word in skill.split()):
            if skill in text:
                found.add(skill)
    return list(found)

# Re-extract job skills
job_df["job_skills"] = job_df["clean_description_nostop"].apply(
    lambda x: extract_skills_loose(x, skill_keywords)
)

print("Improved job extracted skills (sample):")
print(job_df["job_skills"].iloc[0])


Improved job extracted skills (sample):
['microsoft office', 'technical support', 'remote desktop']


In [11]:
# =========================================
# Cell 5: Skill Gap Calculation
# =========================================

# Select one sample job & resume (for demo)
job_required_skills = set(job_df["job_skills"].iloc[0])
candidate_skills = set(resume_df["resume_skills"].iloc[0])

print("Job Required Skills:")
print(job_required_skills)

print("\nCandidate Skills:")
print(candidate_skills)

# Skill gap = skills required by job but missing in candidate
missing_skills = job_required_skills - candidate_skills

# Readiness score
if len(job_required_skills) > 0:
    readiness_score = (
        (len(job_required_skills) - len(missing_skills))
        / len(job_required_skills)
    ) * 100
else:
    readiness_score = 0

print("\n❌ Missing Skills (Skill Gap):")
print(missing_skills)

print(f"\n✅ Skill Readiness Score: {readiness_score:.2f}%")


Job Required Skills:
{'microsoft office', 'technical support', 'remote desktop'}

Candidate Skills:
{'machine learning', 'numpy', 'sql', 'tableau', 'git', 'pandas', 'css', 'java', 'python', 'html', 'deep learning', 'scikit', 'computer vision', 'docker', 'javascript'}

❌ Missing Skills (Skill Gap):
{'microsoft office', 'technical support', 'remote desktop'}

✅ Skill Readiness Score: 0.00%


In [12]:
# =========================================
# FINAL CELL: Save Cleaned Skill Gap Data
# 07_skillgap_cleaning.ipynb
# =========================================

import os

# Create cleaned data directory
os.makedirs("/content/data/cleaned", exist_ok=True)

# Save cleaned job skills data
job_df[["job_title", "job_skills"]].to_csv(
    "/content/data/cleaned/skillgap_job_cleaned.csv",
    index=False
)

# Save cleaned resume skills data
resume_df[["Category", "resume_skills"]].to_csv(
    "/content/data/cleaned/skillgap_resume_cleaned.csv",
    index=False
)

print("✅ Skill Gap cleaned datasets saved successfully!")
print("Saved files:")
print("- /content/data/cleaned/skillgap_job_cleaned.csv")
print("- /content/data/cleaned/skillgap_resume_cleaned.csv")


✅ Skill Gap cleaned datasets saved successfully!
Saved files:
- /content/data/cleaned/skillgap_job_cleaned.csv
- /content/data/cleaned/skillgap_resume_cleaned.csv
