In [1]:
# =========================================
# Skill Gap Analyzer (Optional ML)
# 07_skillgap_model.ipynb
# Cell 1: Load Feature Dataset
# =========================================

import pandas as pd

# Read the skill-gap feature table from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/skillgap_features.csv")

# Report (rows, columns) as a quick sanity check on the load.
print("Dataset shape:", df.shape)

# Keep the preview as the cell's last expression so the notebook renders it.
df.head(n=5)


Dataset shape: (100, 5)


Unnamed: 0,resume_category,job_required_skills,candidate_skills,missing_skills,readiness_score
0,Data Science,"['microsoft office', 'remote desktop', 'techni...","['sql', 'machine learning', 'deep learning', '...","['microsoft office', 'remote desktop', 'techni...",0.0
1,Data Science,"['microsoft office', 'remote desktop', 'techni...","['machine learning', 'git', 'sklearn', 'aws', ...","['microsoft office', 'remote desktop', 'techni...",0.0
2,Data Science,"['microsoft office', 'remote desktop', 'techni...","['machine learning', 'excel', 'deep learning',...","['microsoft office', 'remote desktop', 'techni...",0.0
3,Data Science,"['microsoft office', 'remote desktop', 'techni...","['machine learning', 'deep learning', 'git', '...","['microsoft office', 'remote desktop', 'techni...",0.0
4,Data Science,"['microsoft office', 'remote desktop', 'techni...","['python', 'java', 'data analysis']","['microsoft office', 'remote desktop', 'techni...",0.0


In [2]:
# =========================================
# Cell 2: Create Readiness Classes
# =========================================

def readiness_label(score, job_ready_min=70, almost_ready_min=40):
    """Map a numeric readiness score to one of three readiness classes.

    Parameters
    ----------
    score : int or float
        Readiness score to classify.
    job_ready_min : int or float, default 70
        Smallest score that is labelled "Job Ready".
    almost_ready_min : int or float, default 40
        Smallest score that is labelled "Almost Ready".

    Returns
    -------
    str
        "Job Ready" when score >= job_ready_min, "Almost Ready" when
        almost_ready_min <= score < job_ready_min, otherwise
        "Needs Upskilling".

    Raises
    ------
    ValueError
        If almost_ready_min is greater than job_ready_min, because the
        "Almost Ready" band would then be unreachable.

    Notes
    -----
    A NaN score compares False against both thresholds and is therefore
    labelled "Needs Upskilling".

    NOTE(review): the default cut-offs look like they assume a 0-100 scale.
    If readiness_score is stored as a 0-1 fraction, every row falls below 40;
    confirm the scale and pass matching thresholds if needed.
    """
    if almost_ready_min > job_ready_min:
        raise ValueError(
            "almost_ready_min must not be greater than job_ready_min"
        )

    # Guard clauses, checked from the highest band downwards.
    if score >= job_ready_min:
        return "Job Ready"
    if score >= almost_ready_min:
        return "Almost Ready"
    return "Needs Upskilling"

# Label every row by passing its readiness score through readiness_label.
df["readiness_class"] = df["readiness_score"].apply(readiness_label)

# A single newline-separated print emits the heading followed by the
# number of rows in each readiness class.
print("Class distribution:", df["readiness_class"].value_counts(), sep="\n")


Class distribution:
readiness_class
Needs Upskilling    100
Name: count, dtype: int64


In [3]:
# =========================================
# Cell 3: Feature & Target Split
# =========================================

# Feature matrix: selecting with a list keeps the result a one-column DataFrame.
# NOTE(review): readiness_class is computed directly from readiness_score when the
# labels are created, so this single feature fully determines the target — confirm
# that is intended before trusting any model accuracy.
X = df.loc[:, ["readiness_score"]]

# Target vector: the readiness class label of each row, as a Series.
y = df.loc[:, "readiness_class"]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (100, 1)
y shape: (100,)


In [4]:
# =========================================
# Cell 4: Train-Test Split
# =========================================

from sklearn.model_selection import train_test_split

# Stratified splitting requires at least two rows of every class; scikit-learn
# raises ValueError otherwise. Stratify only when that holds, and fall back to a
# plain random split (with a notice) so the cell still runs on tiny or highly
# imbalanced label distributions.
min_class_count = y.value_counts().min()
stratify_target = y if min_class_count >= 2 else None
if stratify_target is None:
    print("Stratification disabled: at least one class has fewer than 2 samples.")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # hold out 20% of the rows
    random_state=42,   # fixed seed so the split is reproducible
    stratify=stratify_target,
)

print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)

# Per-class row counts in each partition, to confirm how the labels were divided.
print("\ny_train distribution:")
print(y_train.value_counts())

print("\ny_test distribution:")
print(y_test.value_counts())


X_train shape: (80, 1)
X_test shape : (20, 1)

y_train distribution:
readiness_class
Needs Upskilling    80
Name: count, dtype: int64

y_test distribution:
readiness_class
Needs Upskilling    20
Name: count, dtype: int64


In [5]:
# =========================================
# FINAL CELL: Model Training Decision
# =========================================

# Distinct labels that actually occur in the target.
unique_classes = y.unique()

print("Unique readiness classes found:", unique_classes)

# Training is only reported as possible when two or more distinct labels exist;
# otherwise the notebook announces the rule-based fallback.
if len(unique_classes) >= 2:
    print("\nMultiple classes present. ML model training possible.")
else:
    print("\n⚠️ Only one class present in target.")
    print("ML model training skipped to avoid invalid classification.")
    print("\n✔ Skill Gap Analyzer will operate as a RULE-BASED system.")


Unique readiness classes found: ['Needs Upskilling']

⚠️ Only one class present in target.
ML model training skipped to avoid invalid classification.

✔ Skill Gap Analyzer will operate as a RULE-BASED system.
