In [34]:
import pandas as pd
import re
import string
import spacy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [35]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")


In [36]:
# ---------- Parameters ----------
DATASET_PATH = "resume_data_IT_5000_updated.csv"  # Change for Non-IT
SKILL_LIST = ['python', 'java', 'sql', 'excel', 'tableau', 'html', 'css', 'communication', 'teamwork']
COURSE_MODEL_NAME = "IT_course_model.pkl"
CERT_MODEL_NAME = "IT_cert_model.pkl"
TFIDF_NAME = "IT_coursecert_tfidf.pkl"

In [37]:
# ---------- Text Preprocessing ----------
def clean_text(text):
    text = re.sub(r"<[^>]+>", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

def extract_skills(text, skill_list):
    return [skill for skill in skill_list if skill in text]

def get_missing_skills(resume_skills, required_skills_str):
    required_skills = [skill.strip().lower() for skill in required_skills_str.split(',')]
    return list(set(required_skills) - set(resume_skills))

In [38]:
# ---------- Load Data ----------
df = pd.read_csv(DATASET_PATH)

df["combined"] = df["Keywords"].fillna('') + " " + df["Technologies"].fillna('')
df["cleaned"] = df["combined"].apply(clean_text).apply(lemmatize_text)
df["skills_extracted"] = df["cleaned"].apply(lambda x: extract_skills(x, SKILL_LIST))

In [39]:
# ---------- Compute Missing Skills ----------
df["missing_skills"] = df.apply(lambda row: get_missing_skills(row["skills_extracted"], row["Skills"]), axis=1)
df["missing_skills_str"] = df["missing_skills"].apply(lambda x: " ".join(x))

In [40]:
# ---------- TF-IDF Feature Extraction ----------
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["missing_skills_str"])

In [41]:
# ---------- Courses Model ----------
y_course = df["Courses"]
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_course, test_size=0.2, random_state=42)

course_model = LogisticRegression(max_iter=200)
course_model.fit(X_train_c, y_train_c)
y_pred_c = course_model.predict(X_test_c)
print("Course Recommender Evaluation:\n", classification_report(y_test_c, y_pred_c))

Course Recommender Evaluation:
                                                 precision    recall  f1-score   support

                                  AWS Training       0.19      0.13      0.15       174
              AWS Training, Azure Fundamentals       0.21      0.30      0.24       193
      AWS Training, Google Cloud Certification       0.12      0.06      0.08       179
                            Azure Fundamentals       0.11      0.10      0.10       142
Azure Fundamentals, Google Cloud Certification       0.18      0.23      0.20       150
                    Google Cloud Certification       0.16      0.19      0.17       162

                                      accuracy                           0.17      1000
                                     macro avg       0.16      0.17      0.16      1000
                                  weighted avg       0.16      0.17      0.16      1000



In [42]:
# ---------- Certifications Model ----------
y_cert = df["Certifications"]
X_train_ct, X_test_ct, y_train_ct, y_test_ct = train_test_split(X, y_cert, test_size=0.2, random_state=42)

cert_model = LogisticRegression(max_iter=200)
cert_model.fit(X_train_ct, y_train_ct)
y_pred_ct = cert_model.predict(X_test_ct)
print("Certification Recommender Evaluation:\n", classification_report(y_test_ct, y_pred_ct))

Certification Recommender Evaluation:
                                           precision    recall  f1-score   support

                           AWS Certified       0.17      0.26      0.21       174
       AWS Certified, Azure Fundamentals       0.00      0.00      0.00       104
     AWS Certified, Google Data Engineer       0.00      0.00      0.00        87
                      Azure Fundamentals       0.09      0.13      0.11       142
       Azure Fundamentals, AWS Certified       0.00      0.00      0.00        89
Azure Fundamentals, Google Data Engineer       0.00      0.00      0.00        81
                    Google Data Engineer       0.14      0.45      0.21       162
     Google Data Engineer, AWS Certified       0.00      0.00      0.00        92
Google Data Engineer, Azure Fundamentals       0.00      0.00      0.00        69

                                accuracy                           0.14      1000
                               macro avg       0.04      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
# ---------- Save Models ----------
joblib.dump(course_model, COURSE_MODEL_NAME)
joblib.dump(cert_model, CERT_MODEL_NAME)
joblib.dump(tfidf, TFIDF_NAME)

['IT_coursecert_tfidf.pkl']