In [38]:
import pandas as pd
import re
import string
import spacy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [39]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

In [40]:

# ---------- Parameters ----------
DATASET_PATH = "resume_data_Non_IT_5000_updated.csv"
SKILL_LIST = ['communication', 'excel', 'salesforce', 'customer support', 'team management', 'public speaking']
COURSE_MODEL_NAME = "NonIT_course_model.pkl"
CERT_MODEL_NAME = "NonIT_cert_model.pkl"
TFIDF_NAME = "NonIT_coursecert_tfidf.pkl"

In [41]:
# ---------- Text Preprocessing ----------
def clean_text(text):
    text = re.sub(r"<[^>]+>", "", str(text))
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

def extract_skills(text, skill_list):
    return [skill for skill in skill_list if skill in text]

def get_missing_skills(resume_skills, required_skills_str):
    required_skills = [skill.strip().lower() for skill in str(required_skills_str).split(',')]
    return list(set(required_skills) - set(resume_skills))

In [42]:
# ---------- Load and Process Data ----------
df = pd.read_csv(DATASET_PATH)

In [43]:
# Fill missing text columns
df["Keywords"] = df["Keywords"].fillna("")
df["Technologies"] = df["Technologies"].fillna("")
df["Skills"] = df["Skills"].fillna("")
df["Courses"] = df["Courses"].fillna("")
df["Certifications"] = df["Certifications"].fillna("")

In [44]:
# Combine text from resume fields
df["combined"] = df["Keywords"] + " " + df["Technologies"]
df["cleaned"] = df["combined"].apply(clean_text).apply(lemmatize_text)

In [45]:
# Extract skills from resume
df["skills_extracted"] = df["cleaned"].apply(lambda x: extract_skills(x, SKILL_LIST))

In [46]:
# Compute missing skills using required skills in 'Skills' column
df["missing_skills"] = df.apply(lambda row: get_missing_skills(row["skills_extracted"], row["Skills"]), axis=1)
df["missing_skills_str"] = df["missing_skills"].apply(lambda x: " ".join(x))

In [47]:
# TF-IDF feature extraction from missing skills
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["missing_skills_str"])

In [48]:
# ----------------- Course Model -----------------
y_course = df["Courses"]
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_course, test_size=0.2, random_state=42)

course_model = LogisticRegression(max_iter=200)
course_model.fit(X_train_c, y_train_c)
y_pred_c = course_model.predict(X_test_c)
print("Course Recommender Evaluation:\n", classification_report(y_test_c, y_pred_c))

Course Recommender Evaluation:
                                                   precision    recall  f1-score   support

                           Advanced Excel Course       0.14      0.31      0.20       163
Advanced Excel Course, Customer Service Training       0.00      0.00      0.00       175
                       Customer Service Training       0.00      0.00      0.00       153
                             Salesforce Training       0.18      0.65      0.29       186
      Salesforce Training, Advanced Excel Course       0.00      0.00      0.00       169
  Salesforce Training, Customer Service Training       0.00      0.00      0.00       154

                                        accuracy                           0.17      1000
                                       macro avg       0.05      0.16      0.08      1000
                                    weighted avg       0.06      0.17      0.09      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
# ----------------- Certification Model -----------------
y_cert = df["Certifications"]
X_train_ct, X_test_ct, y_train_ct, y_test_ct = train_test_split(X, y_cert, test_size=0.2, random_state=42)

cert_model = LogisticRegression(max_iter=200)
cert_model.fit(X_train_ct, y_train_ct)
y_pred_ct = cert_model.predict(X_test_ct)
print("Certification Recommender Evaluation:\n", classification_report(y_test_ct, y_pred_ct))

Certification Recommender Evaluation:
                                                precision    recall  f1-score   support

                      Customer Service Expert       0.00      0.00      0.00       153
      Customer Service Expert, Excel Advanced       0.00      0.00      0.00        92
Customer Service Expert, Salesforce Certified       0.00      0.00      0.00        77
                               Excel Advanced       0.14      0.31      0.20       163
      Excel Advanced, Customer Service Expert       0.00      0.00      0.00        83
         Excel Advanced, Salesforce Certified       0.00      0.00      0.00        89
                         Salesforce Certified       0.18      0.65      0.29       186
Salesforce Certified, Customer Service Expert       0.00      0.00      0.00        77
         Salesforce Certified, Excel Advanced       0.00      0.00      0.00        80

                                     accuracy                           0.17      1000
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
# ----------------- Save Models -----------------
joblib.dump(course_model, COURSE_MODEL_NAME)
joblib.dump(cert_model, CERT_MODEL_NAME)
joblib.dump(tfidf, TFIDF_NAME)

['NonIT_coursecert_tfidf.pkl']