<a href="https://colab.research.google.com/github/EternalKnight002/Drummusic/blob/main/ResumeScreeningModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install joblib PyPDF2 scikit-learn nltk
import nltk
nltk.download('stopwords')


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import os
import re
import joblib
import pandas as pd
from typing import List, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords

# stopwords
_stop_words = set(stopwords.words('english'))

def clean_text(text: str, remove_stopwords: bool = True) -> str:
    """Lightweight cleaning: remove URLs, non-alphanumeric chars, lowercase and optionally remove stopwords."""
    if not isinstance(text, str):
        text = str(text or "")
    text = re.sub(r'https?://\S+|www\.\S+|\S+@\S+', ' ', text)  # URLs & emails
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # special chars
    text = text.lower()
    tokens = text.split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in _stop_words]
    return " ".join(tokens)


In [5]:
def train_from_csv(csv_path: str,
                   text_column: str = "Skills",
                   label_column: str = "Recruiter Decision",
                   max_features: int = 5000,
                   output_dir: str = "./artifacts"):

    print(f"[train] loading CSV: {csv_path}")
    df = pd.read_csv(csv_path)

    if text_column not in df.columns or label_column not in df.columns:
        raise ValueError(f"CSV must contain {text_column}, {label_column}. Found: {df.columns.tolist()}")

    print("[train] cleaning text...")
    df['cleaned_text'] = df[text_column].fillna("").apply(clean_text)

    print("[train] vectorizing...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(df['cleaned_text'])
    y = df[label_column].astype(str)

    print("[train] splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y)

    print("[train] training LogisticRegression...")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"[train] accuracy on test set: {acc:.4f}")

    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, "skill_match_model.joblib")
    vectorizer_path = os.path.join(output_dir, "tfidf_vectorizer.joblib")

    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    print(f"[train] saved model -> {model_path}")
    print(f"[train] saved vectorizer -> {vectorizer_path}")
    return model, vectorizer


In [6]:
import PyPDF2

def rank_resumes_against_job_desc(vectorizer,
                                  model,
                                  job_description: str,
                                  resumes_dir: str) -> List[Dict]:

    cleaned_job = clean_text(job_description)
    job_vec = vectorizer.transform([cleaned_job])

    results = []
    for fname in sorted(os.listdir(resumes_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(resumes_dir, fname)
        try:
            with open(path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                text = ""
                for p in reader.pages:
                    text += (p.extract_text() or "")
        except Exception as e:
            print(f"[rank] failed to read {fname}: {e}")
            continue

        cleaned_resume = clean_text(text)
        resume_vec = vectorizer.transform([cleaned_resume])
        score = float(cosine_similarity(resume_vec, job_vec)[0, 0])
        results.append({"filename": fname, "score": score})

    results = sorted(results, key=lambda x: x['score'], reverse=True)
    return results


In [12]:

from google.colab import drive
import os
import shutil
import sys


try:
    drive.mount('/content/drive', force_remount=False)
except Exception as e:
    print("Warning: could not mount Drive automatically:", e)


FILENAME = "AI_Resume_Screening.csv"
possible_paths = [
    f"/content/drive/MyDrive/{FILENAME}",
    f"/content/drive/MyDrive/datasets/{FILENAME}",
    f"/content/drive/MyDrive/dataset/{FILENAME}",
    f"/content/drive/MyDrive/data/{FILENAME}",
    f"/content/drive/MyDrive/datasets/ai/{FILENAME}",
    f"/content/drive/MyDrive/Projects/{FILENAME}",
    f"/content/drive/MyDrive/AI/{FILENAME}",
    f"/content/{FILENAME}",
]

csv_found = None
for p in possible_paths:
    if os.path.exists(p):
        csv_found = p
        break


if csv_found is None and os.path.exists("/content/drive/MyDrive"):
    print("[search] CSV not in common paths, searching /content/drive/MyDrive (this may take a few seconds)...")
    for root, dirs, files in os.walk("/content/drive/MyDrive"):
        if FILENAME in files:
            csv_found = os.path.join(root, FILENAME)
            print(f"[search] Found CSV at: {csv_found}")
            break

if csv_found is None:
    raise FileNotFoundError(
        f"Could not find {FILENAME} in Drive or /content. Please upload it to Drive or use files.upload() and set csv_for_training accordingly."
    )

print("[info] using CSV:", csv_found)


local_copy = f"/content/{FILENAME}"
if not os.path.exists(local_copy):
    print(f"[copy] copying {csv_found} -> {local_copy} ...")
    shutil.copyfile(csv_found, local_copy)
else:
    print("[copy] local copy already exists:", local_copy)

csv_for_training = local_copy


print("[train-start] training from:", csv_for_training)
try:
    model, vectorizer = train_from_csv(csv_for_training)
except Exception as e:
    print("[train-error] Training failed:", e)
    raise

print("[train-done] Model and vectorizer created in-session.")


drive_artifacts_dir = "/content/drive/MyDrive/skillmatch_artifacts"
try:
    if os.path.exists("./artifacts"):
        print(f"[save] copying ./artifacts -> {drive_artifacts_dir}")
        shutil.copytree("./artifacts", drive_artifacts_dir, dirs_exist_ok=True)
        print("[save] Artifacts copied to Drive:", drive_artifacts_dir)
    else:
        print("[save] No ./artifacts directory found to copy.")
except Exception as e:
    print("[save-warning] Could not copy artifacts to Drive:", e)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[info] using CSV: /content/drive/MyDrive/AI_Resume_Screening.csv
[copy] local copy already exists: /content/AI_Resume_Screening.csv
[train-start] training from: /content/AI_Resume_Screening.csv
[train] loading CSV: /content/AI_Resume_Screening.csv
[train] cleaning text...
[train] vectorizing...
[train] splitting data...
[train] training LogisticRegression...
[train] accuracy on test set: 0.8100
[train] saved model -> ./artifacts/skill_match_model.joblib
[train] saved vectorizer -> ./artifacts/tfidf_vectorizer.joblib
[train-done] Model and vectorizer created in-session.
[save] copying ./artifacts -> /content/drive/MyDrive/skillmatch_artifacts
[save] Artifacts copied to Drive: /content/drive/MyDrive/skillmatch_artifacts


In [14]:
from google.colab import files

# Upload a job description text file
uploaded = files.upload()
job_desc_path = list(uploaded.keys())[0]

# Upload resumes (one or more PDFs) into a directory
!mkdir -p resumes
uploaded = files.upload()
for k, v in uploaded.items():
    with open(os.path.join("resumes", k), "wb") as f:
        f.write(v)


Saving Job_desc_java_dev.txt to Job_desc_java_dev.txt


Saving Resume_JavaDeveloper_4.pdf to Resume_JavaDeveloper_4.pdf
Saving Resume_JavaDeveloper_5.pdf to Resume_JavaDeveloper_5.pdf
Saving Resume_DataAnalyst_6.pdf to Resume_DataAnalyst_6.pdf
Saving Resume_JavaDeveloper_3.pdf to Resume_JavaDeveloper_3.pdf
Saving Resume_DataAnalyst_1.pdf to Resume_DataAnalyst_1.pdf
Saving Resume_DataAnalyst_2.pdf to Resume_DataAnalyst_2.pdf
Saving Resume_DataAnalyst_4.pdf to Resume_DataAnalyst_4.pdf
Saving Resume_DataAnalyst_5.pdf to Resume_DataAnalyst_5.pdf
Saving Resume_JavaDeveloper_1.pdf to Resume_JavaDeveloper_1.pdf
Saving Resume_DataAnalyst_3.pdf to Resume_DataAnalyst_3.pdf
Saving Resume2_DataAnalyst_Experienced.pdf to Resume2_DataAnalyst_Experienced.pdf
Saving Resume4_JavaDeveloper_Experienced.pdf to Resume4_JavaDeveloper_Experienced.pdf
Saving Resume1_DataAnalyst_EntryLevel.pdf to Resume1_DataAnalyst_EntryLevel.pdf
Saving Resume3_JavaDeveloper_EntryLevel.pdf to Resume3_JavaDeveloper_EntryLevel.pdf


In [15]:
with open(job_desc_path, "r", encoding="utf-8") as f:
    job_description = f.read()

results = rank_resumes_against_job_desc(vectorizer, model, job_description, "resumes")
df_results = pd.DataFrame(results)
df_results['match_percentage'] = (df_results['score'] * 100).round(2)
df_results


Unnamed: 0,filename,score,match_percentage
0,Resume3_JavaDeveloper_EntryLevel.pdf,0.985068,98.51
1,Resume4_JavaDeveloper_Experienced.pdf,0.985068,98.51
2,Resume_JavaDeveloper_1.pdf,0.985068,98.51
3,Resume_JavaDeveloper_3.pdf,0.985068,98.51
4,Resume_JavaDeveloper_4.pdf,0.985068,98.51
5,Resume_JavaDeveloper_5.pdf,0.985068,98.51
6,Resume1_DataAnalyst_EntryLevel.pdf,0.0,0.0
7,Resume2_DataAnalyst_Experienced.pdf,0.0,0.0
8,Resume_DataAnalyst_1.pdf,0.0,0.0
9,Resume_DataAnalyst_2.pdf,0.0,0.0


In [16]:
df_results.to_csv("ranked_resumes.csv", index=False)
files.download("ranked_resumes.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>