In [4]:
import PyPDF2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

def extract_skills(text, keywords):
    """Find each keyword in ``text`` together with up to two following words.

    Args:
        text: String to search. Matching is case-insensitive.
        keywords: Iterable of literal keyword strings. They may contain
            spaces or regex metacharacters (for example ``"c++"``).

    Returns:
        A list of matched phrases, grouped by keyword in the order given and,
        within a keyword, in order of appearance. Each entry is the keyword as
        it appears in ``text`` followed by at most two whitespace-separated
        words, with surrounding whitespace stripped.
    """
    skills = []
    for keyword in keywords:
        if not keyword:
            # An empty keyword would match at almost every position.
            continue
        # re.escape keeps the keyword literal. The lookarounds behave like
        # word boundaries but also work for keywords that start or end with a
        # non-word character, and they stop "java" matching inside
        # "javascript". The trailing group is non-capturing on purpose: with a
        # capturing group re.findall returns only the group's last repetition
        # and silently drops the keyword itself.
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)\s*(?:\w+\s*){{0,2}}"
        skills.extend(
            match.group(0).strip()
            for match in re.finditer(pattern, text, re.IGNORECASE)
        )
    return skills

def preprocess_text(text):
    """Normalise text by removing punctuation and lower-casing it.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; whitespace itself is left as is.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with punctuation removed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    normalised = without_punctuation.lower()
    return normalised

def train_classifier(X, y):
    """Fit a TF-IDF vectorizer and a linear-kernel SVC on labelled texts.

    Args:
        X: Training documents passed to ``TfidfVectorizer.fit_transform``.
        y: Labels passed to ``SVC.fit``, one per document in ``X``.

    Returns:
        A ``(vectorizer, classifier)`` tuple holding the fitted
        ``TfidfVectorizer`` and the fitted ``SVC``.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(X)
    # SVC.fit returns the fitted estimator itself, so construction and
    # fitting can be chained.
    model = SVC(kernel='linear').fit(features, y)
    return tfidf, model

def analyze_cv_with_ml(pdf_path, vectorizer, classifier, keywords):
    """Predict a job role for the CV stored in a PDF file.

    The PDF text is extracted and preprocessed, skill phrases are pulled out
    with ``extract_skills``, and only those phrases (joined by spaces) are
    vectorised and handed to the classifier.

    Args:
        pdf_path: Path of the CV PDF to analyse.
        vectorizer: Fitted object exposing ``transform``.
        classifier: Fitted object exposing ``predict``.
        keywords: Skill keywords forwarded to ``extract_skills``.

    Returns:
        The first (and only) prediction returned by ``classifier.predict``.
    """
    cleaned_cv = preprocess_text(extract_text_from_pdf(pdf_path))
    matched_skills = extract_skills(cleaned_cv, keywords)
    # The model sees a single document: the skill phrases joined by spaces.
    features = vectorizer.transform([" ".join(matched_skills)])
    predictions = classifier.predict(features)
    return predictions[0]

def extract_text_from_pdf(pdf_path):
    """Read a PDF file and return the text of all of its pages.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of every page, in page order, joined by newlines.
        An empty string is returned for a document with no pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open. Pages are
        # joined with a newline so the last word of one page is not fused
        # with the first word of the next, which would defeat word-boundary
        # keyword matching downstream. `or ""` guards against a page that
        # yields no text.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


# Placeholder training set: each record pairs a CV's text with its job-role label.
labeled_data = [
    {
        "cv_text": "Text from CV 1...",
        "job_role": "Data Scientist",
    },
    {
        "cv_text": "Text from CV 2...",
        "job_role": "Software Engineer",
    },
    {
        "cv_text": "Text from CV 3...",
        "job_role": "Web Developer",
    },
]

# Keywords that extract_skills looks for in the CV text.
skill_keywords = [
    "python", "java", "javascript", "html",
    "css", "sql", "machine learning", "deep learning",
]

# Split the labelled records into parallel lists of documents and labels.
X_train = [record["cv_text"] for record in labeled_data]
y_train = [record["job_role"] for record in labeled_data]

# Fit the TF-IDF vectorizer and the classifier on the labelled data.
vectorizer, classifier = train_classifier(X_train, y_train)

# Persist both fitted objects to the working directory.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(classifier, 'classifier.pkl')

# CV to classify -- replace with the path of the PDF to analyse.
cv_file = "/content/CV-ANISH DAS.pdf"

job_role = analyze_cv_with_ml(
    cv_file,
    vectorizer,
    classifier,
    skill_keywords,
)
print("Predicted Job Role:", job_role)


Predicted Job Role: Web Developer


In [2]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 2.2 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
