# **Loading model and Vectorizer**

In [None]:
import joblib

In [None]:
# Load the trained model from its joblib file (the path is relative to the
# notebook's working directory).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load artifacts that come from a trusted source.
model = joblib.load('naive_bayes_resume_classifier.pkl')

# Load the TF-IDF vectorizer; presumably the one fitted alongside the model at
# training time -- TODO confirm both files come from the same training run.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Only reached when both loads above completed without raising.
print("Model and vectorizer loaded successfully!")

Model and vectorizer loaded successfully!


In [None]:
# Show the loaded model's repr as the cell output (sanity check of what was restored).
model

In [None]:
# Show the loaded vectorizer's repr as the cell output (sanity check of what was restored).
tfidf_vectorizer

# **Resume Extraction**

In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


Function to extract text from resumes

In [None]:
import os
from PyPDF2 import PdfReader

# Define the folder containing resumes
# NOTE(review): absolute Google Drive path -- assumes Drive is already mounted
# at /content/drive (no mount call is visible in this section); adjust the path
# when running outside that environment.
input_folder = "/content/drive/MyDrive/NLP/Project/Resume_pdf"

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF as one lower-cased, single-line string.

    The text of every page is collected, pages are joined with a single space
    (so the last word of one page does not fuse with the first word of the
    next), tabs and newlines are replaced by spaces, and the result is
    lower-cased.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text, or an empty string if the file could not be read
        or parsed. Callers should treat an empty/blank result as a failure;
        a non-empty sentinel message would otherwise be classified as if it
        were resume content.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` guards against a page yielding no text object at all, which
        # would otherwise abort extraction for the whole document.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as exc:
        # Best-effort by design: report the reason and signal failure with an
        # empty string instead of raising. `Exception` (not a bare except) so
        # KeyboardInterrupt / SystemExit still propagate.
        print(f"Could not extract text from {file_path}: {exc}")
        return ""

    text = " ".join(page_texts)
    text = text.replace("\t", " ").replace("\n", " ")
    return text.lower()

#### Text Preprocessing

In [None]:
import re
from nltk.corpus import stopwords


import nltk
# Fetch the NLTK stopwords corpus needed by the stopword-removal step below.
nltk.download('stopwords')

# Holds the stopword set after its first use so the corpus file is read once
# instead of on every preprocess_text call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stopwords as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Normalize raw text into a space-separated string of lower-case words.

    Steps, in order: lower-case; delete every character that is not an ASCII
    letter or whitespace; collapse whitespace runs and strip the ends; drop
    English stopwords.

    Note that punctuation and digits are *deleted*, not replaced by a space,
    so "entry-level" becomes "entrylevel" and "a@b.com" becomes "abcom".
    NOTE(review): presumably this mirrors the preprocessing applied when the
    vectorizer was fitted -- confirm before changing it, since any difference
    would skew the features seen at prediction time.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; may be empty if nothing but stopwords, digits or
        punctuation was present.
    """
    # Step 1: Lowercase the text
    text = text.lower()
    # Step 2: Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Step 3: Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Step 4: Remove stopwords (set is cached across calls)
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Main Loop: Classify and Sort Resumes

In [None]:
import shutil
import os

# Root directory that receives one sub-folder per predicted label.
output_folder = "/content/Result"


# Classify every PDF in input_folder and copy it into output_folder/<label>/.
# The listing is sorted so files are processed (and logged) in a stable order;
# os.listdir returns entries in arbitrary order.
for resume_file in sorted(os.listdir(input_folder)):
    file_path = os.path.join(input_folder, resume_file)
    print(f"Processing file: {resume_file}")
    print(f"File path: {file_path}")

    # Case-insensitive extension check so files such as "resume.PDF" are not
    # silently skipped.
    if not resume_file.lower().endswith(".pdf"):
        continue

    try:
        resume_text = extract_text_from_pdf(file_path)

        if not isinstance(resume_text, str) or not resume_text.strip():
            raise ValueError(f"Text extraction failed or empty for {resume_file}")

        # Preprocess text
        clean_text = preprocess_text(resume_text)
        if not clean_text:
            # Nothing but stopwords/punctuation/digits survived; vectorizing an
            # empty string would produce a prediction that means nothing.
            raise ValueError(f"No usable text left after preprocessing for {resume_file}")

        # NOTE(review): this preview writes resume content (names, e-mail
        # addresses) into the saved notebook output -- clear outputs before
        # sharing the notebook.
        print(f"Clean text for {resume_file}: {clean_text[:100]}...")  # Print the first 100 characters

        # Transform text into features
        features = tfidf_vectorizer.transform([clean_text])

        # Predict designation
        predicted_label = model.predict(features)[0]
        print(f"Predicted label for {resume_file}: {predicted_label}")

        # Create folder for designation if it doesn't exist
        designation_folder = os.path.join(output_folder, str(predicted_label))  # Ensure label is a string
        os.makedirs(designation_folder, exist_ok=True)

        # Copy the resume to the respective folder (the original stays in
        # input_folder, so the log says "copied", not "moved").
        shutil.copy(file_path, designation_folder)
        print(f"{resume_file} copied to {designation_folder}")

    except Exception as e:
        # One bad file must not stop the batch; report it and carry on.
        print(f"Error processing {resume_file}: {e}")


Processing file: KandaceLoudor_Resume.pdf
File path: /content/drive/MyDrive/NLP/Project/Resume_pdf/KandaceLoudor_Resume.pdf
Clean text for KandaceLoudor_Resume.pdf: k n c e l u r c e n c n c kloudoremailcom mount laurel nj linkedin github e u c n bs statistics rutg...
Predicted label for KandaceLoudor_Resume.pdf: 0
KandaceLoudor_Resume.pdf moved to /content/Result/0
Processing file: TrishMathers_Resume.pdf
File path: /content/drive/MyDrive/NLP/Project/Resume_pdf/TrishMathers_Resume.pdf
Clean text for TrishMathers_Resume.pdf: trish mathers entrylevel data scientist innovative scientically rigorous graduate signicant data sci...
Predicted label for TrishMathers_Resume.pdf: 0
TrishMathers_Resume.pdf moved to /content/Result/0
Processing file: TerrenceColeman_Resume.pdf
File path: /content/drive/MyDrive/NLP/Project/Resume_pdf/TerrenceColeman_Resume.pdf
Clean text for TerrenceColeman_Resume.pdf: terrence coleman tcolemanemailcom brooklyn ny linkedin analytically minded selfstarter decade ex