In [2]:
!pip install pymupdf
!pip install imblearn
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import fitz  # PyMuPDF

# Function to clean text
def clean_text(text):
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Function to tokenize text
def tokenize_text(text):
    return word_tokenize(text)

# Function to remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = clean_text(text)  # Clean text
    tokens = tokenize_text(text)  # Tokenize text
    tokens = remove_stopwords(tokens)  # Remove stop words
    tokens = lemmatize_tokens(tokens)  # Lemmatize tokens
    return ' '.join(tokens)

# Provide the path to your Excel file
excel_file_path = '/content/Dataset - Dataset.csv'

# Load data from Excel
try:
    data = pd.read_csv(excel_file_path, encoding='latin1')
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"File not found: {excel_file_path}")
    exit()

# Drop rows with missing values in 'JD' column
data = data.dropna(subset=['JD'])

# Preprocess the text columns
data['cleaned_resume'] = data['Resumes'].apply(preprocess_text)
data['cleaned_jd'] = data['JD'].apply(preprocess_text)

# Combine the cleaned resumes and job descriptions
data['combined_text'] = data['cleaned_resume'] + ' ' + data['cleaned_jd']

# Define the TF-IDF Vectorizer with n-grams
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

# Vectorize the combined text data
X = vectorizer.fit_transform(data['combined_text'])
y = data['Result']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Perform hyperparameter tuning using GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Get the best model from grid search
best_rf_model = grid_search_rf.best_estimator_

# Train the best Random Forest model
best_rf_model.fit(X_train, y_train)

# Predict probabilities on the test data using Random Forest
y_pred_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]

# Evaluate the Random Forest model (optional step)
y_pred_rf = best_rf_model.predict(X_test)
print("Random Forest model evaluation:")
print(classification_report(y_test, y_pred_rf))

# Use the best Random Forest model for prediction on the whole dataset
data['prediction'] = best_rf_model.predict(vectorizer.transform(data['combined_text']))
data['matching_score'] = best_rf_model.predict_proba(X)[:, 1]  # Predict matching score

# Print data with predictions and matching scores
print(data[['Resumes', 'JD', 'Result', 'prediction', 'matching_score']])


Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data loaded successfully.
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Random Forest model evaluation:
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         6
           1       0.80      0.67      0.73        12

    accuracy                           0.67        18
   macro avg       0.65      0.67      0.65        18
weighted avg       0.70      0.67      0.68        18

                                              Resumes  \
0   JOHANN BACH FrontEnd Developer Portland OR Lin...   
1   ALEKS LUDKEE FullStack Developer Nashville TN ...   
2   Madalin Auton IOS Developer Louisville KY Educ...   
3   YOUR NAME Phone Email Location City State ZIP ...   
4   Karen Santos Senior FrontEnd Developer Brookly...   
..                                                ...   
81  PRAJWAL RAMANNA VENKATESH SoftwareEngineer Jav...   
82  Mritunjay Pandey Thane Maharashtra India linke...   
83  SURAJ PATRA CONTACT Work email ID Lo

In [3]:

# Function to read and extract text from a PDF
def read_pdf(file_path):
    pdf_text = ""
    document = fitz.open(file_path)
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        pdf_text += page.get_text()
    return pdf_text

# Function to preprocess a single JD and resume PDF and make prediction
def test_single_jd_resume(resume_path, jd_path, model, vectorizer):
    # Read and preprocess the resume and job description
    resume_text = preprocess_text(read_pdf(resume_path))
    jd_text = preprocess_text(read_pdf(jd_path))

    # Combine the texts
    combined_text = resume_text + ' ' + jd_text

    # Vectorize the combined text
    combined_vector = vectorizer.transform([combined_text])

    # Predict using the trained model
    prediction = model.predict(combined_vector)[0]

    # Predict the matching score (probability of class 1)
    matching_score = model.predict_proba(combined_vector)[0][1]

    return prediction, matching_score

# Provide paths to the resume and job description PDFs
resume_path = '/content/Resume_Prasad.pdf'
jd_path = '/content/Job-desc-sample (1).pdf'

# Test single JD and resume
prediction, matching_score = test_single_jd_resume(resume_path, jd_path, best_rf_model, vectorizer)

# Print the prediction and matching score
print(f"Prediction: {prediction}")
print(f"Matching Score: {matching_score}")


Prediction: 0
Matching Score: 0.4380155873538224
