In [None]:
!pip install pdfplumber
!pip install pandas openpyxl transformers torch scikit-learn

import pdfplumber
import re
import os
import openpyxl
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW # Added AdamW
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from torch.utils.data import TensorDataset, DataLoader # Added imports

In [3]:


# Function to clean the extracted text
def clean_text(text):
    # Remove multiple spaces, newlines, etc.
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple whitespaces into one
    text = text.strip()  # Remove leading/trailing spaces
    return text

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        # Extract text from all pages
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Function to save extracted data into an Excel file
def save_to_excel(data, output_path):
    # Create a new Excel workbook and sheet
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "CV Texts"

    # Write the header
    ws.append(["File Name", "Extracted Text"])  # Column headers

    # Write each PDF's name and extracted text into a row
    for file_name, text in data.items():
        ws.append([file_name, text])

    # Save the workbook
    wb.save(output_path)

# Main function to process all PDF files in a folder
def process_pdfs_in_folder(folder_path, excel_output_path):
    extracted_data = {}

    # Loop over all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):  # Process only PDF files
            pdf_path = os.path.join(folder_path, filename)

            # Extract text from the current PDF file
            extracted_text = extract_text_from_pdf(pdf_path)

            # Clean the extracted text
            cleaned_text = clean_text(extracted_text)

            # Store the extracted data (file name as key, cleaned text as value)
            extracted_data[filename] = cleaned_text

    # Save all extracted data to an Excel file
    save_to_excel(extracted_data, excel_output_path)
    print(f"Text extracted and saved to {excel_output_path}")

# Specify your folder path containing the PDFs and the output Excel file path
folder_path = '/content/CV/'
excel_output_path = '/content/CV Text.xlsx'  # Output path for the Excel file

# Run the process
process_pdfs_in_folder(folder_path, excel_output_path)


Text extracted and saved to /content/CV Text.xlsx


In [17]:



def load_and_fine_tune_model(train_texts, train_labels, epochs=3):
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1) # num_labels should match the number of classes in your dataset

    # Prepare training data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)

    # Convert train_labels to numerical labels if they are strings
    # Assuming you want to assign a unique numerical label to each file name
    unique_labels = list(set(train_labels))
    label_map = {label: i for i, label in enumerate(unique_labels)}
    numerical_labels = [label_map[label] for label in train_labels]

    train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                                  torch.tensor(train_encodings['attention_mask']),
                                  torch.tensor(numerical_labels)) # Use numerical labels here
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # Adjust batch_size as needed

    # Fine-tune the model
    optimizer = AdamW(model.parameters(), lr=2e-5)
    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels.unsqueeze(1).float()) # Adjust labels shape if necessary
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    model.eval()
    return tokenizer, model

# Load Excel data into a pandas DataFrame
def load_excel_data(excel_path):
    df = pd.read_excel(excel_path)
    return df

# Function to load a pre-trained BERT model
def load_model():
    model_name = 'bert-base-uncased'  # You can also use other models like 'distilbert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    model.eval()  # Set the model to evaluation mode
    return tokenizer, model

# Function to encode text using BERT tokenizer
def encode_text(text, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    return inputs

# Function to get embeddings for the CVs and job descriptions
def get_embeddings(texts, tokenizer, model):
    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = encode_text(text, tokenizer)
            outputs = model(**inputs)
            # Reshape the embeddings to 2D by taking the mean of the first dimension
            # This assumes the first dimension represents different aspects of the embedding
            # and we want to average them to get a single representation
            embeddings.append(outputs.logits.detach().numpy().mean(axis=1))
    return np.array(embeddings)

# Function to rank the CVs based on similarity to a job description
def rank_cvs(job_desc, cv_texts, tokenizer, model):
    # Get the embeddings for the job description and CVs
    job_desc_embedding = get_embeddings([job_desc], tokenizer, model)
    cv_embeddings = get_embeddings(cv_texts, tokenizer, model)

    # Compute cosine similarity between the job description and each CV
    similarities = cosine_similarity(job_desc_embedding, cv_embeddings)
    return similarities[0]


# Main function to perform CV ranking
def rank_cvs_in_excel(excel_path, job_desc):
    # Load the Excel file
    df = load_excel_data(excel_path)

    # Extract the CV texts from the Excel file
    cv_texts = df['Extracted Text'].tolist()

    # Extract labels for fine-tuning (if available)
    cv_labels = df['File Name'].tolist() if 'File Name' in df.columns else None

    # Load and fine-tune the model or load the pre-trained model
    if cv_labels:
        tokenizer, model = load_and_fine_tune_model(cv_texts, cv_labels)
    else:
        tokenizer, model = load_model()

    # Rank the CVs based on the job description (using the loaded/fine-tuned model)
    similarities = rank_cvs(job_desc, cv_texts, tokenizer, model)

    # Add the similarity scores to the DataFrame
    df['Similarity Score'] = similarities

    # Sort the CVs by similarity score (higher score = more relevant)
    ranked_df = df.sort_values(by='Similarity Score', ascending=False)

    # Save the ranked CVs to a new Excel file
    ranked_df.to_excel('ranked_cvs.xlsx', index=False)

    # Print out the top-ranked CVs
    print("Top ranked CVs:")
    print(ranked_df[['File Name','Extracted Text', 'Similarity Score']].head())
    return model
# Example usage
job_desc = """
ob Summary:
We are seeking a motivated and detail-oriented member to join our team. In this role, you will bridge the gap between IT and business operations, using data analytics to assess processes, determine requirements, and deliver data-driven recommendations to stakeholders. This is an excellent opportunity for recent graduates to develop their skills and contribute to meaningful projects.

Key Responsibilities:
Requirement Gathering: Work closely with stakeholders to understand and document business needs and translate them into technical requirements.
System Analysis and Design: Analyze existing systems for improvement, propose new system processes, and assist in designing solutions that meet business needs.
Implementation and Support: Assist in the implementation of new systems or updates to existing systems. Provide support and troubleshooting for system issues.
Data Analysis: Use data analytics tools to gather and analyze data, generate reports, and provide insights to support decision-making.
Documentation: Develop and maintain system documentation, including user manuals and technical guides.
Testing and Quality Assurance: Participate in system testing to ensure solutions meet business requirements and are free of defects.
Training: Assist in training end-users on new systems and processes.
Continuous Improvement: Stay updated with the latest technology trends and suggest improvements to enhance system efficiency and effectiveness.
Qualifications:
Education: Bachelor’s degree in Business Administration, Information Technology, Computer Science, or a related field.
Technical Skills: Basic understanding of databases, networking, and software development. Familiarity with data analysis tools like Excel, SQL, or Tableau is a plus.
Analytical Skills: Strong analytical and problem-solving skills to interpret data and develop actionable insights.
Communication Skills: Excellent written and verbal communication skills to effectively interact with stakeholders and team members.
Teamwork: Ability to work collaboratively in a team-oriented environment.
Attention to Detail: High level of accuracy and attention to detail in documentation and analysis.
Benefits:
Competitive salary and benefits package
Professional development opportunities
Mentorship and training programs
Collaborative and supportive work environment
Opportunities for career advancement
"""

excel_path = '/content/Training.xlsx'  # Path to your Excel file

# Rank the CVs in the Excel file based on the provided job description
model = rank_cvs_in_excel(excel_path, job_desc)  # model is now assigned


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top ranked CVs:
  File Name                                     Extracted Text  \
0       Bad  Name: Emily Davis Contact Information: • Phone...   
1      Good  Name: Sarah Johnson Contact Information: • Pho...   
2      Good  Name: Alex Turner Contact Information: • Phone...   
3       Bad  Emily Davis Contact Information: • Phone: +61 ...   
4      Good  Sarah Johnson Contact Information: • Phone: +6...   

   Similarity Score  
0               1.0  
1               1.0  
2               1.0  
3               1.0  
4               1.0  


In [20]:
def rank_new_cvs(job_desc, cv_texts, model_path='fine_tuned_model'):
    """Ranks new CVs using a saved fine-tuned model."""

    # Load the fine-tuned model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    model.eval()  # Set the model to evaluation mode

    # Get embeddings and rank CVs
    similarities = rank_cvs(job_desc, cv_texts, tokenizer, model)
    return similarities

# Example usage:
# 1. Save the fine-tuned model:
# Assuming you have already trained the model and assigned it to the `model` variable
model.save_pretrained('fine_tuned_model')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained('fine_tuned_model')
# 2. Load new CV texts:
new_cv_texts = [
    "I have strong experience in Python and machine learning.",
    "I am proficient in data analysis and SQL."
]

# 3. Rank the new CVs:
similarities = rank_new_cvs(job_desc, new_cv_texts)

# 4. Process the results (e.g., print the rankings):
for i, similarity in enumerate(similarities):
    print(f"CV {i + 1}: Similarity Score = {similarity}")

CV 1: Similarity Score = 1.0
CV 2: Similarity Score = 1.0
