In [None]:
import os
import pdfplumber
import pandas as pd
data_folder = "data"

resume_data = []

for category in os.listdir(data_folder):
    category_path = os.path.join(data_folder, category)
    if os.path.isdir(category_path):  
        for file_name in os.listdir(category_path):
            if file_name.endswith(".pdf"):  
                pdf_path = os.path.join(category_path, file_name)
                try:
                    with pdfplumber.open(pdf_path) as pdf:
                        text = ""
                        for page in pdf.pages:
                            text += page.extract_text() or ""  

                    resume_data.append({
                        "Category": category,
                        "ID": file_name,
                        "Content": text
                    })
                except Exception as e:
                    print(f"Error processing {pdf_path}: {e}")

df = pd.DataFrame(resume_data)

output_csv = "resumes_data.csv"
df.to_csv(output_csv, index=False)

print(f"Processed {len(resume_data)} resumes. Data saved to {output_csv}.")


Processed 2484 resumes. Data saved to resumes_data.csv.


In [2]:
import pandas as pd
df = pd.read_csv("./resumes_data.csv")

df["ID"] = df["ID"].str.replace(".pdf", "", regex=False)

df["ID"] = pd.to_numeric(df["ID"])

df.head()

Unnamed: 0,Category,ID,Content
0,ACCOUNTANT,10554236,ACCOUNTANT\nSummary\nFinancial Accountant spec...
1,ACCOUNTANT,10674770,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
2,ACCOUNTANT,11163645,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
3,ACCOUNTANT,11759079,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...
4,ACCOUNTANT,12065211,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...


In [3]:
print(df.loc[0, 'Content'])

ACCOUNTANT
Summary
Financial Accountant specializing in financial planning, reporting and analysis within the Department of Defense.
Highlights
Account reconciliations
Results-oriented Accounting operations professional
Financial reporting Analysis of financial systems
Critical thinking ERP (Enterprise Resource Planning) software.
Excellent facilitator
Accomplishments
Served on a tiger team which identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments. This allowed
for the first successful fiscal year-end close for 2012.
In collaboration with DFAS Europe, developed an automated tool that identified duplicate obligations. This tool allowed HQ USAFE to
deobligate over $5M in duplicate obligations.
Experience
Company Name July 2011 to November 2012 Accountant
City , State
Enterprise Resource Planning Office (ERO)
In this position as an Accountant assigned to the Defense Enterprise Accounting and Management System (DEAMS) ERO I was
responsible for 

In [9]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Load pretrained model and tokenizer
model_name = "bert-base-uncased"  # Replace with any compatible transformer model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to extract embeddings
def extract_features(text, tokenizer, model):
    if not isinstance(text, str):
        text = str(text)  # Ensure the input is a string
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
    # Use the [CLS] token embedding (first token)
    return outputs.last_hidden_state[0, 0, :].numpy()

# Ensure all entries in the 'Content' column are strings
df["Content"] = df["Content"].astype(str)

# Apply feature extraction to all rows in the DataFrame
df["Features"] = df["Content"].apply(lambda x: extract_features(x, tokenizer, model))

# Expand all feature vectors into individual columns
feature_dim = len(df["Features"][0])  # Determine the dimensionality of the feature vectors
feature_columns = [f"Feature_{i+1}" for i in range(feature_dim)]
features_expanded = pd.DataFrame(df["Features"].to_list(), columns=feature_columns)

# Combine the original columns with the expanded features
processed_df = pd.concat([df.drop(columns=["Features"]), features_expanded], axis=1)

# Save the processed DataFrame as a CSV file
processed_df.to_csv("processed_df.csv", index=False)

# Confirm the save
print("Processed DataFrame saved as 'processed_df.csv'.")


Processed DataFrame saved as 'processed_df.csv'.


In [11]:
processed_df.head()

Unnamed: 0,Category,ID,Content,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,...,Feature_759,Feature_760,Feature_761,Feature_762,Feature_763,Feature_764,Feature_765,Feature_766,Feature_767,Feature_768
0,ACCOUNTANT,10554236,ACCOUNTANT\nSummary\nFinancial Accountant spec...,-1.242447,-0.063286,-0.569169,-0.151814,-0.245817,0.261886,0.156179,...,0.147211,-0.770136,-0.207958,-0.491034,0.086247,0.164738,-0.139829,-0.125224,-0.091868,0.077088
1,ACCOUNTANT,10674770,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,-0.629966,-0.210048,0.18262,0.292565,0.11887,0.038172,-0.058623,...,0.368035,-0.798873,-0.117339,-0.590444,0.489776,-0.01588,-0.757106,-0.157306,0.089326,0.254711
2,ACCOUNTANT,11163645,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,-0.65076,-0.23842,0.107954,0.098538,0.131462,0.082952,-0.05039,...,0.355797,-0.611065,-0.702131,-0.597161,0.533199,0.152034,-0.767884,0.044046,-0.070561,-0.013307
3,ACCOUNTANT,11759079,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...,-0.999627,-0.069222,-0.626484,-0.156871,0.141907,-0.085315,0.063164,...,0.219695,-0.750287,-0.138343,-0.31649,0.140251,-0.188372,-0.16634,0.031788,0.249486,0.259235
4,ACCOUNTANT,12065211,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,-0.631001,0.087391,-0.128692,-0.138944,-0.161178,-0.100275,-0.2296,...,0.392846,-0.815716,-0.011383,-0.688334,0.320781,-0.030218,-0.930548,-0.188591,0.433564,0.046786


In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import StandardScaler

# Load pretrained model and tokenizer (for job description)
model_name = "bert-base-uncased"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Assuming processed_df is already read from the CSV and contains the data
# processed_df = pd.read_csv("resumes_processed_data.csv")  # This is assumed to be already done

def extract_features(text, tokenizer, model):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state[0, 0, :].numpy()  # Use [CLS] token

def compute_similarity(features1, features2):
    return cosine_similarity([features1], [features2])[0][0]

def process_job_input(job_input, tokenizer, model):
    return extract_features(job_input, tokenizer, model)

category_input = "ENGINEERING"  
job_input = "Looking for a candidate with experience in Blockchain development."

# Extract job input features
job_features = process_job_input(job_input, tokenizer, model)

# Filter the DataFrame by the desired category
filtered_df = processed_df[processed_df["Category"] == category_input]

# Extract the feature columns (from 'Feature_1' to 'Feature_768')
feature_columns = [f"Feature_{i}" for i in range(1, 769)]  # Adjust if more or fewer features are present
resume_features = filtered_df[feature_columns].values  # Get the feature columns as a 2D numpy array

# Compute cosine similarity between job input features and resume features
similarity_scores = []
for resume_feature in resume_features:
    similarity_scores.append(compute_similarity(job_features, resume_feature))

# Add similarity scores to the DataFrame
filtered_df['Similarity'] = similarity_scores

# Sort the DataFrame by similarity scores in descending order
filtered_df_sorted = filtered_df.sort_values(by='Similarity', ascending=False)

# Select relevant columns to display
selected_resumes = filtered_df_sorted[['Category', 'ID', 'Similarity']]

# Display or save the results
print(selected_resumes)

# Optionally, save the results to a CSV file
selected_resumes.to_csv("sorted_resumes_with_similarity.csv", index=False)


NameError: name 'processed_df' is not defined