In [1]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/akshaypatil/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [3]:
def text_preprocessing(text):
    """
    Cleaning and parsing the text.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    stop_words_set = set(stopwords.words('english'))
    filtered_text = [w for w in tokenized_text if w not in stop_words_set]
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(w) for w in filtered_text]
    combined_text = ' '.join(lemmatized_text)
    return combined_text

In [4]:
import fitz  # PyMuPDF

# Function to extract text from a single PDF file
def extract_text_from_pdf(file_path):
    document = fitz.open(file_path)
    text = ''
    for page in document:
        text += page.get_text()
    return text

In [5]:
# Preprocess resumes
resume_texts = []  # Store processed resume texts
resume_directory = '/Users/akshaypatil/Desktop/CSUF_courses/Final_Project/resume_scraping'

In [6]:
for filename in os.listdir(resume_directory):
    if filename.endswith(".pdf"):
        filepath = os.path.join(resume_directory, filename)
        resume_text = extract_text_from_pdf(filepath)
        cleaned_text = text_preprocessing(resume_text)
        resume_texts.append(cleaned_text)
        print(f"Processed resume: {filename}\n{cleaned_text}\n")

Processed resume: Eric_Ranner_php_developer.pdf.pdf
eric ranner skill php laravel javascript mysql mern stack mongodb expressjs react nodejs wordpress html cs docker container devops cloud photoshop microsoft sql server vba education mit xpro professional certificate coding full stack development mern july state university undergraduate coursework business managementmarketingeast stroudsburg university undergraduate coursework hotel restaurant tourism managementprojects beesnest web design website demo site full stack developer tech stack html cs javascript php wordpress showcase use custom theme custom plugins custom posttypesfearmonger full stack developer tech stack html cs figma javascript php bootstrap mysql demonstrates understanding user profile logins common website functionality amskier worker compensation form full stack developer tech stack html cs javascript sql bootstrap php mysql created integrated existing codebase demonstrates understanding integrating new code legacy c

In [7]:
import pandas as pd

# Load and preprocess job descriptions
job_description_file = '/Users/akshaypatil/Desktop/merged_JD.csv'
df = pd.read_csv(job_description_file)
# df.drop_duplicates(inplace=True)
# df.dropna(inplace=True)

In [8]:
df.shape

(1065, 2)

In [9]:
# Apply text preprocessing to each relevant column
df['Cleaned_Job_Description'] = df['Job Description'].apply(text_preprocessing)
# df['Cleaned_Skills'] = df['skills'].apply(text_preprocessing)
# df['Cleaned_Responsibilities'] = df['Responsibilities'].apply(text_preprocessing)

In [10]:
# Check the shape of the DataFrame
print(df.shape)

(1065, 3)


In [11]:
# Display or inspect the cleaned data
print(df[['Cleaned_Job_Description']])

                                Cleaned_Job_Description
0     locationsan francisco catypefull timedepartmen...
1     year building model business application exper...
2     applied scientist passion applying stateofthea...
3     research associate mathematical physical scien...
4     abbott global healthcare leader help people li...
...                                                 ...
1060  luminoat lumino mission unlock power ai every ...
1061  summaryposted dec hour swiftdata team seeking ...
1062  overviewif jazz employee please apply via inte...
1063  invehicle software engineering team responsibl...
1064  kodiak founded april develop autonomous techno...

[1065 rows x 1 columns]


In [12]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        with torch.no_grad():
            outputs = model(**inputs)
        # Use mean pooling for sentence embedding
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings

# Combine all texts and get embeddings
all_texts = resume_texts + df['Cleaned_Job_Description'].tolist()
all_embeddings = get_bert_embeddings(all_texts)


In [13]:
from sklearn.cluster import KMeans

# Assuming the optimal number of clusters has been determined or is set to an arbitrary number like 10
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters)
labels = kmeans.fit_predict(all_embeddings)


In [14]:
cluster_centers = kmeans.cluster_centers_


In [15]:
import pandas as pd

# Assume 'all_texts' is a list of all texts that were used to create 'all_embeddings'
texts_df = pd.DataFrame({'text': all_texts, 'cluster': labels})

for i in range(n_clusters):
    print(f"Cluster {i}:")
    print(texts_df[texts_df['cluster'] == i]['text'].sample(min(10, len(texts_df[texts_df['cluster'] == i]))))
    print("\n")


Cluster 0:
708     jeremy dejuansalem oremail indeed seeking chal...
402     sarah n st tacoma wa professional profileexper...
1354    position title bioinformatics scientistclassif...
304     austin ferrellfranklin tnemail indeed work exp...
1702    looking smart qa engineer verify validate wire...
616     anthony medinaaustin txemail indeed mathematic...
807     logan hartmannsoftware engineer expertise c ja...
251     robinson rosadolas vega nvemail indeed fullsta...
294     minh nguyenel dorado hill caemail indeed year ...
1650    malin space science system ms private technolo...
Name: text, dtype: object


Cluster 1:
449     ruchit jigneshbhai desaibloomington inemail in...
1254    nucicer exciting opportunity applied quantitat...
101     brandon pricehead data science ml engineerings...
176     bisman singh educationuniversity colorado boul...
740     yiming chen linkedin education georgetown univ...
644     zuhair allahrakhacoral spring flemail indeed w...
618     navya mohan kk

In [16]:
from torch.utils.data import Dataset

class ResumeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure labels are long type for classification
        return item

    def __len__(self):
        return len(self.labels)

# Prepare labeled dataset using ResumeDataset
def create_dataset(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    # Correct this line to use the custom Dataset
    return ResumeDataset(encodings, labels)

# Usage of create_dataset
full_dataset = create_dataset(all_texts, labels)
train_size = int(0.9 * len(full_dataset))
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, len(full_dataset) - train_size])

In [17]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [20]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW

# Model setup
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_clusters)
optimizer = AdamW(model.parameters(), lr=2e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_strategy='steps',    # Log every 'logging_steps'
    logging_steps=10,            # Log every 10 steps
    evaluation_strategy='steps', # Evaluate model every 'eval_steps'
    eval_steps=50,               # Evaluate every 50 steps
    load_best_model_at_end=True, # Load the best model at the end of training based on evaluation
    logging_dir='./logs',        # Directory for storing logs
)

# Trainer setup remains the same
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,2.2708,2.274907,0.134615,0.055265,0.040695,0.134615
100,2.1508,2.067248,0.355769,0.226678,0.187311,0.355769
150,2.0137,1.812059,0.413462,0.281785,0.219568,0.413462
200,1.7068,1.555219,0.5625,0.48998,0.497734,0.5625
250,1.4781,1.282252,0.615385,0.556089,0.591903,0.615385
300,1.2959,1.16972,0.591346,0.568083,0.675032,0.591346
350,1.2284,1.028569,0.682692,0.63917,0.675812,0.682692
400,0.8253,0.732552,0.802885,0.773543,0.756564,0.802885
450,0.9108,1.028389,0.668269,0.629294,0.699872,0.668269
500,0.6548,0.811115,0.764423,0.749821,0.778294,0.764423


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: t

TrainOutput(global_step=2335, training_loss=0.5945592428352166, metrics={'train_runtime': 4113.6804, 'train_samples_per_second': 2.27, 'train_steps_per_second': 0.568, 'total_flos': 2457633773199360.0, 'train_loss': 0.5945592428352166, 'epoch': 5.0})

In [35]:
%load_ext tensorboard
%tensorboard --logdir ./logs


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 89787), started 2 days, 19:16:44 ago. (Use '!kill 89787' to kill it.)

In [34]:
texts_df.to_csv("./clusters.csv")

In [24]:
# Assume training has completed, and you have your 'model' and 'tokenizer' objects

# Save the model and tokenizer using save_pretrained()
model_path = "./saved_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# You can later load the model and tokenizer from this directory
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)


In [25]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load the trained model from a specific directory
model = BertForSequenceClassification.from_pretrained('./saved_model', num_labels=10)
