In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize




In [None]:
# Read the resume CSV from the /content directory into a DataFrame.
RESUME_CSV = '/content/UpdatedResumeDataSet.csv'
df = pd.read_csv(RESUME_CSV)

# Preview the first rows as the cell's output.
df.head()


In [None]:
# Download the NLTK resources: stopwords, punkt and wordnet.
# One import and one loop replace three copy-pasted import/download pairs.
import nltk

for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

In [None]:
# Fetch the additional 'punkt_tab' NLTK resource.
# NOTE(review): presumably needed by word_tokenize on newer NLTK releases — confirm.
import nltk
nltk.download('punkt_tab')

In [None]:
# English stop words, held in a set for fast membership tests in clean_text.
stop_words = set(stopwords.words("english"))

# Single WordNet lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

# Clean text function
def clean_text(text):
    """Normalize one resume string for downstream vectorization.

    Non-word characters are replaced by spaces, whitespace runs are
    collapsed, and the lower-cased text is tokenized. Tokens found in
    ``stop_words`` are dropped and the remainder is lemmatized with
    ``lemmatizer``.

    Args:
        text: Raw resume text. Anything that is not a ``str`` (for example
            the ``None``/``NaN`` of a missing value) is treated as empty.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` for
        non-string input.
    """
    # A missing value would otherwise make re.sub raise TypeError and abort
    # the whole column-wide .apply().
    if not isinstance(text, str):
        return ''
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Run every raw resume through clean_text and store the result in a new column.
df['cleaned_resume'] = df['Resume'].map(clean_text)


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

# The stock en_core_web_sm NER component does not emit a "SKILL" label, so
# filtering doc.ents on "SKILL" would always produce an empty list.
# Register an EntityRuler ahead of the statistical NER so that known skill
# terms are tagged as SKILL entities.
# Terms are matched case-insensitively, token by token. This is only a
# starter vocabulary — extend it for the target domain.
SKILL_TERMS = [
    "python", "java", "javascript", "sql", "html", "linux", "git", "docker",
    "excel", "tableau", "machine learning", "deep learning",
    "natural language processing", "data analysis",
]
skill_ruler = nlp.add_pipe("entity_ruler", before="ner")
skill_ruler.add_patterns([
    {"label": "SKILL", "pattern": [{"LOWER": token} for token in term.split()]}
    for term in SKILL_TERMS
])

# Skill extraction via named-entity recognition
def extract_skills(text):
    """Return the text of every entity labelled "SKILL" that ``nlp`` finds in ``text``."""
    skill_label = "SKILL"
    found = []
    for entity in nlp(text).ents:
        if entity.label_ == skill_label:
            found.append(entity.text)
    return found

# One list of skill strings per cleaned resume.
df['skills'] = df['cleaned_resume'].map(extract_skills)


In [None]:
def extract_experience(text):
    """Return the largest "<number> year" figure mentioned in ``text``.

    Args:
        text: Text to scan; matching is case-insensitive. Non-string input
            (for example a missing value) is treated as having no match.

    Returns:
        The largest integer that directly precedes " year" in the text,
        or 0 when no such mention exists.
    """
    if not isinstance(text, str):
        return 0
    # Convert to int before taking the maximum: max() over the raw matches
    # compares strings, so "9 year" would beat "10 year".
    years = [int(number) for number in re.findall(r'(\d+) year', text.lower())]
    return max(years, default=0)  # default if no experience is found

# Years of experience parsed from each cleaned resume (0 when none is found).
df['experience_years'] = df['cleaned_resume'].map(extract_experience)


VECTORIZATION

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [None]:
!pip install huggingface_hub


In [None]:
# TF-IDF features fitted on the cleaned resumes; max_features is set to 5000.
MAX_TFIDF_FEATURES = 5000
cleaned_corpus = df['cleaned_resume']
tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf_matrix = tfidf.fit_transform(cleaned_corpus)

# BERT encoder: the tokenizer and the TensorFlow model load the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(text):
    """Encode ``text`` with ``bert_model`` and return the [CLS] hidden state as a NumPy array."""
    encoded = tokenizer(text, return_tensors="tf", padding=True, truncation=True)
    hidden_states = bert_model(**encoded).last_hidden_state
    # Index 0 on the second axis selects the first token, i.e. [CLS].
    cls_state = hidden_states[:, 0, :]
    return cls_state.numpy()

# Embed each cleaned resume one at a time and keep the resulting array in a new column.
df['bert_embedding'] = df['cleaned_resume'].map(get_bert_embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

MODELING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [None]:
# Train-Test Split
# Fix the seed so the held-out rows, and therefore the reported accuracy,
# are the same on every run.
# NOTE(review): consider stratify=df['Category'] to keep class proportions
# equal in both splits — confirm every category has at least two rows first.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, df['Category'], test_size=0.2, random_state=42
)

# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
# NOTE(review): a perfect score may mean identical resumes appear on both
# sides of the split — check df['Resume'].duplicated().sum() before trusting it.
print("Classification Accuracy:", accuracy)


Classification Accuracy: 1.0


In [None]:
#Clustering
from sklearn.cluster import KMeans

# Using TF-IDF matrix for clustering
# K-means depends on random initial centroids. Pin the seed and the number of
# restarts so the cluster ids written to df['cluster'] are reproducible and do
# not depend on the scikit-learn version's n_init default.
# NOTE(review): 5 clusters is a hard-coded choice — confirm it suits the data.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)
df['cluster'] = kmeans.labels_


Advanced NLP Techniques


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(resume_vector, job_description_vector):
    """Return the cosine similarities between ``resume_vector`` and ``job_description_vector``."""
    similarity = cosine_similarity(X=resume_vector, Y=job_description_vector)
    return similarity

# Example of calculating similarity
# The TF-IDF vocabulary was fitted on clean_text output, so the query goes
# through the same cleaning; otherwise word forms that clean_text would have
# normalized (e.g. plurals) cannot match any vocabulary term.
job_description = "Data Science Job with ML and NLP skills"  # Sample job description
job_description_vector = tfidf.transform([clean_text(job_description)])
similarity_scores = calculate_similarity(tfidf_matrix, job_description_vector)


Evaluation and Optimization


In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

# Score the logistic-regression estimator with 5-fold cross-validation on the TF-IDF features.
N_FOLDS = 5
scores = cross_val_score(estimator=model, X=tfidf_matrix, y=df['Category'], cv=N_FOLDS)
print("Cross-Validation Scores:", scores)


Cross-Validation Scores: [0.97927461 1.         1.         1.         1.        ]
