In [None]:
# cleaning part 1

from datasets import load_dataset
import pandas as pd

# Download the raw resume/JD matching dataset from the Hugging Face Hub.
ds = load_dataset("facehuggerapoorv/resume-jd-match")

separator = ">> the resume: <<"
prefix_to_remove = "For the given job description "


def _split_example(raw_text):
    """Split one raw `text` field into (jd, resume).

    Returns two empty strings when the separator is absent.
    """
    if separator not in raw_text:
        return "", ""
    jd_raw, resume_raw = raw_text.split(separator, 1)
    # Drop the fixed prompt prefix, then the surrounding '<'/'>' markers and whitespace.
    jd_clean = jd_raw.replace(prefix_to_remove, "").strip("<>").strip()
    resume_clean = resume_raw.strip("<>").strip()
    return jd_clean, resume_clean


jd_list = []
resume_list = []
label_list = []

for row in ds['train']:
    jd_text, resume_text = _split_example(row['text'])
    jd_list.append(jd_text)
    resume_list.append(resume_text)
    label_list.append(row['label'])

# One row per example; labels are kept exactly as the dataset provides them.
df = pd.DataFrame({
    'jd': jd_list,
    'resume': resume_list,
    'label': label_list
})

df.to_csv('jd_resume_cleaned.csv', index=False)

print("Saved to 'jd_resume_cleaned.csv' with cleaned JD text.")


In [None]:
# nlp preprocessing -- 2


import pandas as pd
import spacy
import string
import re

import en_core_web_sm
# Load the small English spaCy pipeline once; preprocess_text reuses it for every document.
nlp = en_core_web_sm.load()

# Read the file produced by the cleaning cell. That cell writes to a relative
# path, so read from the same relative path instead of a hard-coded '/content/'
# prefix (which only resolves when the working directory happens to be /content).
df = pd.read_csv('jd_resume_cleaned.csv')

def preprocess_text(text):
    """Normalise one document for bag-of-words style models.

    Steps, in order: lowercase, remove e-mail addresses, remove URLs, remove
    digits, remove ASCII punctuation, then lemmatise with spaCy while dropping
    stop words, punctuation tokens and tokens of length <= 2.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (None / NaN) yield an empty string.

    Returns
    -------
    str
        Space-joined lemmas.
    """
    if pd.isnull(text):
        return ""

    # A non-string cell (e.g. a value pandas parsed as a number) would crash on
    # .lower(); coerce to str first.
    text = str(text).lower()

    text = re.sub(r'\S+@\S+', '', text)

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'\d+', '', text)

    text = text.translate(str.maketrans('', '', string.punctuation))

    doc = nlp(text)

    tokens = [token.lemma_ for token in doc if not token.is_stop and len(token.text) > 2 and not token.is_punct]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

df['processed_jd'] = df['jd'].apply(preprocess_text)
df['processed_resume'] = df['resume'].apply(preprocess_text)

print(df[['jd', 'processed_jd', 'resume', 'processed_resume', 'label']].head())

df.to_csv('jd_resume_cleaned_preprocessed.csv', index=False)

print(" Preprocessing complete! Cleaned file saved as 'jd_resume_cleaned_preprocessed.csv'")



In [None]:
# ML Classifiers without tuning --- 3

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# Relative path: this is where the preprocessing cell writes the file.
df = pd.read_csv('jd_resume_cleaned_preprocessed.csv')

# Empty strings are stored in the CSV as empty fields and are read back as NaN,
# which TfidfVectorizer rejects as an invalid document. Restore them to ''.
df['processed_resume'] = df['processed_resume'].fillna('')

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
y = df['label_encoded']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only (fitting on all rows leaks test-set statistics).
# NOTE(review): only the resume text is used as the feature; the JD is ignored.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_resume'], y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=3000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Candidate classifiers, all with (near-)default hyper-parameters.
# Keys are the display names used in the results table; the dict's insertion
# order is the order in which the models are trained below.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    # Three SVM kernels; probability=True enables predict_proba on the fitted model.
    'SVM - Linear': SVC(kernel='linear', probability=True),
    'SVM - RBF': SVC(kernel='rbf', probability=True),
    'SVM - Poly': SVC(kernel='poly', degree=3, probability=True),
    # Ensemble / boosting models; random_state fixed at 42 where the estimator accepts it.
    'Bagging (Logistic Regression)': BaggingClassifier(estimator=LogisticRegression(max_iter=1000), n_estimators=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}


results = []

# The ground truth does not change between models, so decode it once up front.
y_true_labels = le.inverse_transform(y_test)

for model_name, model in models.items():
    print(f"\n Training: {model_name}")
    model.fit(X_train, y_train)

    # Decode predictions back to the original label values before scoring.
    y_pred_labels = le.inverse_transform(model.predict(X_test))

    row = {'Model': model_name}
    row['Accuracy'] = accuracy_score(y_true_labels, y_pred_labels)
    row['Precision'] = precision_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    row['Recall'] = recall_score(y_true_labels, y_pred_labels, average='weighted')
    row['F1-Score'] = f1_score(y_true_labels, y_pred_labels, average='weighted')
    results.append(row)

# Best accuracy first.
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)

print("\n Final Comparison Table:")
print(results_df.to_string(index=False))


In [None]:
# ML Classifiers with tuning --- 4

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.base import clone
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')


# Relative path: this is where the preprocessing cell writes the file.
df = pd.read_csv('jd_resume_cleaned_preprocessed.csv')

# Empty strings are stored in the CSV as empty fields and are read back as NaN,
# which TfidfVectorizer rejects as an invalid document. Restore them to ''.
df['processed_resume'] = df['processed_resume'].fillna('')

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
y = df['label_encoded']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only (fitting on all rows leaks test-set statistics
# into both the cross-validated search and the final evaluation).
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_resume'], y, test_size=0.2, random_state=42, stratify=y)

tfidf = TfidfVectorizer(max_features=3000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Base estimators to tune. Keys are display names and must match the keys of
# `param_grids` for a model to be tuned; insertion order is the training order.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'SVM - Linear': SVC(kernel='linear', probability=True),
    'SVM - RBF': SVC(kernel='rbf', probability=True),
    'SVM - Poly': SVC(kernel='poly', degree=3, probability=True),
    'Bagging (Logistic Regression)': BaggingClassifier(estimator=LogisticRegression(max_iter=1000), n_estimators=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}

# Search distributions for RandomizedSearchCV, keyed by model display name.
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers from [low, high).
# Models with no entry here ('SVM - Poly', 'Bagging (Logistic Regression)',
# 'AdaBoost') are fitted as-is by the loop below, without a search.
param_grids = {
    'Logistic Regression': {
        'C': uniform(0.01, 10),
        'penalty': ['l2']
    },
    'Naive Bayes': {
        'alpha': uniform(0.0, 1.0)
    },
    'SVM - Linear': {
        'C': uniform(0.1, 10),
        'kernel': ['linear']
    },
    'SVM - RBF': {
        'C': uniform(0.1, 10),
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf']
    },
    'Random Forest': {
        'n_estimators': randint(50, 200),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': randint(50, 150),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3)
    },
    'LightGBM': {
        'n_estimators': randint(50, 150),
        'num_leaves': randint(20, 100),
        'learning_rate': uniform(0.01, 0.3)
    },
    'CatBoost': {
        'iterations': randint(50, 150),
        'depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3)
    }
}

results = []

for model_name, model in models.items():
    print(f"\n Tuning and Training: {model_name}")

    param_dist = param_grids.get(model_name)
    if param_dist is None:
        # No search space declared for this model: fit it with its current settings.
        model.fit(X_train, y_train)
        best_model = model
    else:
        # Randomised search (10 draws, 3-fold CV) on a fresh clone of the estimator.
        search = RandomizedSearchCV(
            estimator=clone(model),
            param_distributions=param_dist,
            n_iter=10,
            cv=3,
            scoring='accuracy',
            random_state=42,
            n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_

    # Score on the held-out split using the original (decoded) label values.
    y_pred_labels = le.inverse_transform(best_model.predict(X_test))
    y_true_labels = le.inverse_transform(y_test)

    weighted = {'average': 'weighted'}
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_true_labels, y_pred_labels),
        'Precision': precision_score(y_true_labels, y_pred_labels, zero_division=0, **weighted),
        'Recall': recall_score(y_true_labels, y_pred_labels, **weighted),
        'F1-Score': f1_score(y_true_labels, y_pred_labels, **weighted)
    })

# Best accuracy first.
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)

print("\n Final Comparison Table After Tuning:")
print(results_df.to_string(index=False))


In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------DL-------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
pip install datasets
pip install tensorflow numpy pandas scikit-learn

In [None]:
from datasets import load_dataset
import pandas as pd


# Download the raw resume/JD matching dataset from the Hugging Face Hub.
ds = load_dataset("facehuggerapoorv/resume-jd-match")

separator = ">> the resume: <<"
prefix_to_remove = "For the given job description "


def _split_example(raw_text):
    """Split one raw `text` field into (jd, resume).

    Returns two empty strings when the separator is absent.
    """
    if separator not in raw_text:
        return "", ""
    jd_raw, resume_raw = raw_text.split(separator, 1)
    # Drop the fixed prompt prefix, then the surrounding '<'/'>' markers and whitespace.
    jd_clean = jd_raw.replace(prefix_to_remove, "").strip("<>").strip()
    resume_clean = resume_raw.strip("<>").strip()
    return jd_clean, resume_clean


jd_list = []
resume_list = []
label_list = []

for row in ds['train']:
    jd_text, resume_text = _split_example(row['text'])
    jd_list.append(jd_text)
    resume_list.append(resume_text)
    label_list.append(row['label'])

# One row per example; labels are kept exactly as the dataset provides them.
df = pd.DataFrame({
    'jd': jd_list,
    'resume': resume_list,
    'label': label_list
})

df.to_csv('jd_resume_cleaned.csv', index=False)

print("Saved to 'jd_resume_cleaned.csv' with cleaned JD text.")


In [None]:
# Preview the first rows of the cleaned frame (columns: jd, resume, label).
df.head()

In [None]:

import re

def clean_text(text):
    """Light-weight text normalisation for the deep-learning pipeline.

    Steps, in order: lowercase, remove e-mail addresses, remove URLs, remove
    every character that is neither a word character nor whitespace, collapse
    whitespace, and finally drop a leading "professional summary", "summary"
    or "overview" heading.

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything that is not a string (None, NaN, ...) yields "".

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing cells have nothing to clean; without this guard .lower() raises.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    text = re.sub(r'\S+@\S+', '', text)

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Punctuation is already gone here, so a heading such as "Summary:" has become "summary ".
    text = re.sub(r'^(professional summary|summary|overview)[\s:]*', '', text.strip(), flags=re.IGNORECASE)
    return text
# Clean both text columns, overwriting them in place: after this point
# df['resume'] and df['jd'] hold the cleaned text, not the raw text.
df['resume'] = df['resume'].apply(clean_text)
df['jd'] = df['jd'].apply(clean_text)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

label_map = {'Good Fit': 0, 'No Fit': 1, 'Potential Fit': 2}

# Map only while the column still holds the string labels. Without this guard,
# re-running the cell maps the already-encoded integers to NaN and the dropna
# below then removes every row.
if df['label'].isin(list(label_map)).any():
    df['label'] = df['label'].map(label_map)

print("NaNs in df['label']:", df['label'].isnull().any())
# map() yields floats when any value was unmapped; drop those rows and keep the
# class ids integral.
df = df.dropna(subset=['label']).astype({'label': int})

X_resume = df['resume'].values
X_jd = df['jd'].values
y = df['label'].values

# Stratify so train and validation keep the same class proportions (the classic
# ML cells already split this way).
X_resume_train, X_resume_val, X_jd_train, X_jd_val, y_train, y_val = train_test_split(
    X_resume, X_jd, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
import numpy as np
# Sanity checks on the encoded labels: no missing values in the training
# targets, and the set of class ids present in the training split and in the
# full frame.
print("NaNs in y_train:", np.any(pd.isnull(y_train)))
print("Unique labels:", np.unique(y_train))
print("unique labels ", np.unique(df['label']))

In [None]:
# tokenization and padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


vocab_size = 20000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
# Build the vocabulary from the training texts only. Fitting on the full corpus
# lets validation-only words influence the word index, which leaks validation
# data; unseen validation words are mapped to the <OOV> token instead.
tokenizer.fit_on_texts(list(X_resume_train) + list(X_jd_train))


X_resume_seq_train = tokenizer.texts_to_sequences(X_resume_train)
X_jd_seq_train = tokenizer.texts_to_sequences(X_jd_train)
X_resume_seq_val = tokenizer.texts_to_sequences(X_resume_val)
X_jd_seq_val = tokenizer.texts_to_sequences(X_jd_val)


# Every sequence is padded (at the end) or truncated to exactly `maxlen` ids.
maxlen = 512
X_resume_pad_train = pad_sequences(X_resume_seq_train, maxlen=maxlen, padding='post')
X_jd_pad_train = pad_sequences(X_jd_seq_train, maxlen=maxlen, padding='post')
X_resume_pad_val = pad_sequences(X_resume_seq_val, maxlen=maxlen, padding='post')
X_jd_pad_val = pad_sequences(X_jd_seq_val, maxlen=maxlen, padding='post')


In [None]:



# Parse the GloVe text file into {word: float32 vector}.
# Each line is split on whitespace: first field is the word, the rest are the coefficients.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for entry in glove_file:
        fields = entry.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print(f"Found {len(embedding_index)} word vectors.")


In [None]:
embedding_dim = 100

# Row i holds the pretrained vector of the word with tokenizer index i.
# Rows stay all-zero for words without a pretrained vector; indices at or
# beyond vocab_size are skipped.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, token_id in tokenizer.word_index.items():
    if token_id < vocab_size:
        pretrained = embedding_index.get(token)
        if pretrained is not None:
            embedding_matrix[token_id] = pretrained


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, GRU, Bidirectional,
    Dense, Concatenate, Dropout, BatchNormalization,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix

# Two integer-id inputs of length `maxlen`: one for the resume, one for the JD.
resume_input = Input(shape=(maxlen,), name='Resume_Input')
jd_input = Input(shape=(maxlen,), name='JD_Input')


# A single frozen embedding layer, initialised from `embedding_matrix`, is
# shared by both inputs.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False
)

# Named `resume_embed` / `jd_embed` because that is what every model definition
# below (and the tuning cell) refers to.
resume_embed = embedding_layer(resume_input)
jd_embed = embedding_layer(jd_input)

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def _build_match_model(make_resume_encoder, make_jd_encoder):
    """Build and compile one two-branch resume/JD classifier.

    Parameters
    ----------
    make_resume_encoder, make_jd_encoder : callable
        Zero-argument factories returning a fresh recurrent layer that encodes
        the embedded resume / JD sequence into a single vector.

    Returns
    -------
    Model
        Compiled model taking [resume_input, jd_input] and emitting a 3-way
        softmax. The head is identical for every variant: concatenate ->
        batch-norm -> Dense(128) -> Dropout(0.4) -> Dense(64) -> Dropout(0.3).
    """
    resume_vec = make_resume_encoder()(resume_embed)
    jd_vec = make_jd_encoder()(jd_embed)

    x = Concatenate()([resume_vec, jd_vec])
    x = BatchNormalization()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(3, activation='softmax')(x)

    model = Model(inputs=[resume_input, jd_input], outputs=output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model


# %% Model 0: LSTM
model_lstm = _build_match_model(lambda: LSTM(64), lambda: LSTM(64))

# %% Model 1: BiLSTM
model_bilstm = _build_match_model(lambda: Bidirectional(LSTM(64)), lambda: Bidirectional(LSTM(64)))

# %% Model 2: BiGRU
model_bigru = _build_match_model(lambda: Bidirectional(GRU(64)), lambda: Bidirectional(GRU(64)))

# %% Model 3: Hybrid BiLSTM + BiGRU (BiLSTM encodes the resume, BiGRU encodes the JD)
model_hybrid = _build_match_model(lambda: Bidirectional(LSTM(64)), lambda: Bidirectional(GRU(64)))

# %% Train All Models
# All four models share the same inputs, validation data and fit settings.
train_inputs = [X_resume_pad_train, X_jd_pad_train]
fit_options = dict(
    validation_data=([X_resume_pad_val, X_jd_pad_val], y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
)

print("\nTraining LSTM Model")
history_lstm = model_lstm.fit(train_inputs, y_train, **fit_options)

print("\nTraining BiLSTM Model")
history_bilstm = model_bilstm.fit(train_inputs, y_train, **fit_options)

print("\nTraining BiGRU Model")
history_bigru = model_bigru.fit(train_inputs, y_train, **fit_options)

print("\nTraining Hybrid Model")
history_hybrid = model_hybrid.fit(train_inputs, y_train, **fit_options)

# %% Evaluate All Models
def evaluate(model, name):
    """Print a classification report and confusion matrix on the validation split.

    Parameters
    ----------
    model : fitted Keras model taking [resume_ids, jd_ids] and returning class probabilities.
    name : str
        Label printed in the section header.
    """
    # Imported here because the notebook never imports these two names elsewhere.
    from sklearn.metrics import classification_report, confusion_matrix

    print(f"\n--- Evaluation for {name} ---")
    y_pred = model.predict([X_resume_pad_val, X_jd_pad_val])
    # Most probable class per row.
    y_pred_class = np.argmax(y_pred, axis=1)
    print(classification_report(y_val, y_pred_class, target_names=['Good Fit', 'No Fit', 'Potential Fit']))
    print(confusion_matrix(y_val, y_pred_class))

evaluate(model_lstm, "LSTM")
evaluate(model_bilstm, "BiLSTM")
evaluate(model_bigru, "BiGRU")
evaluate(model_hybrid, "Hybrid BiLSTM+GRU")



In [None]:
# HYPER PARAMETER TUNING ON BILSTM

def build_lstm_model(hp):
    """Model-builder for the tuner: a BiLSTM resume/JD classifier.

    Hyper-parameters drawn from `hp`: 'lstm_units', 'dense_units', 'dropout1',
    'dropout2' and 'learning_rate'. The second dense layer has half the width
    of the first.

    Returns the compiled model.
    """
    # Search space (declared in the same order as before).
    n_lstm = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    n_dense = hp.Int('dense_units', min_value=64, max_value=256, step=64)
    drop_first = hp.Float('dropout1', 0.3, 0.4, step=0.1)
    drop_second = hp.Float('dropout2', 0.3, 0.4, step=0.1)
    lr = hp.Choice('learning_rate', [1e-3, 1e-4])

    # One bidirectional LSTM encoder per input branch.
    encoded = [
        Bidirectional(LSTM(n_lstm))(resume_embed),
        Bidirectional(LSTM(n_lstm))(jd_embed),
    ]

    hidden = BatchNormalization()(Concatenate()(encoded))
    for width, rate in ((n_dense, drop_first), (n_dense // 2, drop_second)):
        hidden = Dense(width, activation='relu')(hidden)
        hidden = Dropout(rate)(hidden)
    probs = Dense(3, activation='softmax')(hidden)

    tuned = Model(inputs=[resume_input, jd_input], outputs=probs)
    tuned.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(lr),
        metrics=['accuracy']
    )
    return tuned

# `kt` is used below but was never imported anywhere in the notebook.
import keras_tuner as kt

# Hyperband search over build_lstm_model's space, maximising validation accuracy.
tuner = kt.Hyperband(
    build_lstm_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='kt_dir',
    project_name='bilstm_tuning'
)

tuner.search(
    [X_resume_pad_train, X_jd_pad_train], y_train,
    validation_data=([X_resume_pad_val, X_jd_pad_val], y_val),
    epochs=10,
    callbacks=[early_stop],
    verbose=1
)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\nBest Hyperparameters:")
for param in best_hps.values:
    print(f"{param}: {best_hps.get(param)}")

# Rebuild a fresh model with the winning hyper-parameters and train it from
# scratch. This rebinds `model_bilstm`, replacing any earlier model of that name.
model_bilstm = tuner.hypermodel.build(best_hps)
history = model_bilstm.fit(
    [X_resume_pad_train, X_jd_pad_train], y_train,
    validation_data=([X_resume_pad_val, X_jd_pad_val], y_val),
    epochs=10,
    callbacks=[early_stop]
)

def evaluate(model, name):
    """Print a classification report and confusion matrix on the validation split.

    Parameters
    ----------
    model : fitted Keras model taking [resume_ids, jd_ids] and returning class probabilities.
    name : str
        Label printed in the section header.
    """
    # Imported here because the notebook never imports these two names elsewhere.
    from sklearn.metrics import classification_report, confusion_matrix

    print(f"\n--- Evaluation for {name} ---")
    y_pred = model.predict([X_resume_pad_val, X_jd_pad_val])
    # Most probable class per row.
    y_pred_class = np.argmax(y_pred, axis=1)
    print(classification_report(y_val, y_pred_class, target_names=['Good Fit', 'No Fit', 'Potential Fit']))
    print(confusion_matrix(y_val, y_pred_class))

evaluate(model_bilstm, "Tuned BiLSTM")

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------ATS SCORE CALCULATION-------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# tfidf



import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import numpy as np

# Relative path: this is where the preprocessing cell writes the file.
df = pd.read_csv('jd_resume_cleaned_preprocessed.csv')


print(df.columns)

# Empty strings come back from the CSV as NaN, which TfidfVectorizer rejects.
jd_texts = df['processed_jd'].fillna('').tolist()
resume_texts = df['processed_resume'].fillna('').tolist()

tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# One shared vocabulary for both sides so the two vectors are comparable.
tfidf_vectorizer.fit(jd_texts + resume_texts)

jd_tfidf = tfidf_vectorizer.transform(jd_texts)
resume_tfidf = tfidf_vectorizer.transform(resume_texts)

# TfidfVectorizer L2-normalises every row by default, so the cosine similarity
# of row i of one matrix with row i of the other is just their dot product.
# Computing it for all rows at once replaces one cosine_similarity call per row.
# All-zero rows (empty documents) score 0.
similarity_scores = np.asarray(jd_tfidf.multiply(resume_tfidf).sum(axis=1)).ravel()

df['similarity_score'] = similarity_scores

print(df[['jd', 'resume', 'similarity_score']].head())

df.to_csv('jd_resume_with_similarity.csv', index=False)

print(" Cosine similarity applied successfully! File saved as 'jd_resume_with_similarity.csv'")


In [None]:
# using llm

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


nltk.download('punkt')
# NOTE(review): recent NLTK releases make word_tokenize load its tables from
# the 'punkt_tab' resource rather than 'punkt'; fetch both so the cell works on
# either. Confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

df = pd.read_csv("jd_resume_cleaned.csv")

stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Prepare one document for sentence-embedding models.

    Removes URLs, replaces every non-letter character with a space, lowercases,
    tokenises with NLTK, and drops English stop words and one-character tokens.

    Parameters
    ----------
    text : str or missing value
        Raw text; missing values yield "".

    Returns
    -------
    str
        Space-joined remaining tokens.
    """
    if pd.isnull(text):
        return ""
    # The dot after "www" is escaped: unescaped, it matched any character, so
    # ordinary words that merely start with "www" were deleted as URLs.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^A-Za-z\s]", " ", text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return " ".join(tokens)

df["processed_jd"] = df["jd"].apply(preprocess)
df["processed_resume"] = df["resume"].apply(preprocess)

# Sentence-embedding models to compare, by their sentence-transformers name.
model_names = [
    "all-mpnet-base-v2",
    "paraphrase-MiniLM-L6-v2",
    "all-MiniLM-L12-v1",
    "all-roberta-large-v1",
    "bert-base-nli-mean-tokens"
]

similarity_data = {"label": df["label"]}

for model_name in model_names:
    print(f"\n🔍 Encoding with {model_name}...")
    model = SentenceTransformer(model_name)

    jd_embeddings = model.encode(df["processed_jd"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    resume_embeddings = model.encode(df["processed_resume"].tolist(), convert_to_tensor=True, show_progress_bar=True)

    # Only the similarity of JD i with resume i is needed. pairwise_cos_sim
    # computes exactly those N values, whereas cos_sim(...).diagonal() builds
    # the full N x N matrix first and discards everything off the diagonal.
    sim_scores = util.pairwise_cos_sim(jd_embeddings, resume_embeddings).cpu().numpy()
    similarity_data[f"{model_name}_score"] = sim_scores

# One column of scores per model, plus the label column.
result_df = pd.DataFrame(similarity_data)

# Per-label distribution of every model's scores.
summary_stats = result_df.groupby("label").agg(["count", "mean", "median", "std", "min", "max"])

result_df.to_csv("ats_similarity_scores_all_models.csv", index=False)
summary_stats.to_csv("ats_similarity_stats_by_label.csv")

print("\n📊 Similarity Score Summary Stats by Fit Label:")
print(summary_stats)

In [None]:
# UI
# Streamlit app: final code
import streamlit as st
import google.generativeai as genai
import json
import re
import time
import random
import fitz  # PyMuPDF

# Configure Gemini API
# SECURITY: the API key must never be committed in source. It is read from the
# GEMINI_API_KEY environment variable; the key that used to be hard-coded on
# this line has been exposed and should be revoked and rotated.
import os

_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before starting the app.")
genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("models/gemini-1.5-pro-latest")

# Utility Functions
def clean_json_block(text):
    """Return the contents of the first triple-backtick fenced block in `text`.

    The prompts ask the model to wrap its JSON in triple backticks, optionally
    tagged ```json. This extracts what lies between the fences, stripped of
    surrounding whitespace.

    Raises
    ------
    ValueError
        If `text` contains no fenced block.
    """
    # The fences are written as `{3} so the pattern survives tools that strip
    # literal triple backticks. Without the fences the pattern matched an empty
    # string at position 0, so this function always returned "".
    match = re.search(r"`{3}(?:json)?\s*(.*?)\s*`{3}", text, re.DOTALL | re.IGNORECASE)
    if not match:
        raise ValueError("No valid JSON block found in Gemini response.")
    return match.group(1).strip()

def extract_keywords(jd_text):
    """Ask the Gemini model for the key skills/tools in a job description.

    Sends a prompt containing `jd_text`, extracts the fenced block from the
    reply with clean_json_block, and returns it parsed as JSON (the prompt asks
    for a list of keyword strings).
    """
    prompt = f"""
    Extract the most important hard and soft skills, tools, and technologies from the following job description.
    Return the result as a JSON list of keywords (e.g., ["Python", "Machine Learning", "Communication"]).

    JD:
    {jd_text}

    Wrap the list in triple backticks so it is easy to parse.
    """
    raw_reply = model.generate_content(prompt).text.strip()
    json_payload = clean_json_block(raw_reply)
    return json.loads(json_payload)

def generate_variations(keywords):
    """Ask the Gemini model for synonyms/variations of each keyword.

    Sends a prompt containing `keywords`, extracts the fenced block from the
    reply with clean_json_block, and returns it parsed as JSON (the prompt asks
    for a dictionary mapping each keyword to a list of variations).
    """
    prompt = f"""
    Given the following list of keywords, return a JSON dictionary where each keyword maps to 3-5 variations or synonyms.
    Example format: {{ "Python": ["Python 3", "Python programming", "Scripting"], ... }}

    Keywords: {keywords}

    Wrap the dictionary in triple backticks.
    """
    raw_reply = model.generate_content(prompt).text.strip()
    json_payload = clean_json_block(raw_reply)
    return json.loads(json_payload)

def calculate_ats_score(resume, variations_dict):
    """Score how many JD keywords (or any of their variations) occur in a resume.

    Parameters
    ----------
    resume : str
        Resume text; matching is case-insensitive.
    variations_dict : dict
        Maps each keyword to its variations (a list of strings; a single string
        or None is also tolerated).

    Returns
    -------
    (score, missing_keywords)
        `score` is the percentage of keywords found, rounded to 2 decimals
        (0 when the dictionary is empty); `missing_keywords` lists the keywords
        for which neither the keyword nor any variation was found, in
        dictionary order.
    """
    resume = resume.lower()
    matched = 0
    missing_keywords = []

    for keyword, variations in variations_dict.items():
        if variations is None:
            variations = []
        elif isinstance(variations, str):
            # A bare string would otherwise be iterated character by character.
            variations = [variations]

        found = False
        for term in [keyword, *variations]:
            term = str(term).strip().lower()
            if not term:
                # An empty term would match every resume.
                continue
            # Whole-term match. Lookarounds are used instead of \b because \b
            # needs a word character on one side and therefore never matches
            # next to terms such as "c++", "c#" or ".net".
            pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
            if re.search(pattern, resume):
                found = True
                break

        if found:
            matched += 1
        else:
            missing_keywords.append(keyword)

    total = len(variations_dict)
    score = round((matched / total) * 100, 2) if total else 0
    return score, missing_keywords

def safe_gemini_call(func, *args, retries=3, delay=2):
    """Call `func(*args)`, retrying when the error looks like rate limiting.

    Parameters
    ----------
    func : callable
        The function to call.
    *args
        Positional arguments forwarded to `func`.
    retries : int
        Maximum number of attempts.
    delay : float
        Base delay in seconds. After a success the function sleeps
        `delay` plus 0.5-1.5 s of jitter before returning (throttling between
        consecutive calls); after a "429" error it sleeps `delay * attempt_number`.

    Returns
    -------
    Whatever `func` returns.

    Raises
    ------
    RuntimeError
        If every attempt failed with an error whose message contains "429".
    Exception
        Any other error from `func` is re-raised immediately.
    """
    last_error = None
    for attempt in range(retries):
        try:
            result = func(*args)
        except Exception as e:
            if "429" not in str(e):
                raise
            last_error = e
            # Linear back-off: wait longer after each rate-limited attempt.
            time.sleep(delay * (attempt + 1))
        else:
            time.sleep(delay + random.uniform(0.5, 1.5))
            return result
    # `__name__`, not `_name_`: the misspelling raised AttributeError here and
    # hid the intended error. Callables without a name fall back to their repr.
    func_name = getattr(func, "__name__", repr(func))
    raise RuntimeError(f"Failed after {retries} retries: {func_name}") from last_error

def extract_text_from_pdf(uploaded_pdf):
    """Return the text of every page of an uploaded PDF, concatenated in page order.

    `uploaded_pdf` must expose `.read()` returning the PDF bytes; the document
    is opened from memory with PyMuPDF and closed on exit.
    """
    pdf_bytes = uploaded_pdf.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_doc:
        return "".join(page.get_text() for page in pdf_doc)

# ---------------------- Streamlit UI ----------------------

def _show_in_columns(items, n_cols=3):
    """Render each item as italic markdown, filling `n_cols` columns round-robin."""
    columns = st.columns(n_cols)
    for position, item in enumerate(items):
        with columns[position % n_cols]:
            st.markdown(f" *{item}*")


st.set_page_config(page_title="ATS Score Checker", layout="centered")
st.title(" Resume vs JD ATS Score Checker")
st.markdown("Upload a *resume (PDF)* and a *job description (TXT)* to check your ATS match score.")

resume_file = st.file_uploader(" Upload Resume (PDF only)", type=["pdf"])
jd_file = st.file_uploader("Upload Job Description (TXT only)", type=["txt"])

# The score is computed only when the button is pressed AND both files are
# present; in every other case the hint at the bottom is shown.
if st.button("Calculate ATS Score") and resume_file and jd_file:
    try:
        resume_text = extract_text_from_pdf(resume_file)
        jd_text = jd_file.read().decode("utf-8", errors="ignore")

        with st.spinner("Extracting keywords and scoring..."):
            keywords = safe_gemini_call(extract_keywords, jd_text)
            variations = safe_gemini_call(generate_variations, keywords)
            score, missing_keywords = calculate_ats_score(resume_text, variations)

        st.success(f" ATS Score: *{score}%*")
        # Verdict thresholds: >= 70 great, >= 50 moderate, otherwise low.
        if score >= 70:
            verdict = " *Great match!* Your resume aligns well with the JD."
        elif score >= 50:
            verdict = "*Moderate match.* Consider tweaking your resume to better align with the JD."
        else:
            verdict = " *Low match.* Improve your resume by including more relevant skills or keywords."
        st.markdown(verdict)

        # ----- Extracted Keywords -----
        st.markdown("---")
        st.subheader(" Extracted Keywords from JD")
        _show_in_columns(keywords)

        # ----- Keyword Variations -----
        st.markdown("---")
        st.subheader(" Keyword Variations / Synonyms")
        for keyword, synonyms in variations.items():
            st.markdown(f"🔹 *{keyword}*: {', '.join(synonyms)}")

        # ----- Missing Keywords -----
        st.markdown("---")
        st.subheader(" Missing Keywords (Consider adding to your resume)")
        if not missing_keywords:
            st.success(" All JD keywords are present in your resume!")
        else:
            _show_in_columns(missing_keywords)

    except Exception as e:
        # Surface any failure (PDF parsing, API call, JSON parsing) in the page.
        st.error(f"An error occurred: {e}")
else:
    st.info("Please upload both *Resume (PDF)* and *JD (TXT)* to calculate the score.")