In [1]:
import numpy as np
import json
import re
from settings import settings
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import PyPDF2
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize


def remove_punctuation(text):
    """Return ``text`` without punctuation.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is deleted; nothing is inserted in its place.
    """
    non_word_or_space = re.compile(r"[^\w\s]")
    return non_word_or_space.sub("", text)


def extract_tokens_from_regular_input(query):
    """Turn free text into a list of lower-cased, punctuation-free tokens.

    The text is lower-cased, cleaned with ``remove_punctuation`` and split on
    whitespace; an empty or all-punctuation input yields an empty list.
    """
    lowered = query.lower()
    cleaned = remove_punctuation(lowered)
    return cleaned.strip().split()


def extract_tokens_from_file_input(file):
    """Read a PDF and return its text as a list of cleaned tokens.

    ``file`` is handed to ``PyPDF2.PdfReader`` as-is. The text of all pages is
    concatenated without a separator and then tokenised the same way as a
    typed query.
    """
    reader = PyPDF2.PdfReader(file)
    # A falsy extract_text() result (page with no extractable text) counts as "".
    page_texts = [page.extract_text() or "" for page in reader.pages]
    resume_text = "".join(page_texts)
    cleaned_text = remove_punctuation(resume_text)
    return extract_tokens_from_regular_input(cleaned_text)


def extract_tokens_from_docs(doc):
    """Build the searchable text for one job posting.

    The ``company``, ``role``, ``country``, ``city`` and ``skills`` values of
    ``doc`` are joined with single spaces, lower-cased and stripped of
    punctuation. Despite the name, the result is one string, not a token list.
    """
    fields = ("company", "role", "country", "city", "skills")
    combined = " ".join([doc[field] for field in fields])
    return remove_punctuation(combined.lower())


def tokenize_docs(doc):
    """Split already-cleaned document text on whitespace.

    Passed as the ``tokenizer`` of the TF-IDF vectorizer built in this module.
    """
    tokens = doc.split()
    return tokens


def construct_invertex_index(vectorizer, tfidf_matrix, n_components):
    """Build a term lookup table and an inverted index from a TF-IDF matrix.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        tfidf_matrix: document-by-term matrix supporting ``nonzero()`` and
            ``[row, col]`` indexing.
        n_components: exclusive upper bound on the feature index kept in the
            returned ``terms_indices`` mapping.

    Returns:
        ``(terms_indices, inverted_index)`` where ``terms_indices`` maps a term
        to its column index (only for indices below ``n_components``) and
        ``inverted_index`` maps every term to a list of ``(row, weight)``
        pairs, one per non-zero entry of that term's column.
    """
    feature_names = vectorizer.get_feature_names_out()
    # NOTE(review): this filter compares a vocabulary column index with
    # n_components, so it keeps the first n_components terms in vocabulary
    # order. Confirm that this is the intended meaning of the parameter.
    terms_indices = {
        term: idx for idx, term in enumerate(feature_names) if idx < n_components
    }
    inverted_index = defaultdict(list)
    rows, cols = tfidf_matrix.nonzero()
    # The inverted index is deliberately not limited by n_components.
    for row, col in zip(rows, cols):
        inverted_index[feature_names[col]].append((row, tfidf_matrix[row, col]))
    return terms_indices, inverted_index


def construct_term_idf_map(vectorizer):
    """Map each vocabulary term of a fitted vectorizer to its IDF weight.

    Terms come from ``get_feature_names_out()`` and are paired positionally
    with the entries of ``vectorizer.idf_``.
    """
    terms = vectorizer.get_feature_names_out()
    return {term: idf for term, idf in zip(terms, vectorizer.idf_)}


def construct_query_tfidf(query, idf_map):
    """Compute raw TF-IDF weights for the query terms known to ``idf_map``.

    The query is tokenised with ``extract_tokens_from_regular_input``; each
    token present in ``idf_map`` is weighted by its count in the query times
    its IDF. Tokens missing from ``idf_map`` are left out of the result.
    """
    term_counts = Counter(extract_tokens_from_regular_input(query))
    weights = {}
    for term, freq in term_counts.items():
        if term not in idf_map:
            continue
        weights[term] = freq * idf_map.get(term, 0)
    return weights


def compute_cosine_scores(query, top_n=200):
    """Rank documents by cosine similarity to ``query`` in the SVD space.

    The query is vectorised with the module-level ``vectorizer``, projected
    through ``words_compressed``, L2-normalised and compared against the
    normalised document embeddings.

    Args:
        query: free-text query string.
        top_n: maximum number of results to return (default 200).

    Returns:
        A list of ``(document_index, similarity)`` pairs, best match first.
    """
    query_tfidf = vectorizer.transform([query]).toarray()
    query_vec = normalize(np.dot(query_tfidf, words_compressed)).squeeze()
    sims = docs_compressed_normed.dot(query_vec)
    ranked = np.argsort(-sims)[:top_n]
    # The query is not itself a document, so the top-ranked entry is a real
    # match and must be kept (the previous ``asort[1:]`` discarded it).
    return [(i, sims[i]) for i in ranked]



def compute_greatest_sim(query):
    """Return each document's per-component contribution to its cosine score.

    The query is embedded exactly as in ``compute_cosine_scores``; the result
    is the element-wise product of every normalised document embedding with
    the query embedding, so summing a row gives that document's cosine score.

    NOTE(review): the original body computed this product with a
    ``.multiply`` method (not available on the ndarray produced by
    ``normalize``) and returned nothing. Returning the product matrix is an
    assumption about the intended result; confirm with the author.
    """
    query_tfidf = vectorizer.transform([query]).toarray()
    query_vec = normalize(np.dot(query_tfidf, words_compressed)).squeeze()
    # Broadcasting multiplies every document row by the query vector.
    sims = np.multiply(docs_compressed_normed, query_vec)
    return sims

def closest_projects_to_word(word_in, k=5):
    """Return the ``k`` postings whose embeddings are closest to a single word.

    Args:
        word_in: vocabulary term to look up; unknown terms give ``[]``.
        k: number of postings to return.

    Returns:
        A list of ``(document_index, role, similarity)`` tuples, best first.
    """
    if word_in not in word_to_index:
        return []
    sims = docs_compressed_normed.dot(
        words_compressed_normed[word_to_index[word_in], :]
    )
    # A word is never one of the documents, so rank 1 is a genuine match and
    # is kept (the previous ``[: k + 1]`` / ``[1:]`` pair dropped it).
    ranked = np.argsort(-sims)[:k]
    # NOTE(review): postings are dicts loaded from JSON, so the original
    # ``documents[i][0]`` raised KeyError. "role" is used as the label here;
    # confirm that this is the field callers want.
    return [(i, documents[i]["role"], sims[i]) for i in ranked]


# Load data and prepare TF-IDF and SVD
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# load independent of the platform's default locale encoding.
with open(settings.data_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)
documents = data.get("job_postings")

vectorizer = TfidfVectorizer(tokenizer=tokenize_docs, stop_words="english")

# Rows are postings, columns are vocabulary terms.
td_matrix = vectorizer.fit_transform(
    [extract_tokens_from_docs(doc) for doc in documents]
)

# svds needs k strictly below the smaller matrix dimension, so the target of
# 300 components is capped to stay valid for a small corpus or vocabulary.
n_svd_components = min(300, min(td_matrix.shape) - 1)
docs_compressed, s, words_compressed = svds(td_matrix, k=n_svd_components)
# After the transpose each row of words_compressed is one term's embedding.
words_compressed = words_compressed.transpose()

word_to_index = vectorizer.vocabulary_
index_to_word = {i: t for t, i in word_to_index.items()}

words_compressed_normed = normalize(words_compressed, axis=1)


# Dense, row-normalised term-by-document matrix.
# NOTE(review): nothing in the visible code reads td_matrix_np and it
# materialises the full matrix in memory; drop it if no other module uses it.
td_matrix_np = td_matrix.transpose().toarray()
td_matrix_np = normalize(td_matrix_np)


# Unit-length document embeddings, so a dot product is a cosine similarity.
docs_compressed_normed = normalize(docs_compressed)


In [2]:
from index import documents, compute_cosine_scores, extract_tokens_from_file_input


def group_postings_by_company(postings):
    """Bucket postings by their ``company`` value.

    Returns a dict keyed by company name. Each value carries the company's
    ``name``, the ``description``, ``rating`` and ``happiness`` taken from the
    first posting seen for that company, and the list of all its ``postings``
    in input order.
    """
    by_company = {}

    for posting in postings:
        name = posting["company"]
        entry = by_company.get(name)

        if entry is None:
            # First posting for this company supplies the company-level fields.
            by_company[name] = {
                "name": name,
                "description": posting["description"],
                "rating": posting["rating"],
                "happiness": posting["happiness"],
                "postings": [posting],
            }
            continue

        entry["postings"].append(posting)

    return by_company


def select_similar_k_docs(doc_scores_map, k=25):
    """Return at most ``k`` postings, each annotated with its similarity score.

    Args:
        doc_scores_map: iterable of ``(document_index, score)`` pairs, already
            ordered best first.
        k: maximum number of postings to return.

    Returns:
        A list of shallow copies of the matching ``documents`` entries, each
        with an added ``"score"`` key. Copies are returned so the shared
        ``documents`` list does not accumulate scores from earlier queries.
    """
    res = []

    for doc, score in doc_scores_map:
        # Previously k was compared to 0 but never decremented, so the limit
        # had no effect; enforce it on the number of collected results.
        if len(res) >= k:
            break
        res.append({**documents[doc], "score": score})

    return res


def get_postings_regular_input(text, k=25):
    """Score all postings against free-text input and select the results.

    ``text`` is scored with ``compute_cosine_scores`` and the scores are passed
    to ``select_similar_k_docs`` together with ``k``.
    """
    return select_similar_k_docs(compute_cosine_scores(text), k)


def get_postings_file_input(file, k=25):
    """Score all postings against an uploaded PDF and select the results.

    The file is tokenised with ``extract_tokens_from_file_input``; the tokens
    are re-joined with single spaces to form the query text that is scored.
    """
    tokens = extract_tokens_from_file_input(file)
    query_text = " ".join(tokens)
    return select_similar_k_docs(compute_cosine_scores(query_text), k)


In [247]:
# Sample free-text query (a backend-engineering skills blurb) used by the
# analysis cells that follow.
query = "Proficiency in one or more backend programming languages (e.g., Java, Python, Node.js, Ruby) API development Database management (SQL or NoSQL) Server management and deployment Security best practices Knowledge of web frameworks (e.g., Express, Django)"

In [248]:
# query = "Highly skilled and compassionate Registered Nurse with over 7 years of experience in various healthcare settings. Proficient in providing comprehensive patient care, including assessment, planning, and implementation of nursing interventions. Strong ability to communicate effectively with patients, families, and interdisciplinary teams to ensure optimal patient outcomes. Experienced in managing diverse patient populations, ranging from pediatrics to geriatrics, with a focus on delivering evidence-based care. Proven track record of excellence in clinical skills, including medication administration, wound care, and patient education. Dedicated to continuous professional development and staying abreast of the latest advancements in nursing practice. Seeking to leverage my expertise and passion for patient care in a dynamic healthcare environment."

In [249]:
# Start with a query: echo the active query string so the cell output shows it.
query

'Proficiency in one or more backend programming languages (e.g., Java, Python, Node.js, Ruby) API development Database management (SQL or NoSQL) Server management and deployment Security best practices Knowledge of web frameworks (e.g., Express, Django)'

In [250]:
def repeat_vector(vector, length):
    """Stack ``length`` copies of ``vector`` as the rows of a 2-D array.

    ``vector`` is flattened into a single row first, so the result has
    ``length`` rows and one column per element of ``vector``. Used to compare
    the query against every job posting at once.
    """
    row = np.asarray(vector).reshape(1, -1)
    # Tile along the first axis only: (1, d) -> (length, d).
    return np.tile(row, (length, 1))

In [263]:
# Create the TF-IDF vector of the query (a single row) and project it into
# the SVD space, normalised to unit length.
query_tfidf = vectorizer.transform([query]).toarray()
query_vec = normalize(np.dot(query_tfidf, words_compressed)).squeeze()

# Create a repeated query vector (one copy per document) to analyze how much
# each SVD index contributes to the dot product (cosine score).
length = len(docs_compressed_normed)
repeated_query_vec = repeat_vector(query_vec, length)

# Multiply element-wise with each document: sims[i, j] is the contribution of
# SVD component j to document i's cosine score.
sims = docs_compressed_normed *repeated_query_vec

# Extract the indices that contribute the most to the cosine score and their
# values. Both sorts run on -sims, so column 0 is the largest contribution.
# NOTE: largest_values therefore holds NEGATED contributions; the later
# per-posting loop compensates by subtracting them.
largest_indices = np.argsort(-sims, axis=1)
largest_values = np.sort(-sims, axis=1)


In [264]:
# Relate every SVD component back to a word of the query.
# Vocabulary terms, in the column order of the TF-IDF matrix.
words = vectorizer.get_feature_names_out()
# Scale each word's normalised embedding by that word's TF-IDF weight in the
# query (zero for words absent from the query), then pick, per SVD component,
# the vocabulary row with the largest weighted loading.
weighted_loadings = (
    np.expand_dims(query_tfidf.squeeze(), axis=-1) * words_compressed_normed
)
max_indices = np.argmax(weighted_loadings, axis=0)
# word_representations[c] is the word chosen to label SVD component c.
word_representations = [words[i] for i in max_indices]

In [265]:
# Now calculate the words that link the query to each job posting: for every
# posting, total the score contribution of each SVD component under the word
# that labels that component, then rank the words.
most_sim_words = []
sim_scores = []
k = 5 #top k keys

# n = number of postings, m = number of SVD components.
n,m = largest_indices.shape
for i in range(n):
    # Track the contribution of each word to this posting's similarity score.
    contribution_per_index = {word: 0 for word in word_representations}
    for j in range(m):
         # largest_values stores negated contributions (it was sorted on
         # -sims), so subtracting adds the true contribution.
         contribution_per_index[word_representations[largest_indices[i,j]]] -= largest_values[i,j]   
    
    # Find the top k related words (largest total contribution first).
    top_k_words = sorted(contribution_per_index, key=contribution_per_index.get, reverse=True)[:k]
    
    # Track all the scores (not just the top k), in the same descending order;
    # they are later turned into percentages of the cosine score.
    sorted_values = sorted(contribution_per_index.values(), reverse=True)
    
    # Record the results for this posting.
    most_sim_words.append(top_k_words)
    sim_scores.append(sorted_values)

In [266]:
# Calculate similarity scores.
# NOTE: this rebinds `sims` from the per-component contribution matrix to a
# vector with one cosine score per posting.
sims = docs_compressed_normed.dot(query_vec)
p = 10
# Find the top p postings (highest cosine score first).
asort = np.argsort(-sims)[:p]
# Get their most similar words.
result = [most_sim_words[i] for i in asort]
# Get the contribution of each word (all words, sorted descending).
result_scores = [sim_scores[i] for i in asort]

In [267]:
# Top-k linking words for each of the top p postings.
result

[['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment']]

In [268]:
# Per-word contributions (descending) for each of the top p postings.
result_scores

[[0.1605850955224374,
  0.11954389323647326,
  0.10192842753713593,
  0.07137560046262677,
  0.07110194204759285,
  0.06013507384752758,
  0.04759127105422896,
  0.032375995633727096,
  0.027065218529239366,
  0.020538341848338477,
  0.019913855902829732,
  0.0194931158610988,
  0.019341526841560486,
  0.013199478364588403,
  0.01169251825094491,
  0.008753881015702044,
  0.006643672943305448],
 [0.1550773402847185,
  0.11611565935015258,
  0.09824833683573762,
  0.07029061521806557,
  0.06960791296102585,
  0.05941875012460573,
  0.04573359802746103,
  0.03225438791353622,
  0.027210103964448337,
  0.020726111409190534,
  0.02056799688652496,
  0.018881900080560406,
  0.018251402629760185,
  0.013729011713760219,
  0.012593978506605945,
  0.009639245081933066,
  0.006336817050718124],
 [0.15805159123216317,
  0.1205486188960036,
  0.09841297041159162,
  0.06470091913947988,
  0.06408199836115279,
  0.0557894552574084,
  0.04204533628898515,
  0.028598285554890038,
  0.0245709629925395

In [269]:
# One row per top posting, one column per word contribution.
score_matrix = np.asarray(result_scores)

# Total absolute contribution of all words, per posting.
sums = np.abs(score_matrix).sum(axis=1)

# Share of each word in its posting's total, as a percentage.
percentages = score_matrix / sums[:, np.newaxis] * 100


In [270]:
# Display top k: keep only the first k columns (the k largest contributions
# per posting, since each row is sorted descending) and show them.
percentages = percentages[:,:k]
percentages

array([[19.79406758, 14.73523987, 12.56391932,  8.7979115 ,  8.76417977],
       [19.51436076, 14.61156647, 12.36320848,  8.84511187,  8.75920313],
       [20.44086029, 15.59058949, 12.72777935,  8.36778952,  8.28774431],
       [20.3461879 , 14.95391364, 12.80821054,  9.13080161,  8.4825035 ],
       [19.95508066, 14.99544548, 12.5114293 ,  8.97097415,  8.86250788],
       [20.23340327, 14.89944573, 12.80924041,  9.28551509,  8.7395426 ],
       [20.58738895, 15.04492227, 13.1999679 ,  9.12942609,  8.90236896],
       [20.51748579, 15.22438606, 13.47590891,  9.29739195,  9.0024642 ],
       [20.57150673, 14.90522588, 12.08699918,  9.77345563,  8.62149338],
       [17.90289332, 15.20042988, 13.74798843,  8.62904864,  8.25231331]])

In [271]:
# Show the top-k words again, to read alongside the percentages above.
result

[['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'security', 'deployment'],
 ['backend', 'server', 'database', 'deployment', 'security'],
 ['backend', 'server', 'database', 'security', 'deployment']]

In [272]:
#Now graph using a spider plot
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

In [273]:
# Chart input for the best-matching posting: its top words and the percentage
# each contributes. The slice follows the number of words instead of a fixed
# 5, so both columns keep the same length when k is larger than 5.
data = {
    'word': result[0],
    'percent': percentages[0, :len(result[0])]
}
df = pd.DataFrame(data)

In [274]:
# Radar ("spider") chart of the top words' contribution percentages for the
# best-matching posting. The word count in the title follows the data instead
# of being fixed at 5.
layout = go.Layout(
    title=f'Contribution Percentages of Top {len(df)} Words in Your Resume'
)

fig = go.Figure(layout=layout)
fig.add_trace(
    go.Scatterpolar(
        r=df['percent'],
        theta=df['word'],
        fill='toself',
        fillcolor='lightgreen',
        name='Contribution Percentages'
    )
)


fig.show()