In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Read the cleaned job-vacancy and CV tables (CSV files resolved relative to the working directory)
jobreq_data, cv_data = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_vacancies_data.csv", "cleaned_cv_data.csv")
)

Translate both datasets into English first, so that language differences do not bias the similarity scores.

In [None]:
from googletrans import Translator
from langdetect import detect
# Create a single googletrans Translator instance; translate_text() below reuses it for every call
translator = Translator()

# Function to detect and translate text
def translate_text(text):
    """Return an English version of ``text`` when it is detected as Indonesian.

    The language is identified with ``detect``. Text detected as ``'id'`` is
    passed to the module-level ``translator`` (``src='id'``, ``dest='en'``)
    and the translated string is returned. Text in any other language is
    returned unchanged.

    This is a best-effort helper: any exception raised while detecting or
    translating is swallowed and the original ``text`` is returned as is.
    """
    try:
        # Guard clause: everything that is not Indonesian passes through untouched
        if detect(text) != 'id':
            return text
        return translator.translate(text, src='id', dest='en').text
    except Exception:
        # Detection or translation failed; fall back to the untranslated input
        return text
# Translate the free-text column of each dataset in place:
# job descriptions in jobreq_data and the CV body text in cv_data
for _frame, _column in ((jobreq_data, 'description'), (cv_data, 'Text')):
    _frame[_column] = _frame[_column].apply(translate_text)

Calculate the similarity between the job requirement (jobreq) dataset and the CV dataset.

In [3]:
# Build the TF-IDF representation of both corpora
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a single vectorizer on job descriptions and CV texts together so that
# both matrices share the same vocabulary (i.e. the same columns)
all_texts = pd.concat([jobreq_data['description'], cv_data['Text']])
vectorizer = TfidfVectorizer().fit(all_texts)

# Project each corpus onto the shared vocabulary
jobreq_tfidf, cv_tfidf = (
    vectorizer.transform(texts)
    for texts in (jobreq_data['description'], cv_data['Text'])
)

# Calculate cosine similarity using TensorFlow
def cosine_similarity_tf(tfidf_matrix1, tfidf_matrix2):
    """Return the pairwise cosine similarity between the rows of two matrices.

    Parameters
    ----------
    tfidf_matrix1, tfidf_matrix2 : sparse matrix or array-like
        Row-wise feature matrices with the same number of columns. Objects
        exposing ``.toarray()`` (e.g. SciPy sparse matrices) are densified
        first; anything else is handed to ``tf.constant`` unchanged.

    Returns
    -------
    tf.Tensor
        ``float32`` tensor of shape ``(rows1, rows2)`` whose ``[i, j]`` entry
        is the cosine similarity between row ``i`` of the first matrix and
        row ``j`` of the second. Pairs that involve an all-zero row (for
        example a document with no known terms) get ``0.0`` instead of the
        ``NaN`` a plain ``0 / 0`` division would produce.

    Notes
    -----
    Both inputs are converted to dense tensors, so memory use grows with
    ``rows * columns`` of each matrix.
    """
    def _to_dense_tensor(matrix):
        # Sparse inputs need an explicit densification before tf.constant
        dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
        return tf.constant(dense, dtype=tf.float32)

    tfidf_matrix1 = _to_dense_tensor(tfidf_matrix1)
    tfidf_matrix2 = _to_dense_tensor(tfidf_matrix2)

    # Numerator: dot product of every row pair -> (rows1, rows2)
    dot_product = tf.linalg.matmul(tfidf_matrix1, tfidf_matrix2, transpose_b=True)

    # Denominator: outer product of the per-row L2 norms -> (rows1, rows2)
    norm_matrix1 = tf.sqrt(tf.reduce_sum(tf.square(tfidf_matrix1), axis=1, keepdims=True))
    norm_matrix2 = tf.sqrt(tf.reduce_sum(tf.square(tfidf_matrix2), axis=1, keepdims=True))
    norm_product = tf.linalg.matmul(norm_matrix1, norm_matrix2, transpose_b=True)

    # divide_no_nan yields 0 wherever the denominator is 0, so zero-norm rows
    # cannot inject NaN into downstream ranking or training targets
    similarity = tf.math.divide_no_nan(dot_product, norm_product)
    return similarity

In [4]:
# Score every job requirement against every CV and keep a NumPy copy of the result
similarity_matrix = cosine_similarity_tf(jobreq_tfidf, cv_tfidf)
similarity_matrix_np = similarity_matrix.numpy()

# Wrap the scores in a DataFrame labelled with the source indices:
# rows follow jobreq_data, columns follow cv_data
similarity_df = pd.DataFrame(
    data=similarity_matrix_np,
    index=jobreq_data.index,
    columns=cv_data.index,
)

In [None]:
# Display the similarity table (rows indexed like jobreq_data, columns indexed like cv_data)
similarity_df

Import the libraries used to train the collaborative filtering model for candidate ranking.

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dot, Flatten
from tensorflow.keras.models import Model

In [7]:
# Convert the TensorFlow similarity tensor to a NumPy array.
# Nothing is loaded from disk here: this repeats the conversion already done
# when similarity_matrix was computed, so the cell can be re-run on its own.
similarity_matrix_np = similarity_matrix.numpy()

# Define function to prepare data for collaborative filtering
def prepare_data(similarity_matrix):
    """Flatten a job-by-candidate score matrix into a long interaction table.

    Parameters
    ----------
    similarity_matrix : array-like of shape (n_jobs, n_candidates)
        Entry ``[i, j]`` is the score of candidate ``j`` for job ``i``.

    Returns
    -------
    pandas.DataFrame
        One row per (job, candidate) pair with columns ``job_idx``,
        ``candidate_idx`` and ``points``. Rows are grouped by job in
        ascending ``job_idx`` order and, within each job, ordered from the
        highest score to the lowest. ``points`` is stored as ``float64``.

    Raises
    ------
    ValueError
        If ``similarity_matrix`` is not two-dimensional.
    """
    scores = np.asarray(similarity_matrix)
    if scores.ndim != 2:
        raise ValueError(
            "similarity_matrix must be 2-D (jobs x candidates), got shape {}".format(scores.shape)
        )
    n_rows, n_cols = scores.shape

    # Candidate positions per job, best score first (ascending argsort, reversed)
    top_candidates_indices = np.argsort(scores, axis=1)[:, ::-1]
    # Pull the scores out in that same ranked order without a Python-level loop
    ranked_scores = np.take_along_axis(scores, top_candidates_indices, axis=1)

    cv_data_with_points = pd.DataFrame({
        # Each job index is repeated once per candidate to line up with the raveled columns
        'job_idx': np.repeat(np.arange(n_rows), n_cols),
        'candidate_idx': top_candidates_indices.ravel(),
        'points': ranked_scores.ravel().astype(np.float64),
    })
    return cv_data_with_points

In [8]:
# Prepare data for collaborative filtering
cv_data_with_points = prepare_data(similarity_matrix_np)

# Convert indices to continuous IDs for embedding purposes
cv_data_with_points['job_idx'] = cv_data_with_points['job_idx'].astype('category').cat.codes
cv_data_with_points['candidate_idx'] = cv_data_with_points['candidate_idx'].astype('category').cat.codes

# Number of unique job requirements and candidates
n_jobs = cv_data_with_points['job_idx'].nunique()
n_candidates = cv_data_with_points['candidate_idx'].nunique()

# Hyper-parameters, gathered in one place so they are easy to tune
EMBEDDING_DIM = 20   # size of each candidate / job embedding vector
EPOCHS = 128
BATCH_SIZE = 128
SEED = 42

# Seed Python, NumPy and TensorFlow so embedding initialisation and batch
# shuffling are repeatable across Restart & Run All
tf.keras.utils.set_random_seed(SEED)

# Define the input layers ("user" = candidate, "item" = job requirement)
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# Define the embedding layers
user_embedding = Embedding(n_candidates, EMBEDDING_DIM, embeddings_initializer='uniform', name='user_embedding')(user_input)
item_embedding = Embedding(n_jobs, EMBEDDING_DIM, embeddings_initializer='uniform', name='item_embedding')(item_input)

# Flatten the embedding layers
user_vecs = Flatten()(user_embedding)
item_vecs = Flatten()(item_embedding)

# Compute the dot product of the embeddings.
# Kept under its own name so the label array `y` below no longer overwrites
# the model's output tensor.
dot_output = Dot(axes=1)([user_vecs, item_vecs])

# Create the model
model = Model(inputs=[user_input, item_input], outputs=dot_output)
model.compile(optimizer='adam', loss='mse')

# Prepare the data for training: column 0 -> candidate index, column 1 -> job index
X = cv_data_with_points[['candidate_idx', 'job_idx']].values
y = cv_data_with_points['points'].values

# Train the model; keeping the History object lets the loss curve be inspected later
# and stops its repr from being echoed as the cell output
history = model.fit([X[:, 0], X[:, 1]], y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<keras.src.callbacks.History at 0x1748f047d90>

In [9]:
# Save the model to an .h5 file
# NOTE(review): the truncated `saving_api.save_model(` output under this cell looks like
# Keras' warning about the legacy HDF5 format — confirm, and consider the native
# `.keras` format (the cell that reloads the model would need the same file name).
model.save('collaborative_filtering_model.h5')

  saving_api.save_model(


In [11]:
from tensorflow.keras.models import load_model
# Load the saved model from the .h5 file; this rebinds `model`, so the cells
# below use the reloaded copy rather than the instance trained in memory
model = load_model('collaborative_filtering_model.h5')

Define a recommendation function that returns the top-ranked candidate CVs (with their predicted scores) for a given job requirement.

In [12]:
# Define a function to recommend CVs for a job requirement using the trained collaborative filtering model
def recommend_cvs_for_job(jobreq_id, model, vectorizer, jobreq_data, cv_data, top_n=5):
    """Rank the CVs for one job requirement using the model's predicted points.

    Parameters
    ----------
    jobreq_id : int
        Zero-based row position of the job requirement in ``jobreq_data``.
        The same value is passed to the model as the job index.
    model : object
        Anything with ``predict([cv_indices, job_indices])`` that returns one
        score per (CV, job) pair, e.g. the trained Keras model.
    vectorizer : object
        Unused. Kept only so existing callers that pass it keep working.
    jobreq_data : pandas.DataFrame
        Job requirement table; only its length is used, to validate
        ``jobreq_id``.
    cv_data : pandas.DataFrame
        CV table with a ``'Text'`` column. Rows are addressed by position,
        matching the ``0 .. len(cv_data) - 1`` indices sent to the model.
    top_n : int, default 5
        Maximum number of CVs to return.

    Returns
    -------
    list of (str, float)
        ``(cv_text, predicted_points)`` tuples, highest score first.
        Empty when there are no CVs or ``top_n`` is not positive.

    Raises
    ------
    KeyError
        If ``jobreq_id`` is not a valid row position in ``jobreq_data``.
    """
    if not 0 <= jobreq_id < len(jobreq_data):
        raise KeyError(
            "jobreq_id {!r} is outside the range of jobreq_data (0..{})".format(
                jobreq_id, len(jobreq_data) - 1
            )
        )

    n_cvs = len(cv_data)
    if n_cvs == 0 or top_n <= 0:
        return []

    # Score every CV (by position) against the same job index
    cv_indices = np.arange(n_cvs)
    job_indices = np.repeat(jobreq_id, n_cvs)
    # Flatten to 1-D: reshape(-1) also copes with a single CV, where squeeze()
    # would collapse the result to a 0-d array
    predicted_points = np.asarray(model.predict([cv_indices, job_indices])).reshape(-1)

    # Positions of the best-scoring CVs, highest first
    top_cvs_indices = predicted_points.argsort()[::-1][:top_n]

    # Look CVs up by position (iloc), not by label: the indices sent to the
    # model are positional, so a non-default DataFrame index must not matter.
    # Scores are returned as plain floats rather than length-1 arrays.
    cv_texts = cv_data['Text']
    return [
        (cv_texts.iloc[cv_idx], float(predicted_points[cv_idx]))
        for cv_idx in top_cvs_indices
    ]

In [None]:
# Example: fetch the top 5 CVs (the function's default top_n) for one job requirement
jobreq_id = 0  # Replace with a job requirement index for testing

# Uncomment the two lines below to print the text of the selected job requirement
#jobreq_text = jobreq_data.iloc[jobreq_id]['description']
#print(jobreq_text)
top_cvs_with_scores = recommend_cvs_for_job(
    jobreq_id,
    model=model,
    vectorizer=vectorizer,
    jobreq_data=jobreq_data,
    cv_data=cv_data,
)

# Report each recommended CV together with the score the model predicted for it
print("Top CVs for Job Requirement ID {}: ".format(jobreq_id))
for cv_text, similarity_score in top_cvs_with_scores:
    print(f"CV: {cv_text}\nSimilarity Score: {similarity_score}\n")