In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, concatenate, Multiply, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# --- 1. Define Columns ---

# Skill names; each one yields a '<skill>_proficiency' user column and a
# '<skill>_expected' job column below. Keep this list in sync with the CSVs.
SKILLS = [
    'C++',
    'Python',
    'Java',
    'SQL',
    'Excel',
    'JavaScript',
    'HTML',
    'CSS',
    'Machine Learning',
    'Data Analysis',
    'Project Management',
]

# Columns taken from the users CSV
USER_FEATURES = [
    'experience',
    'salary_expectation',
    *(skill + '_proficiency' for skill in SKILLS),
]

# Columns taken from the jobs CSV
JOB_FEATURES = [
    'experience_required',
    'salary_min',
    'salary_max',
    *(skill + '_expected' for skill in SKILLS),
]

# Everything the MLP (content path) consumes: user columns first, then job columns
CONTENT_FEATURES = [*USER_FEATURES, *JOB_FEATURES]


# --- 2. Data Preprocessing ---

print("Loading data...")
try:
    users_df = pd.read_csv('users_3.csv')
    jobs_df = pd.read_csv('jobs_3.csv')
    interactions_df = pd.read_csv('interactions_3.csv')
except FileNotFoundError:
    # The message names the files that are actually read above.
    print("Error: Make sure users_3.csv, jobs_3.csv, and interactions_3.csv are in the same directory.")
    # Re-raise rather than calling exit(): exit() is a helper for the
    # interactive shell, and the traceback also shows which file was missing.
    raise

print("Merging data...")
# One row per interaction, carrying the matching user and job columns
df = interactions_df.merge(users_df, on='user_id')
df = df.merge(jobs_df, on='job_id')

# Number of distinct users and jobs that appear in the merged interactions
n_users = df['user_id'].nunique()
n_jobs = df['job_id'].nunique()

# --- 3. Train-Test Split (row positions only) ---
print("Splitting data...")
# Decide the split BEFORE scaling so the scaler can be fit on training rows
# only. Splitting positions with the same test_size/random_state selects the
# same rows as splitting the frame itself would.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# --- Scale Content Features ---
print("Scaling features...")
# Fit mean/std on the training rows only; fitting on the full frame would leak
# test-set statistics into training. The fitted scaler is then applied to every
# row, so `df`, `train_df` and `test_df` all hold scaled content features.
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][CONTENT_FEATURES])
df[CONTENT_FEATURES] = scaler.transform(df[CONTENT_FEATURES])

train_df = df.iloc[train_pos]
test_df = df.iloc[test_pos]

# --- Prepare Keras Inputs ---
# The model has three inputs, so each data split is turned into a 3-element list.
def get_keras_inputs(dataframe):
    """Build the model's input list from a frame.

    Returns ``[user ids, job ids, content matrix]``: the 'user_id' and
    'job_id' columns as they are, followed by the CONTENT_FEATURES columns
    as a 2-D array (via ``.values``).
    """
    user_ids = dataframe['user_id']
    job_ids = dataframe['job_id']
    content_matrix = dataframe[CONTENT_FEATURES].values
    return [user_ids, job_ids, content_matrix]

# Model inputs for the training and the held-out split
X_train, X_test = (get_keras_inputs(part) for part in (train_df, test_df))

# Targets: the 'shortlisted' column of each split
y_train, y_test = train_df['shortlisted'], test_df['shortlisted']


# --- 4. Build the Hybrid Model ---

print("Building the model...")

# Latent factor dimension for GMF path
EMBEDDING_SIZE = 32

# Raw ids are fed straight into the Embedding layers, so each table needs
# (largest id + 1) rows, not just the number of distinct ids seen in the
# interactions. The id frames are used for the maximum because the
# recommendation helpers score every row of users_df / jobs_df.
# max(...) with n_users / n_jobs keeps the original size whenever it was
# already large enough.
user_vocab_size = max(n_users, int(users_df['user_id'].max()) + 1)
job_vocab_size = max(n_jobs, int(jobs_df['job_id'].max()) + 1)

# --- Path A: Collaborative (GMF) Path ---

# Input 1: User ID
user_id_input = Input(shape=(1,), name='user_id_input')
user_embedding = Embedding(input_dim=user_vocab_size,
                           output_dim=EMBEDDING_SIZE,
                           name='user_embedding')(user_id_input)
user_vec = Flatten(name='flatten_user_vec')(user_embedding)

# Input 2: Job ID
job_id_input = Input(shape=(1,), name='job_id_input')
job_embedding = Embedding(input_dim=job_vocab_size,
                          output_dim=EMBEDDING_SIZE,
                          name='job_embedding')(job_id_input)
job_vec = Flatten(name='flatten_job_vec')(job_embedding)

# GMF: element-wise product of the two latent vectors
gmf_output = Multiply(name='gmf_multiply')([user_vec, job_vec])


# --- Path B: Content (MLP) Path ---

# Input 3: Content Features (one value per CONTENT_FEATURES column)
content_input = Input(shape=(len(CONTENT_FEATURES),), name='content_features_input')

# MLP Tower: 64 -> 32 units, dropout after each dense layer
mlp = Dense(64, activation='relu', name='mlp_dense_1')(content_input)
mlp = Dropout(0.3, name='mlp_dropout_1')(mlp)
mlp = Dense(32, activation='relu', name='mlp_dense_2')(mlp)
mlp_output = Dropout(0.3, name='mlp_dropout_2')(mlp)


# --- Final Step: Fusion & Prediction ---

# Concatenate the outputs of the two paths
fused_vector = concatenate([gmf_output, mlp_output], name='fusion_concatenate')

# Final prediction layer: a single sigmoid unit
output = Dense(1, activation='sigmoid', name='output_prediction')(fused_vector)

# Create the Model; the input order matches the list built by get_keras_inputs
model = Model(inputs=[user_id_input, job_id_input, content_input], outputs=output)

model.compile(optimizer=Adam(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Print a summary
model.summary()

# --- 5. Train the Model ---
print("\nTraining the model...")
# Note: the held-out test split doubles as the validation set during fitting.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,  # Raise this if you have more time or data
    verbose=1,
    validation_data=(X_test, y_test),
)

print("\nTraining complete.")

# --- 6. Evaluate the Model ---
print("\nEvaluating on test set:")
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc)
test_loss, test_acc, test_auc = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {test_acc*100:.2f}%")
print(f"Test AUC: {test_auc:.4f}")


# --- 7. How to Use the Model for Recommendations ---

def recommend_jobs_to_user(user_id, num_recs=10):
    """Recommend the top N jobs for a given user_id.

    Every row of ``jobs_df`` is paired with the user's row from ``users_df``,
    scored by ``model``, and ranked by predicted shortlist probability.

    Args:
        user_id: Value to look up in the 'user_id' column of ``users_df``.
        num_recs: Number of rows to return.

    Returns:
        DataFrame with columns 'job_id', 'shortlist_prob',
        'experience_required' and 'salary_max', sorted by 'shortlist_prob'
        descending. The feature columns hold the raw (unscaled) values.

    Raises:
        ValueError: If ``user_id`` does not appear in ``users_df``.
    """
    print(f"\n--- Recommending top {num_recs} jobs for user_id {user_id} ---")

    # 1. Get user's features
    user_features = users_df[users_df['user_id'] == user_id]
    if user_features.empty:
        # Fail loudly: pairing an empty user frame with the jobs would only
        # produce NaN feature rows.
        raise ValueError(f"user_id {user_id} not found in users_df")

    # 2. Pair the user's row with *every* job.
    #    A cross join yields exactly len(jobs_df) rows regardless of how many
    #    jobs appeared in the interactions and regardless of jobs_df's index,
    #    so user and job columns can never be misaligned.
    rec_df = user_features.head(1).merge(jobs_df, how='cross')

    # 3. Scale the content features with the *same* scaler used for training.
    #    Scaling happens on a copy so the returned frame keeps raw values.
    scaled_df = rec_df.copy()
    scaled_df[CONTENT_FEATURES] = scaler.transform(rec_df[CONTENT_FEATURES])

    # 4. Prepare Keras inputs and 5. predict
    keras_rec_inputs = get_keras_inputs(scaled_df)
    probabilities = model.predict(keras_rec_inputs).flatten()

    # 6. Rank
    rec_df['shortlist_prob'] = probabilities
    top_jobs = rec_df.sort_values(by='shortlist_prob', ascending=False)

    return top_jobs[['job_id', 'shortlist_prob', 'experience_required', 'salary_max']].head(num_recs)

def recommend_candidates_to_job(job_id, num_recs=10):
    """Recommend the top N candidates for a given job_id.

    Every row of ``users_df`` is paired with the job's row from ``jobs_df``,
    scored by ``model``, and ranked by predicted shortlist probability.

    Args:
        job_id: Value to look up in the 'job_id' column of ``jobs_df``.
        num_recs: Number of rows to return.

    Returns:
        DataFrame with columns 'user_id', 'shortlist_prob', 'experience' and
        'salary_expectation', sorted by 'shortlist_prob' descending. The
        feature columns hold the raw (unscaled) values.

    Raises:
        ValueError: If ``job_id`` does not appear in ``jobs_df``.
    """
    print(f"\n--- Recommending top {num_recs} candidates for job_id {job_id} ---")

    # 1. Get job's features
    job_features = jobs_df[jobs_df['job_id'] == job_id]
    if job_features.empty:
        # Fail loudly: pairing an empty job frame with the users would only
        # produce NaN feature rows.
        raise ValueError(f"job_id {job_id} not found in jobs_df")

    # 2. Pair *every* user with the job's row.
    #    A cross join yields exactly len(users_df) rows regardless of how many
    #    users appeared in the interactions and regardless of users_df's
    #    index, so user and job columns can never be misaligned.
    rec_df = users_df.merge(job_features.head(1), how='cross')

    # 3. Scale the content features with the *same* scaler used for training.
    #    Scaling happens on a copy so the returned frame keeps raw values.
    scaled_df = rec_df.copy()
    scaled_df[CONTENT_FEATURES] = scaler.transform(rec_df[CONTENT_FEATURES])

    # 4. Prepare Keras inputs and 5. predict
    keras_rec_inputs = get_keras_inputs(scaled_df)
    probabilities = model.predict(keras_rec_inputs).flatten()

    # 6. Rank
    rec_df['shortlist_prob'] = probabilities
    top_users = rec_df.sort_values(by='shortlist_prob', ascending=False)

    return top_users[['user_id', 'shortlist_prob', 'experience', 'salary_expectation']].head(num_recs)

# --- Example Usage ---
# (Assumes user_id 123 and job_id 45 exist)

# Jobs ranked for one user
top_job_recs = recommend_jobs_to_user(123, num_recs=10)
print(top_job_recs)

# Candidates ranked for one job
top_candidate_recs = recommend_candidates_to_job(45, num_recs=10)
print(top_candidate_recs)

Loading data...
Merging data...
Scaling features...
Splitting data...
Building the model...



Training the model...
Epoch 1/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5618 - auc: 0.5491 - loss: 0.6987 - val_accuracy: 0.7064 - val_auc: 0.7909 - val_loss: 0.5836
Epoch 2/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6698 - auc: 0.7297 - loss: 0.5966 - val_accuracy: 0.8070 - val_auc: 0.8950 - val_loss: 0.4825
Epoch 3/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7423 - auc: 0.8215 - loss: 0.5155 - val_accuracy: 0.8894 - val_auc: 0.9601 - val_loss: 0.3580
Epoch 4/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8135 - auc: 0.8995 - loss: 0.4067 - val_accuracy: 0.9318 - val_auc: 0.9797 - val_loss: 0.2568
Epoch 5/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8539 - auc: 0.9324 - loss: 0.3327 - val_accuracy: 0.9476 - val_auc: 0.9859 - val_loss: 0.1989
Epoch 6/10
[1m7

In [3]:
import numpy as np
from sklearn.metrics import ndcg_score

print("\n--- Calculating Top-K Ranking Metrics ---")

# --- 1. Score the test set ---
# Predictions are attached to a copy so test_df itself stays untouched.
eval_df = test_df.copy()
test_preds = model.predict(X_test).flatten()
eval_df['prediction'] = test_preds

# --- 2. Cut-off ---
K = 10

# --- 3. Per-user metrics, collected for averaging ---
all_p_at_k, all_r_at_k, all_ndcg_at_k = [], [], []

user_groups = eval_df.groupby('user_id')

for user_id, group in user_groups:
    # Users with a single test row are skipped (nothing to rank), and so are
    # users without any positive test row (recall would be undefined).
    total_positives_for_user = len(group[group['shortlisted'] == 1])
    if len(group) <= 1 or total_positives_for_user == 0:
        continue

    # Labels of the K highest-scored rows for this user
    ranked = group.sort_values('prediction', ascending=False)
    true_labels_in_top_k = ranked.head(K)['shortlisted'].values
    hits_at_k = np.sum(true_labels_in_top_k)

    # Precision@K divides by K itself, so a user with fewer than K test rows
    # can never reach 1.0 here.
    all_p_at_k.append(hits_at_k / K)

    # Recall@K: share of this user's positives that made the top K
    all_r_at_k.append(hits_at_k / total_positives_for_user)

    # NDCG@K over all of the user's test rows; ndcg_score expects 2-D input
    true_relevance = group['shortlisted'].values[np.newaxis, :]
    predicted_scores = group['prediction'].values[np.newaxis, :]
    all_ndcg_at_k.append(ndcg_score(true_relevance, predicted_scores, k=K))

# --- 4. Average across the evaluated users ---
mean_p_at_k, mean_r_at_k, mean_ndcg_at_k = (
    np.mean(per_user) for per_user in (all_p_at_k, all_r_at_k, all_ndcg_at_k)
)

print(f"Mean Precision@{K}: {mean_p_at_k:.4f}")
print(f"Mean Recall@{K}: {mean_r_at_k:.4f}")
print(f"Mean NDCG@{K}: {mean_ndcg_at_k:.4f}")


--- Calculating Top-K Ranking Metrics ---
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Mean Precision@10: 0.1000
Mean Recall@10: 1.0000
Mean NDCG@10: 0.9978
