In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

# --- Configuration ---
NUM_USERS = 10000
NUM_JOBS = 500
NUM_INTERACTIONS = 2000000  # ~1M positive, ~1M negative

# Define skill columns
SKILLS = ['C++', 'Python', 'Java', 'SQL', 'Excel', 'JavaScript', 'HTML', 'CSS', 'Machine Learning', 'Data Analysis', 'Project Management']

print(f"Synthesizing {NUM_USERS} users, {NUM_JOBS} jobs, and {NUM_INTERACTIONS} interactions...")

# --- 1. Synthesize users.csv ---
user_data = {
    'user_id': range(NUM_USERS),
    'experience': np.random.randint(0, 25, size=NUM_USERS),
    'salary_expectation': np.random.randint(40000, 150000, size=NUM_USERS)
}
# Generate random skill proficiencies (0-10)
user_skills = np.random.rand(NUM_USERS, len(SKILLS)) * 10
for i, skill in enumerate(SKILLS):
    user_data[f'{skill}_proficiency'] = user_skills[:, i]

users_df = pd.DataFrame(user_data)
users_df.to_csv('users.csv', index=False)
print("users.csv created.")

# --- 2. Synthesize jobs.csv ---
job_data = {
    'job_id': range(NUM_JOBS),
    'experience_required': np.random.randint(0, 15, size=NUM_JOBS),
}
# Generate salary bands
job_data['salary_min'] = np.random.randint(30000, 120000, size=NUM_JOBS)
job_data['salary_max'] = job_data['salary_min'] + np.random.randint(10000, 50000, size=NUM_JOBS)

# Generate skill expectations (0-10)
job_skills = np.random.rand(NUM_JOBS, len(SKILLS)) * 10
for i, skill in enumerate(SKILLS):
    job_data[f'{skill}_expected'] = job_skills[:, i]

jobs_df = pd.DataFrame(job_data)
jobs_df.to_csv('jobs.csv', index=False)
print("jobs.csv created.")


# --- 3. Synthesize interactions.csv (The "Ground Truth") ---
print("--- Part 3: Synthesizing Interactions ---")

# Get skill matrices (assuming users_df and jobs_df exist)
user_skill_matrix = users_df[[f'{s}_proficiency' for s in SKILLS]].values
job_skill_matrix = jobs_df[[f'{s}_expected' for s in SKILLS]].values

# Normalize for cosine similarity
user_skill_matrix_norm = normalize(user_skill_matrix)
job_skill_matrix_norm = normalize(job_skill_matrix)

# We will generate positive and negative samples in separate lists
interaction_data_pos = []
interaction_data_neg = []

# --- Generate positive samples (good matches) ---
print(f"Generating {NUM_INTERACTIONS // 2} positive samples...")
for _ in range(NUM_INTERACTIONS // 2):
    user_id = np.random.randint(0, NUM_USERS)
    user = users_df.iloc[user_id]
    user_skills_vec = user_skill_matrix_norm[user_id]

    skill_similarities = job_skill_matrix_norm.dot(user_skills_vec)
    top_k_indices = np.argsort(skill_similarities)[-10:] # Get top 10

    found_perfect_match = False
    best_skill_job_id = top_k_indices[-1] # Fallback

    for job_id in reversed(top_k_indices):
        job = jobs_df.iloc[job_id]
        exp_match = user['experience'] >= job['experience_required']
        sal_match = (user['salary_expectation'] >= job['salary_min']) and \
                    (user['salary_expectation'] <= job['salary_max'])

        if exp_match and sal_match:
            interaction_data_pos.append({'user_id': user_id, 'job_id': job_id, 'shortlisted': 1})
            found_perfect_match = True
            break

    if not found_perfect_match:
        interaction_data_pos.append({'user_id': user_id, 'job_id': best_skill_job_id, 'shortlisted': 1})

# --- Generate negative samples (random/bad matches) ---
print(f"Generating {NUM_INTERACTIONS // 2} negative samples...")
for _ in range(NUM_INTERACTIONS // 2):
    user_id = np.random.randint(0, NUM_USERS)
    job_id = np.random.randint(0, NUM_JOBS)

    user = users_df.iloc[user_id]
    job = jobs_df.iloc[job_id]

    if user['experience'] < job['experience_required']:
        shortlisted = 0
    elif user['salary_expectation'] > job['salary_max'] + 20000:
        shortlisted = 0
    else:
        shortlisted = 0

    interaction_data_neg.append({'user_id': user_id, 'job_id': job_id, 'shortlisted': shortlisted})

# --- NEW: Create a balanced 40/60 dataset ---
print("Balancing and cleaning dataset for 40/60 split...")

# Convert to DataFrames and drop duplicates from each list
pos_df = pd.DataFrame(interaction_data_pos).drop_duplicates(subset=['user_id', 'job_id'])
neg_df = pd.DataFrame(interaction_data_neg).drop_duplicates(subset=['user_id', 'job_id'])

# Make sure no positive samples are in the negative list
neg_df = neg_df.merge(pos_df[['user_id', 'job_id']], on=['user_id', 'job_id'], how='left', indicator=True)
neg_df = neg_df[neg_df['_merge'] == 'left_only'].drop(columns=['_merge'])

# We have clean pools. Now, let's balance to 40% positive (0.4) and 60% negative (0.6).
# Ratio is 0.4 / 0.6 = 2/3. We need 2 positive samples for every 3 negative ones.
pos_clean_count = len(pos_df)
neg_clean_count = len(neg_df)
print(f"Cleaned samples: {pos_clean_count} positive, {neg_clean_count} negative.")

# Let's find the limiting factor.
# We want N_pos = (2/3) * N_neg

# Case 1: Positives are the bottleneck.
# We use all available positives and find how many negatives we need.
required_neg_count = int(pos_clean_count * (0.6 / 0.4)) # or 1.5 * pos_clean_count

if neg_clean_count >= required_neg_count:
    print(f"Using all {pos_clean_count} positive samples, downsampling negatives...")
    final_pos_df = pos_df
    final_neg_df = neg_df.sample(required_neg_count)
else:
    # Case 2: Negatives are the bottleneck.
    # We use all available negatives and find how many positives we need.
    print(f"Using all {neg_clean_count} negative samples, downsampling positives...")
    required_pos_count = int(neg_clean_count * (0.4 / 0.6)) # or (2/3) * neg_clean_count

    final_neg_df = neg_df
    final_pos_df = pos_df.sample(required_pos_count)

# Combine and shuffle
interactions_df = pd.concat([final_pos_df, final_neg_df]).sample(frac=1).reset_index(drop=True)

# Save the final file
interactions_df.to_csv('interactions.csv', index=False)

print("interactions.csv created.")
print(f"Total interactions: {len(interactions_df)}")
print(f"Positive interactions: {interactions_df['shortlisted'].sum()} ({(interactions_df['shortlisted'].sum() / len(interactions_df) * 100):.1f}%)")

Synthesizing 10000 users, 500 jobs, and 2000000 interactions...
users.csv created.
jobs.csv created.
--- Part 3: Synthesizing Interactions ---
Generating 1000000 positive samples...
Generating 1000000 negative samples...
Balancing and cleaning dataset for 40/60 split...
Cleaned samples: 10000 positive, 904774 negative.
Using all 10000 positive samples, downsampling negatives...
interactions.csv created.
Total interactions: 24999
Positive interactions: 10000 (40.0%)
