Step 1: Synthetic Data Generation

In [9]:
import random
import pandas as pd

# Parameters for synthetic data
NUM_USERS = 1000
NUM_JOBS = 500
NUM_INTERACTIONS = 10000
SKILLS = ['Python', 'Java', 'SQL', 'Machine Learning', 'Data Analysis', 'Web Development', 'Cloud Computing', 'C++', 'Excel']

# Generate users
users = pd.DataFrame({
    'user_id': range(1, NUM_USERS + 1),
    'name': [f'User_{i}' for i in range(1, NUM_USERS + 1)],
    'skills': [', '.join(random.sample(SKILLS, random.randint(2, 5))) for _ in range(NUM_USERS)]
})

# Generate jobs
jobs = pd.DataFrame({
    'job_id': range(1, NUM_JOBS + 1),
    'title': [random.choice(['Data Scientist', 'Software Engineer', 'Analyst', 'Web Developer']) for _ in range(NUM_JOBS)],
    'skills': [', '.join(random.sample(SKILLS, random.randint(2, 5))) for _ in range(NUM_JOBS)],
    'location': [random.choice(['New York', 'San Francisco', 'Austin', 'Toronto', 'London']) for _ in range(NUM_JOBS)],
    'salary_range': [f'{random.randint(50, 100)}k-{random.randint(100, 200)}k' for _ in range(NUM_JOBS)]
})

# Generate interactions
interactions = pd.DataFrame({
    'user_id': [random.randint(1, NUM_USERS) for _ in range(NUM_INTERACTIONS)],
    'job_id': [random.randint(1, NUM_JOBS) for _ in range(NUM_INTERACTIONS)],
    'action_type': [random.choice(['view', 'apply', 'save']) for _ in range(NUM_INTERACTIONS)],
})

# Save data (optional)
users.to_csv('users.csv', index=False)
jobs.to_csv('jobs.csv', index=False)
interactions.to_csv('interactions.csv', index=False)


Step 2: Content-Based Recommendation Model


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# TF-IDF vectorizer for skills
vectorizer = TfidfVectorizer()

# Vectorize job and user skills
job_vectors = vectorizer.fit_transform(jobs['skills'])
user_vectors = vectorizer.transform(users['skills'])

# Compute similarity between users and jobs
similarity_matrix = cosine_similarity(user_vectors, job_vectors)

# Recommendation function
def recommend_jobs(user_id, similarity_matrix, jobs_df, top_n=5):
    user_idx = users[users['user_id'] == user_id].index[0]
    similar_jobs_indices = similarity_matrix[user_idx].argsort()[-top_n:][::-1]
    return jobs_df.iloc[similar_jobs_indices][['job_id', 'title', 'skills', 'location', 'salary_range']]


# Test for a specific user
user_id = 1

# Get recommendations
recommendations = recommend_jobs(user_id, similarity_matrix, jobs, top_n=5)

# Display recommendations
print(f"Top job recommendations for User {user_id}:\n", recommendations)


Top job recommendations for User 1:
      job_id              title                            skills  \
469     470     Data Scientist         SQL, Cloud Computing, C++   
414     415            Analyst              Cloud Computing, SQL   
7         8            Analyst         Cloud Computing, SQL, C++   
477     478            Analyst       SQL, Excel, Cloud Computing   
115     116  Software Engineer  Excel, Cloud Computing, C++, SQL   

          location salary_range  
469         Austin     76k-127k  
414  San Francisco     58k-179k  
7           London     51k-183k  
477         Austin     67k-126k  
115  San Francisco     69k-153k  


In [3]:
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Parameters for synthetic data
NUM_USERS = 1000
NUM_JOBS = 500
NUM_INTERACTIONS = 10000
SKILLS = ['Python', 'Java', 'SQL', 'Machine Learning', 'Data Analysis', 'Web Development', 
          'Cloud Computing', 'C++', 'Excel']
LOCATIONS = ['New York', 'San Francisco', 'Austin', 'Toronto', 'London']
COMPANIES = ['Google', 'Amazon', 'Microsoft', 'Facebook', 'Apple']

# Generate users
users = pd.DataFrame({
    'user_id': range(1, NUM_USERS + 1),
    'name': [f'User_{i}' for i in range(1, NUM_USERS + 1)],
    'skills': [', '.join(random.sample(SKILLS, random.randint(2, 5))) for _ in range(NUM_USERS)],
    'preferred_location': [random.choice(LOCATIONS) for _ in range(NUM_USERS)]
})

# Generate jobs
jobs = pd.DataFrame({
    'job_id': range(1, NUM_JOBS + 1),
    'title': [random.choice(['Data Scientist', 'Software Engineer', 'Analyst', 'Web Developer']) for _ in range(NUM_JOBS)],
    'skills': [', '.join(random.sample(SKILLS, random.randint(2, 5))) for _ in range(NUM_JOBS)],
    'location': [random.choice(LOCATIONS) for _ in range(NUM_JOBS)],
    'company': [random.choice(COMPANIES) for _ in range(NUM_JOBS)],
    'salary_range': [f'{random.randint(50, 100)}k-{random.randint(100, 200)}k' for _ in range(NUM_JOBS)]
})

# Combine features for vectorization
jobs['combined_features'] = jobs['skills'] + ' ' + jobs['location'] + ' ' + jobs['company']
users['combined_features'] = users['skills'] + ' ' + users['preferred_location']

# TF-IDF vectorizer for combined features
vectorizer = TfidfVectorizer()

# Vectorize job and user combined features
job_vectors = vectorizer.fit_transform(jobs['combined_features'])
user_vectors = vectorizer.transform(users['combined_features'])

# Compute similarity between users and jobs
similarity_matrix = cosine_similarity(user_vectors, job_vectors)

# Recommendation function
def recommend_jobs(user_id, similarity_matrix, jobs_df, top_n=5):
    """
    Recommend top N jobs for a given user based on the similarity matrix.

    Args:
    user_id (int): The ID of the user.
    similarity_matrix (ndarray): The similarity matrix between users and jobs.
    jobs_df (DataFrame): The DataFrame containing job details.
    top_n (int): The number of recommendations to return.

    Returns:
    DataFrame: The top N recommended jobs for the user.
    """
    user_idx = users[users['user_id'] == user_id].index[0]
    similar_jobs_indices = similarity_matrix[user_idx].argsort()[-top_n:][::-1]
    return jobs_df.iloc[similar_jobs_indices][['job_id', 'title', 'skills', 'location', 'company', 'salary_range']]

# Test for a specific user
user_id = 1

# Get recommendations
recommendations = recommend_jobs(user_id, similarity_matrix, jobs, top_n=5)

# Display recommendations
print(f"Top job recommendations for User {user_id}:\n", recommendations)


Top job recommendations for User 1:
      job_id              title  \
331     332      Web Developer   
112     113      Web Developer   
356     357     Data Scientist   
401     402  Software Engineer   
86       87      Web Developer   

                                                skills location    company  \
331  Excel, Cloud Computing, Data Analysis, C++, Ma...  Toronto  Microsoft   
112  Data Analysis, C++, Cloud Computing, Machine L...  Toronto   Facebook   
356  Machine Learning, Data Analysis, SQL, Python, ...  Toronto     Amazon   
401  Excel, Machine Learning, Data Analysis, Cloud ...  Toronto     Amazon   
86   SQL, Data Analysis, Machine Learning, Cloud Co...  Toronto     Google   

    salary_range  
331     97k-139k  
112     74k-139k  
356     82k-144k  
401     82k-165k  
86      54k-105k  
