# This notebook generates the user and job embeddings; it is GPU-dependent (CUDA)

In [1]:

import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.preprocessing import normalize
import torch
import ast

# Input CSV locations consumed by the embedding runs further down.
# Both are absolute Windows paths, so they must be edited to match the local
# checkout before the notebook is run on another machine.
user_data_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\cleaned_data\skill_list.csv"
job_data_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\cleaned_data\job_cleaned.csv" 

  from .autonotebook import tqdm as notebook_tqdm


### CUDA must be installed separately to run on the GPU (it is not in requirement.txt because it is not used in production)

In [2]:
# Environment check: show the installed torch build, whether torch can see a
# CUDA device, and which CUDA version torch was built against.
print(f"torch version: {torch.__version__}")
print(f"torch CUDA available? {torch.cuda.is_available()}")
print(f"torch CUDA version: {torch.version.cuda}")

torch version: 2.6.0+cu118
torch CUDA available? True
torch CUDA version: 11.8


In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

def load_data(file_path, data_type='user'):
    """
    Load user skills or job listings from a CSV file.

    Parameters:
    - file_path (str): Path to the CSV file.
    - data_type (str): Type of data ('user' for user skills, 'job' for job listings).

    Returns:
    - List[List[str]] for user skills or List[str] for job listings.

    Raises:
    - ValueError: If data_type is neither 'user' nor 'job'. Raised before the
      file is opened.
    - KeyError: If data_type is 'job' and the CSV has no 'text' column.
    """
    # Validate the cheap argument first so a typo in data_type does not cost a
    # full CSV read or get masked by an unrelated file error.
    if data_type not in ('user', 'job'):
        raise ValueError("Invalid data_type. Choose 'user' or 'job'.")

    df = pd.read_csv(file_path)
    # Drop the 'Unnamed: 0' column when present (typically a leftover index
    # column from an earlier to_csv call). Returns a new frame instead of
    # mutating in place; errors='ignore' covers the "column absent" case.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    if data_type == 'user':
        # The first remaining column is expected to hold string representations
        # of Python lists, e.g. "['python', 'sql']"; literal_eval parses them
        # without executing arbitrary code.
        return df.iloc[:, 0].apply(ast.literal_eval).tolist()

    # data_type == 'job': job descriptions live in the 'text' column.
    return df['text'].tolist()

In [4]:
def generate_embeddings(data, model_name='intfloat/e5-small-v2', device='cuda'):
    """
    Generate embeddings for the given data using the specified model.

    For user data (list of skill lists) each user's embedding is the mean of
    the embeddings of that user's skills; a user with no skills gets a zero
    vector. For job data (list of strings) each string is embedded directly.

    Parameters:
    - data (List[List[str]] or List[str]): Data to encode. For users, list of lists of skills. For jobs, list of job descriptions.
    - model_name (str): Name of the SentenceTransformer model to use.
    - device (str): Device to use for computation ('cuda' or 'cpu').

    Returns:
    - np.ndarray: Array of embeddings, one row per input item. Empty input
      returns an empty 1-D array.

    Raises:
    - ValueError: If the first element of data is neither a list nor a string.
    """
    # Check the input before loading the model: instantiating the model is the
    # expensive step, so empty or malformed input should never trigger it.
    if not data:
        return np.array([])

    # As before, only the first element decides how the whole input is treated.
    first_item = data[0]
    if not isinstance(first_item, (list, str)):
        raise ValueError("Invalid data format. Expected list of lists or list of strings.")

    model = SentenceTransformer(model_name, device=device)

    if isinstance(first_item, str):
        # Job descriptions: list of strings, encoded in one call.
        return model.encode(data, convert_to_numpy=True, show_progress_bar=False)

    # User skills: list of lists, one mean-pooled vector per user.
    dim = model.get_sentence_embedding_dimension()
    # np.zeros defaults to float64, which would make np.vstack upcast the whole
    # matrix as soon as one user has no skills. Use float32 instead.
    # NOTE(review): assumes encode() returns float32 arrays - confirm if a
    # half-precision model is ever used.
    no_skills_vector = np.zeros(dim, dtype=np.float32)

    user_vectors = []
    for skills in data:
        if skills:
            skill_vectors = model.encode(skills, convert_to_numpy=True, show_progress_bar=False)
            user_vectors.append(skill_vectors.mean(axis=0))
        else:
            user_vectors.append(no_skills_vector)
    return np.vstack(user_vectors)

In [5]:
def save_embeddings(embeddings, save_path):
    """
    Save the embeddings to a numpy file, creating the target folder if needed.

    Parameters:
    - embeddings (np.ndarray): Embeddings to save.
    - save_path (str): Path to save the embeddings.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # np.save does not create missing directories. Create the parent folder
    # first so freshly computed embeddings are not lost to a FileNotFoundError.
    # Anything that is not a path (e.g. an open file object) goes straight to
    # np.save unchanged.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    np.save(save_path, embeddings)

In [12]:
def generate_and_save_embeddings(model_name, user_data_path, job_data_path, user_save_path, job_save_path, device='cuda'):
    """
    Build embeddings for the user-skill and job-listing datasets and write each to disk.

    The user dataset is fully processed (loaded, embedded, saved) before the
    job dataset is read.

    Parameters:
    - model_name (str): Name of the SentenceTransformer model to use.
    - user_data_path (str): Path to the user skills CSV file.
    - job_data_path (str): Path to the job listings CSV file.
    - user_save_path (str): Path to save the user embeddings.
    - job_save_path (str): Path to save the job embeddings.
    - device (str): Device to use for computation ('cuda' or 'cpu').
    """
    # Each entry: (input CSV, data_type understood by load_data, output file).
    # Tuple order is the processing order: users first, then jobs.
    datasets = (
        (user_data_path, 'user', user_save_path),
        (job_data_path, 'job', job_save_path),
    )

    for source_path, kind, target_path in datasets:
        records = load_data(source_path, data_type=kind)
        vectors = generate_embeddings(records, model_name, device)
        save_embeddings(vectors, target_path)


In [11]:


# Run 1: embed the user skills and job listings with the intfloat/e5-small-v2
# model and write the two arrays to the Embeddings\e5_small folder.
# `device` is not passed, so generate_and_save_embeddings uses its default ('cuda').
model_name1 = 'intfloat/e5-small-v2'

# Output locations (absolute, machine-specific Windows paths).
user_save_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\Embeddings\e5_small\user_profile_embeddings_e5_small.npy"
job_save_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\Embeddings\e5_small\job_embeddings_e5_small.npy"

generate_and_save_embeddings(model_name1, user_data_path, job_data_path, user_save_path, job_save_path)

In [13]:
# Run 2: repeat the embedding run with the all-MiniLM-L6-v2 model, writing the
# arrays to the Embeddings\sbert folder.
# This cell rebinds the global job_save_path / user_save_path names, so any
# later use of them sees these values.
# NOTE(review): the job output file name ends in "_norm", but no normalization
# step is visible in this notebook - confirm whether the embeddings are
# expected to be normalized before saving.
model_name = 'all-MiniLM-L6-v2'
job_save_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\Embeddings\sbert\job_sbert_embedding_norm.npy"
user_save_path = r"C:\Yousuf\DEPI\Technical\Mega Projects\Mega_Project\DEPI-Mega-project-1\AI_Job_Recommendation_System\data\Processed\Embeddings\sbert\user_sbert_embeddings.npy"

generate_and_save_embeddings(model_name, user_data_path, job_data_path, user_save_path, job_save_path)