In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Data Encoding and Scaling
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [4]:
# Load the pre-processed dataset. The path is relative, so the CSV must sit in the
# kernel's working directory (the recorded run below failed with FileNotFoundError).
df = pd.read_csv('test_data_processed.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'test_data_processed.csv'

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Show (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# NOTE: this cell previously held a stray keyboard-mash identifier that raised NameError
# and halted "Restart & Run All"; it is intentionally left without executable code.

In [None]:
# Fill every null value before the data is encoded.
#
# Each column is re-assigned rather than filled with ``df[col].fillna(..., inplace=True)``:
# that form is a chained in-place assignment on a temporary selection, which pandas warns
# about and which no longer updates ``df`` once Copy-on-Write is active.
for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == np.int64:
        df[col] = df[col].fillna(0)
    elif col_dtype == np.float64:
        df[col] = df[col].fillna(0.0)
    elif col_dtype == object:
        unique_vals = df[col].dropna().unique()
        # Yes/No flag columns default to 'No'; every other text column gets 'Unknown'.
        fill_value = 'No' if set(unique_vals).issubset({'Yes', 'No'}) else 'Unknown'
        df[col] = df[col].fillna(fill_value)

In [None]:
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values filled by column dtype.

    Fill rules:
      * ``int64`` columns   -> 0
      * ``float64`` columns -> 0.0
      * ``object`` columns whose non-null values are all 'Yes'/'No' -> 'No'
      * every other ``object`` column -> 'Unknown'
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the fills applied.
    """
    df = df.copy()
    for col in df.columns:
        col_dtype = df[col].dtype
        # Re-assign the column instead of ``df[col].fillna(..., inplace=True)``: the latter
        # is a chained in-place assignment that does not reach ``df`` under Copy-on-Write.
        if col_dtype == np.int64:
            df[col] = df[col].fillna(0)
        elif col_dtype == np.float64:
            df[col] = df[col].fillna(0.0)
        elif col_dtype == object:
            unique_vals = df[col].dropna().unique()
            fill_value = 'No' if set(unique_vals).issubset({'Yes', 'No'}) else 'Unknown'
            df[col] = df[col].fillna(fill_value)
    return df


# Data Encoding

In [None]:
label_col = ['jobma_catcher_industry']

def label_encoder(df):
    """Return a copy of ``df`` with the ``label_col`` columns integer-encoded.

    Only columns that exist in ``df`` and have ``object`` or ``category`` dtype are
    encoded; values are cast to ``str`` before encoding.

    The input frame is left untouched (the work happens on a copy), matching the other
    transform functions in this notebook that copy before editing.

    NOTE(review): a new ``LabelEncoder`` is fitted on every call, so the integer assigned
    to a given label depends on the values present in that particular frame. Frames
    encoded in separate calls are therefore not guaranteed to share one mapping.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()
    for col in label_col:
        if col not in df.columns:
            continue
        if df[col].dtype == object or df[col].dtype.name == 'category':
            le_col = LabelEncoder()
            df[col] = le_col.fit_transform(df[col].astype(str))
    return df

In [None]:
def ordinal_encoder(df):
    """Replace four categorical columns with ordinal codes in ``<name>_ord`` columns.

    The source columns are removed and the encoded columns are appended at the end of
    the frame, so the column order of the result differs from the input.

    The input frame is not modified: the columns are dropped on a new frame rather than
    in place, so calling this twice on the same frame keeps working.

    Values are cast to ``str`` first; a value outside the category lists below makes
    ``OrdinalEncoder`` raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all four columns listed in ``ordinal_col``.

    Returns
    -------
    pandas.DataFrame
    """
    ordinal_col = ['jobma_catcher_is_deleted', 'company_size', 'plan_type', 'is_unlimited']

    # Category order defines the integer code (position in the list).
    is_deleted_order = ['Yes', 'No']
    company_size_order = ['Unknown', '1-25', '26-100', '101-500', '500-1000', 'More than 1000']
    plan_type_order = ['Unknown', 'No', 'Yes']
    is_unlimited_order = ['Unknown', 'No', 'Yes']

    ordinal_order = [is_deleted_order, company_size_order, plan_type_order, is_unlimited_order]

    ordinal = OrdinalEncoder(categories=ordinal_order)
    encoded = ordinal.fit_transform(df[ordinal_col].astype(str))
    encoded_df = pd.DataFrame(encoded, columns=[f'{col}_ord' for col in ordinal_col], index=df.index)

    # ``drop`` without ``inplace`` returns a new frame and leaves the caller's data intact.
    remaining = df.drop(columns=ordinal_col)

    return pd.concat([remaining, encoded_df], axis=1)

# Log Transformation

In [None]:
# Columns that receive the log1p transform in `log_transform`.
log_cols = [
    'wallet_amount', 'subscription_amount',
    'number_of_invitations', 'job_posted',
    'number_of_kits', 'interview_question',
    'pre_recorded_credit', 'live_interview_credit',
]

In [None]:
def log_transform(df, cols=None):
    """Return a copy of ``df`` with ``log1p`` applied to the selected columns.

    For each selected column that exists in ``df``: missing values become 0, negative
    values are clipped to 0, then ``numpy.log1p`` is applied. Selected columns that are
    absent from ``df`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str, optional
        Columns to transform. Defaults to the notebook-level ``log_cols`` list.

    Returns
    -------
    pandas.DataFrame
    """
    target_cols = log_cols if cols is None else cols
    df = df.copy()
    for col in target_cols:
        if col in df.columns:
            # fill NaNs -> clamp negatives to zero -> safe log1p
            df[col] = np.log1p(df[col].fillna(0).clip(lower=0))
    return df

# Pipeline

In [None]:
# Preprocessing pipeline; steps run in the order listed.
#   label_encoder      -> integer-encode the label column(s)
#   ordinal_encoder    -> ordinal-encode the ordered categorical columns
#   log_transformation -> log1p on the columns with outliers
#   scaler             -> standardise every column (needs attention to improve model)
# Disabled for now: a leading 'fill_missing_values' step and a trailing PCA(n_components=0.97).
pipeline = Pipeline(steps=[
    ('label_encoder', FunctionTransformer(func=label_encoder, validate=False)),
    ('ordinal_encoder', FunctionTransformer(func=ordinal_encoder, validate=False)),
    ('log_transformation', FunctionTransformer(func=log_transform, validate=False)),
    ('scaler', StandardScaler()),
])

In [None]:
# Visualize the pipeline: switch estimator display to diagram mode, then show it.
set_config(display='diagram')
pipeline

# Data Splitting

In [None]:
# The whole frame is the feature matrix (no target column is split off).
# `X` is another name for the same object as `df`, not a copy.
X = df

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the container types returned by the split.
type(X_train), type(X_test)

In [None]:
# Show the number of rows in each split.
len(X_train), len(X_test)

# Fit Pipeline

In [None]:
# Fit the pipeline on the training split only, then apply it to the test split.
# NOTE(review): the encoder steps edit the frame they receive (in-place column drop), so
# running this cell a second time without re-creating the split presumably fails — verify.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [None]:
# Show the container types produced by the pipeline.
type(X_train_transformed), type(X_test_transformed)

In [None]:
# Show the number of rows in each transformed split.
len(X_train_transformed), len(X_test_transformed)

In [None]:
# Column labels of the matrix produced by `pipeline`, in output order.
# `ordinal_encoder` removes its four input columns and appends them at the end as
# '<name>_ord', so `df.columns` taken verbatim would mislabel the transformed columns.
_ordinal_cols = ['jobma_catcher_is_deleted', 'company_size', 'plan_type', 'is_unlimited']
features_names = pd.Index(
    [col for col in df.columns if col not in _ordinal_cols]
    + [f'{col}_ord' for col in _ordinal_cols]
)

In [None]:
# Wrap the transformed training matrix in a labelled frame for inspection.
X_df = pd.DataFrame(
    data=X_train_transformed,
    columns=features_names,
)

In [None]:
# Show (rows, columns) of the transformed training frame.
X_df.shape

# Convert into Tensor

In [None]:
# Convert both transformed splits to float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(matrix, dtype=torch.float32)
    for matrix in (X_train_transformed, X_test_transformed)
)

In [None]:
# Show the types of the converted splits.
type(X_train_tensor), type(X_test_tensor)

# Dataset Class

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that serves rows of a float32 tensor.

    Each item is a single row of the stored tensor; there are no labels.

    Parameters
    ----------
    data : torch.Tensor or array-like
        Feature matrix. It is always stored as an independent float32 copy.
    """

    def __init__(self, data):
        if isinstance(data, torch.Tensor):
            # ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning;
            # ``detach().clone()`` is the recommended way to make the same copy.
            self.data = data.detach().clone().to(torch.float32)
        else:
            self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        """Number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Row (or rows) at ``idx``."""
        return self.data[idx]

In [None]:
# Wrap each split in the dataset class so a DataLoader can batch it.
train_data = CustomDataset(data=X_train_tensor)
test_data = CustomDataset(data=X_test_tensor)

# DataLoader

In [None]:
# Number of rows per mini-batch for both dataloaders.
BATCH_SIZE = 16

In [None]:
# Shuffle only the training batches; the test loader keeps dataset order.
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Show the number of rows in each dataset.
len(train_data), len(test_data)

In [None]:
# Show the number of batches in each dataloader.
len(train_dataloader), len(test_dataloader)

# Define a Model (AutoEncoder in this case)

In [None]:
class AutoEncoder(nn.Module):
    """Symmetric fully connected autoencoder with a 32-unit bottleneck.

    Encoder widths: input_shape -> 256 -> 128 -> 64 -> 32.
    Decoder widths: 32 -> 64 -> 128 -> 256 -> input_shape.
    ReLU follows every linear layer except the last one of each half.

    Parameters
    ----------
    input_shape : int
        Number of input (and reconstructed) features.
    """

    @staticmethod
    def _stack(widths):
        """Build Linear/ReLU pairs for consecutive widths, without a trailing ReLU."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers[:-1])

    def __init__(self, input_shape):
        super().__init__()
        widths = [input_shape, 256, 128, 64, 32]  # last entry is the bottleneck
        self.encoder = self._stack(widths)
        self.decoder = self._stack(widths[::-1])

    def forward(self, X):
        """Return ``(latent_code, reconstruction)`` for the batch ``X``."""
        latent = self.encoder(X)
        return latent, self.decoder(latent)

In [None]:
# Initialize the model: one input unit per feature column, then move it to `device`.
input_shape = len(X_df.columns)
model_1 = AutoEncoder(input_shape)
model_1.to(device)

In [None]:
# Important training parameters.
learning_rate = 0.0001
epochs = 50

In [None]:
# Loss function, optimizer and learning-rate scheduler.

mse_loss_function = nn.MSELoss()
optimizer = torch.optim.AdamW(model_1.parameters(), lr=learning_rate, weight_decay=1e-5)
# Halve the learning rate after 5 epochs without improvement of the monitored loss.
# `verbose=True` is no longer passed: the argument is deprecated in PyTorch (since 2.2),
# and the training loop already prints the current learning rate every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

# Training Loop

In [None]:
def train_step(model:torch.nn.Module,
               dataloader:torch.utils.data.DataLoader,
               mse_loss_function:torch.nn.Module,
               optimizer:torch.optim.Optimizer
              ):
    """Run one training epoch and return the mean reconstruction loss per batch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(batch)``; must return ``(encoded, decoded)``.
    dataloader : iterable of torch.Tensor
        Yields input batches; must support ``len()``.
    mse_loss_function : torch.nn.Module
        Loss comparing the reconstruction with the input batch.
    optimizer : torch.optim.Optimizer
        Optimizer that updates ``model``.

    Returns
    -------
    float
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()

    # Batches come from the dataloader on their own device; send them to wherever the
    # model's parameters live so the forward pass also works on a GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    epoch_total_loss = 0.0

    for batch_X in dataloader:
        if model_device is not None:
            batch_X = batch_X.to(model_device)

        _, decoded = model(batch_X)

        # Reconstruction loss: the input is its own target.
        mse_loss = mse_loss_function(decoded, batch_X)

        # Backpropagation
        optimizer.zero_grad()
        mse_loss.backward()
        optimizer.step()

        # Accumulate (the previous code overwrote the total with the last batch's loss).
        epoch_total_loss += mse_loss.item()

    training_loss = epoch_total_loss / len(dataloader)
    return training_loss

# Testing Loop

In [None]:
def test_step(model:torch.nn.Module,
              dataloader:torch.utils.data.DataLoader,
              mse_loss_function:torch.nn.Module,
             ):

    epoch_total_loss = 0
    
    model.eval()
    with torch.inference_mode():
        for batch_X in dataloader:
            encoded, decoded = model(batch_X)  # Forward pass

            ''' Compute Loss '''
            mse_loss = mse_loss_function(decoded, batch_X)  # Reconstruction Loss
            
            epoch_total_loss += mse_loss.item()

        testing_loss = epoch_total_loss / len(dataloader)
        return testing_loss

## Combining Training and Testing Loop into evaluate()

In [None]:
def evaluate(model:torch.nn.Module,
             train_dataloader:torch.utils.data.DataLoader,
             test_dataloader:torch.utils.data.DataLoader,
             mse_loss_function:torch.nn.Module,
             optimizer:torch.optim.Optimizer,
             scheduler:torch.optim.lr_scheduler.ReduceLROnPlateau,
             epochs:int = 5,
             patience:int = 3,
             delta:float = 0.0001
            ):
    """Train and evaluate for up to ``epochs`` epochs with early stopping.

    Every epoch runs ``train_step`` then ``test_step``, feeds the testing loss to the
    scheduler, records both losses and prints a progress line. Training stops early once
    the testing loss has failed to improve by more than ``delta`` for ``patience``
    consecutive epochs.

    Returns
    -------
    dict
        ``{'training_loss': [...], 'testing_loss': [...]}`` with one entry per epoch run.
    """
    history = {'training_loss': [], 'testing_loss': []}

    lowest_testing_loss = float('inf')
    stale_epochs = 0  # consecutive epochs without a sufficient improvement

    for epoch in range(epochs):
        training_loss = train_step(
            model=model,
            dataloader=train_dataloader,
            mse_loss_function=mse_loss_function,
            optimizer=optimizer,
        )
        testing_loss = test_step(
            model=model,
            dataloader=test_dataloader,
            mse_loss_function=mse_loss_function,
        )

        scheduler.step(testing_loss)
        history['training_loss'].append(training_loss)
        history['testing_loss'].append(testing_loss)

        print(f'Epoch {epoch+1}/{epochs} | Training Loss: {training_loss:.5f} | Testing Loss: {testing_loss:.5f}')

        # Early-stopping bookkeeping: an epoch counts as an improvement only when it
        # beats the best loss so far by more than `delta`.
        improved = testing_loss < lowest_testing_loss - delta
        if improved:
            lowest_testing_loss = testing_loss
            stale_epochs = 0
        else:
            stale_epochs += 1

        if stale_epochs >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}!")
            break

        # Report the learning rate currently used by each parameter group.
        for param_group in optimizer.param_groups:
            print(f"Current Learning Rate: {param_group['lr']:.6f}")

    return history

In [None]:
# Train model_1; stop after 3 epochs without improvement of more than 1e-4.
results = evaluate(
    model=model_1,
    train_dataloader=train_dataloader, test_dataloader=test_dataloader,
    mse_loss_function=mse_loss_function,
    optimizer=optimizer, scheduler=scheduler,
    epochs=epochs, patience=3, delta=0.0001,
)

# Loss Curves

In [None]:
# Plot the recorded training and testing loss for every epoch that ran.
# The x-axis gets its own name: rebinding `epochs` here would replace the integer
# hyperparameter with a range object, and re-running the training cell would then fail.
epoch_axis = range(len(results['training_loss']))

plt.figure(figsize=(8, 5))
plt.plot(epoch_axis, results['training_loss'], label='Training Loss')
plt.plot(epoch_axis, results['testing_loss'], label='Testing Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training VS Testing Loss')
plt.legend()
plt.show()

## Test it on User's Preferences 

In [None]:
# Show the labels attached to the transformed training frame.
X_df.columns

In [None]:
# Load the table that recommendations are looked up in (passed to recommend() as `df_original`).
# NOTE(review): presumably the un-encoded counterpart of 'test_data_processed.csv' — confirm.
test_df = pd.read_csv('test_data.csv')

In [None]:
# Example query profile: premium, verified account, company size '500-1000'.
# Key order matters: it becomes the column order of the one-row frame fed to the pipeline.
user_pref = {
    'jobma_catcher_id': 1111,
    'jobma_catcher_industry': 'Unknown',
    'jobma_catcher_type': '1',
    'is_premium': '1',
    'jobma_catcher_sub_accounts': 0,
    'jobma_catcher_is_deleted': 'No',
    'jobma_verified': 1,
    'subscription_status': 2,
    'interview_rate': 0.0,
    'live_interview_credit': 11.0,
    'pre_recorded_credit': 100.0,
    'credit_value': 2,
    'interview_cost_type': 1,
    'subscription_type_x': 0,
    'jobma_support_rtc': 2,
    'interview_question': 20,
    'video_recording_suppport': 2,
    'sing_up_canditate_after_apply': 0,
    'currency': 1,
    'company_size': '500-1000',
    'wallet_amount': 80.0,
    'plan_type': 'Yes',
    'is_unlimited': 'Yes',
    'premium_storage': 0.0,
    'subscription_amount': 11350.0,
    'number_of_subscriptions': 116.0,
    'number_of_invitations': 200.0,
    'job_posted': 35.0,
    'number_of_kits': 39.0,
}

In [None]:
# Load the table that carries the company names used to label recommendations.
original_df = pd.read_csv('recommend1.csv')

In [None]:
# Keep only the id -> company-name mapping needed for the merge in recommend().
original_df = original_df[['jobma_catcher_id', 'jobma_catcher_company']]

In [None]:
def recommend(user_input, model, dataloader, df_original, original_df, pipeline, top_k=5):
    """Return the ``top_k`` rows whose embeddings are most cosine-similar to a user profile.

    The profile is turned into a one-row frame, passed through ``pipeline.transform`` and
    encoded by ``model``; every batch from ``dataloader`` is encoded the same way and
    ranked by cosine similarity against the profile embedding.

    NOTE(review): result rows are selected with ``df_original.iloc[position]`` where
    ``position`` is the item's position in ``dataloader`` order, so ``df_original`` is
    assumed to be row-aligned with the data behind ``dataloader`` — confirm at the caller.

    Parameters
    ----------
    user_input : dict
        Raw feature values keyed by column name.
    model : torch.nn.Module
        Called as ``model(x)``; must return ``(embedding, reconstruction)``.
    dataloader : iterable of torch.Tensor
        Yields batches of already-transformed item features, in a fixed order.
    df_original : pandas.DataFrame
        Table to pick result rows from; needs 'jobma_catcher_id', 'company_size',
        'is_premium' and 'wallet_amount'.
    original_df : pandas.DataFrame
        Needs 'jobma_catcher_id' and 'jobma_catcher_company'.
    pipeline : object with ``transform``
        Fitted preprocessing pipeline.
    top_k : int, default 5
        Number of results; capped at the number of items available.

    Returns
    -------
    pandas.DataFrame
        Columns: jobma_catcher_id, jobma_catcher_company, company_size, is_premium,
        wallet_amount, similarity — ordered from most to least similar.
    """
    user_df = pd.DataFrame([user_input])
    user_input_transformed = pipeline.transform(user_df)

    # Run everything on the device that holds the model's parameters (CPU if it has none).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    user_input_tensor = torch.tensor(user_input_transformed, dtype=torch.float32, device=model_device)

    model.eval()
    with torch.no_grad():
        user_embedding, _ = model(user_input_tensor)

        item_embeddings_list = []
        for batch_X in dataloader:
            item_embedding, _ = model(batch_X.to(model_device))
            item_embeddings_list.append(item_embedding)
        item_embeddings = torch.cat(item_embeddings_list, dim=0)

        # (1, d) against (N, d) broadcasts to one similarity per item.
        similarities = F.cosine_similarity(user_embedding, item_embeddings)

        # topk raises when k exceeds the number of items, so cap it.
        k = min(top_k, similarities.shape[0])
        top = similarities.topk(k)

    top_indices = top.indices.cpu().numpy()

    recommended = df_original.iloc[top_indices].copy()
    recommended['similarity'] = top.values.cpu().numpy()

    merged_df = pd.merge(recommended, original_df[['jobma_catcher_id', 'jobma_catcher_company']], on='jobma_catcher_id', how='left')

    return merged_df[['jobma_catcher_id', 'jobma_catcher_company', 'company_size', 'is_premium', 'wallet_amount', 'similarity']]

In [None]:
# Recommend the 5 rows most similar to `user_pref` and print the full table.
# NOTE(review): `test_dataloader` iterates the X_test split, while `test_df` is read
# separately from 'test_data.csv'; recommend() picks rows of `test_df` by dataloader
# position, so the two are presumably meant to be row-aligned — TODO confirm.
result = recommend(user_pref, model_1, test_dataloader, test_df, original_df, pipeline, top_k=5)
print(result.to_string())

In [None]:
# Variant of the first profile: company size 'More than 1000' and 139 kits.
# Key order matters: it becomes the column order of the one-row frame fed to the pipeline.
user_pref1 = {
    'jobma_catcher_id': 1111,
    'jobma_catcher_industry': 'Unknown',
    'jobma_catcher_type': '1',
    'is_premium': '1',
    'jobma_catcher_sub_accounts': 0,
    'jobma_catcher_is_deleted': 'No',
    'jobma_verified': 1,
    'subscription_status': 2,
    'interview_rate': 0.0,
    'live_interview_credit': 11.0,
    'pre_recorded_credit': 100.0,
    'credit_value': 2,
    'interview_cost_type': 1,
    'subscription_type_x': 0,
    'jobma_support_rtc': 2,
    'interview_question': 20,
    'video_recording_suppport': 2,
    'sing_up_canditate_after_apply': 0,
    'currency': 1,
    'company_size': 'More than 1000',
    'wallet_amount': 80.0,
    'plan_type': 'Yes',
    'is_unlimited': 'Yes',
    'premium_storage': 0.0,
    'subscription_amount': 11350.0,
    'number_of_subscriptions': 116.0,
    'number_of_invitations': 200.0,
    'job_posted': 35.0,
    'number_of_kits': 139.0,
}

In [None]:
# Recommend the 5 rows most similar to `user_pref1` and print the full table.
result = recommend(user_pref1, model_1, test_dataloader, test_df, original_df, pipeline, top_k=5)
print(result.to_string())

In [None]:
# Low-activity profile: empty wallet, no plan, a single subscription/invitation/kit.
# Key order matters: it becomes the column order of the one-row frame fed to the pipeline.
user_pref_dormant = {
    'jobma_catcher_id': 0,
    'jobma_catcher_industry': 'Unknown',
    'jobma_catcher_type': '1',
    'is_premium': '1',
    'jobma_catcher_sub_accounts': 0,
    'jobma_catcher_is_deleted': 'No',
    'jobma_verified': 1,
    'subscription_status': 2,
    'interview_rate': 0.0,
    'live_interview_credit': 11.0,
    'pre_recorded_credit': 100.0,
    'credit_value': 2,
    'interview_cost_type': 1,
    'subscription_type_x': 0,
    'jobma_support_rtc': 2,
    'interview_question': 20,
    'video_recording_suppport': 2,
    'sing_up_canditate_after_apply': 0,
    'currency': 1,
    'company_size': 'More than 1000',
    'wallet_amount': 0.0,
    'plan_type': 'No',
    'is_unlimited': 'No',
    'premium_storage': 1.0,
    'subscription_amount': 100.0,
    'number_of_subscriptions': 1.0,
    'number_of_invitations': 1.0,
    'job_posted': 5.0,
    'number_of_kits': 1.0,
}

In [None]:
# Recommend the 5 rows most similar to `user_pref_dormant` and print the full table.
result = recommend(user_pref_dormant, model_1, test_dataloader, test_df, original_df, pipeline, top_k=5)
print(result.to_string())

In [None]:
# Non-premium, unverified profile with a near-empty wallet and no plan.
# Key order matters: it becomes the column order of the one-row frame fed to the pipeline.
user_pref_asdfg = {
    'jobma_catcher_id': 0,
    'jobma_catcher_industry': 'Unknown',
    'jobma_catcher_type': '1',
    'is_premium': '0',
    'jobma_catcher_sub_accounts': 0,
    'jobma_catcher_is_deleted': 'No',
    'jobma_verified': 0,
    'subscription_status': 1,
    'interview_rate': 0.0,
    'live_interview_credit': 11.0,
    'pre_recorded_credit': 100.0,
    'credit_value': 2,
    'interview_cost_type': 1,
    'subscription_type_x': 0,
    'jobma_support_rtc': 2,
    'interview_question': 20,
    'video_recording_suppport': 2,
    'sing_up_canditate_after_apply': 0,
    'currency': 1,
    'company_size': 'More than 1000',
    'wallet_amount': 1.0,
    'plan_type': 'No',
    'is_unlimited': 'No',
    'premium_storage': 1.0,
    'subscription_amount': 100.0,
    'number_of_subscriptions': 1.0,
    'number_of_invitations': 1.0,
    'job_posted': 5.0,
    'number_of_kits': 1.0,
}

In [None]:
# Recommend the 5 rows most similar to `user_pref_asdfg` and print the full table.
result1 = recommend(user_pref_asdfg, model_1, test_dataloader, test_df, original_df, pipeline, top_k=5)
print(result1.to_string())