In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy

# Data Encoding and Scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin

# Natural Language Processing(NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Word Embedding
import gensim
from gensim.models import Word2Vec

In [2]:
warnings.filterwarnings('ignore')

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [5]:
''' Columns to recheck '''
''' premium_plan, jobma_catcher_creation, 
'''

' premium_plan, jobma_catcher_creation, \n'

In [6]:
df = pd.read_csv('data_collection.csv')

In [7]:
''' to create tags '''
'''
jobma_catcher_company, org_type, jobma_catcher_indus, jobma_job_title, slug, jobma_job_functional_areas, jobma_job_keywords, 

'''

'\njobma_catcher_company, org_type, jobma_catcher_indus, jobma_job_title, slug, jobma_job_functional_areas, jobma_job_keywords, \n\n'

In [8]:
df.drop(['subscription_type_x',
         'premium_storage_y',
         'currency_val',
         'jobma_catcher_sub_accounts',
         'jobma_job_company_profile'], axis=1, inplace=True)

In [9]:
df['plan_type'].fillna('No', inplace=True)

In [10]:
df['is_unlimited'].fillna('No', inplace=True)

In [11]:
df.shape

(179549, 55)

In [12]:
''' This Function is to fill all missing values (if col is int then 0, if col is float then 0.0 and if col is object then 'Unkmown') '''

def fill_missing_values(df):
    for col in df.columns:
        if df[col].dtype == np.int64:
            df[col].fillna(0, inplace=True)
        elif df[col].dtype == np.float64:
            df[col].fillna(0.0, inplace=True)
        elif df[col].dtype == object:
            df[col].fillna('Unknown', inplace=True)

    return df

In [13]:
# for col in df.columns:
#     if df[col].dtype == np.int64:
#         df[col].fillna(0, inplace=True)
#     elif df[col].dtype == np.float64:
#         df[col].fillna(0.0, inplace=True)
#     elif df[col].dtype == object:
#         df[col].fillna('Unknown', inplace=True)

In [14]:
df.isnull().sum()

jobma_catcher_id                       0
credit_amount                      35886
wallet_amount                          0
plan_type                              0
is_unlimited                           0
premium_storage_x                      0
premium_plan                          28
subscription_amount                    0
credit_given                           0
payment_mode                          29
status                                 0
payment_status                       278
number_of_transactions                 0
jobma_catcher_company                  0
org_type                          175794
jobma_catcher_indus                 9814
jobma_catcher_title                    0
jobma_catcher_otype                 6773
jobma_catcher_creation                 0
jobma_catcher_type                     0
is_premium                             0
jobma_catcher_parent                   0
jobma_catcher_is_deleted               0
jobma_verified                         0
data_access     

In [15]:
def label_encoder(df):
    df = df.copy()
    label_encoders = {}

    for col in df.columns:
        if df[col].dtype == object:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    return df

In [16]:
df['tags'] = (
    df['jobma_catcher_company'].astype(str) + " "+
    df['org_type'].astype(str) + " "+
    df['jobma_catcher_indus'].astype(str) + " "+
    df['jobma_job_title'].astype(str) + " "+
    df['slug'].astype(str) + " "+
    df['jobma_job_functional_areas'].astype(str) + " "+
    df['jobma_job_keywords'].astype(str)
)

In [17]:
df.drop(['org_type',
        'jobma_catcher_indus',
        'jobma_job_title',
        'slug',
        'jobma_job_functional_areas',
        'jobma_job_keywords'], axis=1, inplace=True)

In [18]:
df.loc[1,'tags']

'Select Source International nan nan business partner training business-partner-training HR / Administration / IR Training, "Instructional Design",  "Curriculum design", "java'

In [19]:
df['tags'].shape

(179549,)

In [20]:
df['tags'].isnull().sum()

0

## Stemming
**Note: Use Lemmatization for more accuracy**

To normalize words and reduce them to their root forms, we will apply **stemming**. This helps in handling variations of words and improves text processing efficiency for machine learning models.  
(e.g., "running" → "run")

**Currently using Lemmatization**

In [23]:
lemmatizer = WordNetLemmatizer()

In [24]:
stop_words = set(stopwords.words('english'))

In [25]:
def formatting(text):
    if isinstance(text, pd.Series) or isinstance(text, list):
        text = " ".join(text)

    words = word_tokenize(text.lower())
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word.isalpha()]
    return " ".join(filtered_words)

In [26]:
''' Applying Stopwords Removal and Lemmatization'''

# df['tags'] = df['tags'].apply(formatting)

' Applying Stopwords Removal and Lemmatization'

In [27]:
df.loc[1,'tags']

'Select Source International nan nan business partner training business-partner-training HR / Administration / IR Training, "Instructional Design",  "Curriculum design", "java'

In [28]:
def remove_duplicates(text):
    if isinstance(text,str):
        words = text.split()
        seen = set()
        unique_words = []

        for word in words:
            if word not in seen:
                seen.add(word)
                unique_words.append(word)
    
        return " ".join(unique_words)        

    return text

In [29]:
''' Removing Duplicates from tags '''

# df['tags'] = df['tags'].apply(remove_duplicates)

' Removing Duplicates from tags '

In [30]:
df.loc[1,'tags']

'Select Source International nan nan business partner training business-partner-training HR / Administration / IR Training, "Instructional Design",  "Curriculum design", "java'

In [31]:
df['tags'] = df['tags'].str.replace('/', '', regex=False)

In [32]:
df.loc[1,'tags']

'Select Source International nan nan business partner training business-partner-training HR  Administration  IR Training, "Instructional Design",  "Curriculum design", "java'

## Word Embeddings
**Note: Use Contextual Embeddings for More Accuracy**

To represent words in a numerical format while preserving their meaning and relationships, we will apply **word embeddings**. This helps in capturing semantic similarities and improving machine learning model performance.
(e.g., "king" → similar to "queen" but different from "apple").

In [34]:
# sentences = [tag.split() for tag in df['tags']]

In [35]:
# Train Word2Vec model
# w2v_model = Word2Vec(sentences, vector_size=16, window=5, min_count=1, workers=4)

In [36]:
# # Function to convert text into an average word embedding
# def text_to_embedding(text):
#     words = text.split()
#     embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
#     return np.mean(embeddings, axis=0) if embeddings else np.zeros(16)

In [37]:
# text_embeddings = np.array([text_to_embedding(text) for text in df['tags']])
# text_embeddings_df = pd.DataFrame(text_embeddings, columns=[f'emb_{i}' for i in range(16)])

In [38]:
# df = df.join(text_embeddings_df, how='inner')

In [39]:
def word_embeddings(df, text_col='tags', vector_size=16, window=5, min_count=1, workers=4):

    # sentences = [text.split() for text in df[text_col].astype(str)]
    sentences = [text.split() for text in df[text_col]]

    w2v_model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)

    def text_to_embedding(text):
        words = text.split()
        embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
        return np.mean(embeddings, axis=0) if embeddings else np.zeros(vector_size)

    # text_embeddings = np.array([text_to_embedding(text) for text in df[text_col].astype(str)])
    text_embeddings = np.array([text_to_embedding(text) for text in df[text_col]]) 

    embeddings_cols = [f'{text_col}_emb-{i}' for i in range(vector_size)]
    text_embeddings_df = pd.DataFrame(text_embeddings, columns=embeddings_cols)

    df = df.reset_index(drop=True).join(text_embeddings_df, how='inner')

    return df

In [40]:
# df.drop('tags', axis=1, inplace=True)

# Data Preprocessing

# Pipeline

In [43]:
def formatting_series(X):
    formatted_text = X.apply(lambda x: formatting(x) if isinstance(x, str) else "")
    return formatted_text.values.reshape(-1, 1)

In [None]:
def formatting_series(X):
    formatted_text = X.apply(lambda x: formatting(x) if isinstance(x, str) else "")
    return formatted_text.values.reshape(-1, 1)

In [44]:
def remove_duplicates_series(X):
    # Process the text and return it as a 2D array (n_samples, 1)
    return pd.Series(X.apply(lambda x: remove_duplicates(x) if isinstance(x, str) else "").values, dtype=str).to_frame()

In [45]:
# preprocessing_pipeline = Pipeline([
#     ('fill_missing_values', FunctionTransformer(fill_missing_values, validate=False)),
#     ('label_encoder', FunctionTransformer(label_encoder, validate=False)),
#     ('scaler', StandardScaler()),
#     # ('pca', PCA(n_components=0.95)
#     # ('formatting', FunctionTransformer(formatting, validate=False)),
#     # ('remove_duplicates', FunctionTransformer(remove_duplicates, validate=False))
# ])

In [46]:
numerical_pipeline = Pipeline([
    ('fill_missing_values', FunctionTransformer(fill_missing_values, validate=False)),
    ('label_encoder', FunctionTransformer(label_encoder, validate=False)),
    ('scaler', StandardScaler()),
    # ('pca', PCA(n_components=0.95))
])

In [47]:
text_pipeline = Pipeline([
    ('fill_missing_values', FunctionTransformer(fill_missing_values, validate=False)),
    ('formatting', FunctionTransformer(formatting_series, validate=False)),
    # ('remove_duplicates', FunctionTransformer(remove_duplicates_series, validate=False)),
    # ('word_embedding', FunctionTransformer(word_embeddings, validate=False))
])

In [48]:
''' Combining both numerical_pipeline and text_pipeline '''
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('text', text_pipeline, ['tags']),
    ('tabular', numerical_pipeline, [col for col in df.columns if col != 'tags'])
], remainder='passthrough')

In [49]:
''' Visualize the Pipeline '''

set_config(display='diagram')
preprocessing_pipeline

# Data Splitting

In [51]:
X = df

In [52]:
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [53]:
print(f'Length of X_train: {len(X_train)}')
print(f'Length of X_test: {len(X_test)}')

Length of X_train: 143639
Length of X_test: 35910


In [54]:
type(X_train), type(X_test)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

# Fit Pipeline

In [56]:
X_train.shape, X_test.shape

((143639, 50), (35910, 50))

In [57]:
print("Original X_train shape:", X_train.shape)

text_out = text_pipeline.fit_transform(X_train[['tags']])
print("Text pipeline output shape:", text_out.shape)

num_out = numerical_pipeline.fit_transform(X_train.drop(columns=['tags']))
print("Numerical pipeline output shape:", num_out.shape)

Original X_train shape: (143639, 50)
Text pipeline output shape: (1, 1)
Numerical pipeline output shape: (143639, 49)


In [58]:
# formatted_output = formatting_series(X_train['tags'])
# print(formatted_output.shape)

In [59]:
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_test_transformed =  preprocessing_pipeline.transform(X_test)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 143639

In [None]:
X_train.shape

In [None]:
type(X_train)

In [None]:
''' Convert into Tensors '''

X_train_tensor = torch.tensor(X_train_transformed, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_transformed, dtype=torch.float32)

In [None]:
type(X_train_tensor), type(X_test_tensor)

In [None]:
len(X_train_tensor), len(X_test_tensor)

# Dataset Class

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
train_data = CustomDataset(X_train_tensor)
test_data = CustomDataset(X_test_tensor)

# Dataloader

In [None]:
BATCH_SIZE = 16

In [None]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
len(train_data), len(test_data)

In [None]:
len(train_dataloader), len(test_dataloader)

# Define a Model

In [None]:
class Client_Recommendation_Model(nn.Module):
    def __init__(self, input_shape):
        super().__init__()
        ''' Encoder (Compression) '''
        # Shrinks job data into a small hidden representation (like a summary).
        self.encoder = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )

        ''' Decoder (Reconstruction) '''
        # Tries to rebuild the original job data from that compressed version.
        self.decoder = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_shape),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
X_train.shape[1]

In [None]:
''' Initializing the model '''
input_shape = X_train.shape[1]

model_1 = Client_Recommendation_Model(input_shape)
model_1.to(device)

In [None]:
''' Important Parameters '''

learning_rate = 0.001
epochs = 20

In [None]:
''' Loss Function and Optimizer '''
cosine_loss_function = nn.CosineEmbeddingLoss()
mse_loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model_1.parameters(), lr=learning_rate)

# Training Loop

In [None]:
def train_step(model:torch.nn.Module,
               dataloader:torch.utils.data.DataLoader,
               mse_loss_function:torch.nn.Module,
               cosine_loss_function:torch.nn.Module,
               optimizer:torch.optim.Optimizer,
              device:torch.device):
    
    model.train()
    model.to(device)
    epoch_total_loss = 0

    for batch_X in dataloader:
        batch_X = batch_X.to(device)
        encoded, decoded = model(batch_X)  # Forward pass

        ''' Compute Loss '''
        mse_loss = mse_loss_function(decoded, batch_X)  # Reconstruction Loss

        batch_size = encoded.shape[0]
        target_labels = torch.ones(batch_size, device=device)

        permuted_indices = torch.randperm(batch_size, device=device)
        encoded_shuffled = encoded[permuted_indices]

        cosine_loss = cosine_loss_function(encoded, encoded_shuffled, target_labels) # Similarity Loss
        total_loss = mse_loss + cosine_loss

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        epoch_total_loss += total_loss.item()

    training_loss = epoch_total_loss / len(dataloader)
    return training_loss

# Testing Loop

In [None]:
def test_step(model:torch.nn.Module,
              dataloader:torch.utils.data.DataLoader,
              mse_loss_function:torch.nn.Module,
              cosine_loss_function:torch.nn.Module,
              device:torch.device
             ):

    epoch_total_loss = 0
    model.to(device)
    model.eval()
    with torch.inference_mode():
        for batch_X in dataloader:
            batch_X = batch_X.to(device)
            encoded, decoded = model(batch_X)  # Forward pass

            ''' Compute Loss '''
            mse_loss = mse_loss_function(decoded, batch_X)  # Reconstruction Loss
    
            batch_size = encoded.shape[0]
            target_labels = torch.ones(batch_size, device=device)

            # Compare each encoded job to another shuffled job
            permuted_indices = torch.randperm(batch_size, device=device)
            encoded_shuffled = encoded[permuted_indices]

            cosine_loss = cosine_loss_function(encoded, encoded_shuffled, target_labels)
            total_loss = mse_loss + cosine_loss
            
            epoch_total_loss += total_loss.item()

        testing_loss = epoch_total_loss / len(dataloader)
        return testing_loss

# Combining Training and Testing Loop into evaluate()

In [None]:
# def evaluate(model:torch.nn.Module,
#              train_dataloader:torch.utils.data.DataLoader,
#              test_dataloader:torch.utils.data.DataLoader,
#              mse_loss_function:torch.nn.Module,
#              cosine_loss_function:torch.nn.Module,
#              device:torch.device,
#              epochs:int = 5
#             ):
    
#     results = {'training_loss':[],
#               'testing_loss':[]}

#     for epoch in range(epochs):
#         training_loss = train_step(model=model,
#                                    dataloader=train_dataloader,
#                                    mse_loss_function=mse_loss_function,
#                                    cosine_loss_function=cosine_loss_function,
#                                    optimizer=optimizer,
#                                    device=device
#                                   )

#         testing_loss = test_step(model=model,
#                                    dataloader=test_dataloader,
#                                    mse_loss_function=mse_loss_function,
#                                    cosine_loss_function=cosine_loss_function,
#                                  device=device
#                                   )

#         results['training_loss'].append(training_loss)
#         results['testing_loss'].append(testing_loss)

#         print(f'Epoch {epoch+1}/{epochs} | Training Loss: {training_loss:.5f} | Testing Loss: {testing_loss:.5f}')

#     return results

In [None]:
def evaluate(model:torch.nn.Module,
             train_dataloader:torch.utils.data.DataLoader,
             test_dataloader:torch.utils.data.DataLoader,
             mse_loss_function:torch.nn.Module,
             cosine_loss_function:torch.nn.Module,
             device:torch.device,
             epochs:int = 5,
             patience: int = 5
            ):
    
    results = {'training_loss':[],
              'testing_loss':[]}

    best_loss = float('inf')
    counter = 0

    for epoch in range(epochs):
        training_loss = train_step(model=model,
                                   dataloader=train_dataloader,
                                   mse_loss_function=mse_loss_function,
                                   cosine_loss_function=cosine_loss_function,
                                   optimizer=optimizer,
                                   device=device
                                  )

        testing_loss = test_step(model=model,
                                   dataloader=test_dataloader,
                                   mse_loss_function=mse_loss_function,
                                   cosine_loss_function=cosine_loss_function,
                                 device=device
                                  )

        results['training_loss'].append(training_loss)
        results['testing_loss'].append(testing_loss)

        print(f'Epoch {epoch+1}/{epochs} | Training Loss: {training_loss:.5f} | Testing Loss: {testing_loss:.5f}')

        if testing_loss < best_loss:
            best_loss = testing_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early Stopping triggered at epoch {epoch+1}")
                break

    return results

In [None]:
torch.manual_seed(42)

model_1_results = evaluate(model=model_1,
                        train_dataloader=train_dataloader,
                        test_dataloader=test_dataloader,
                        mse_loss_function=mse_loss_function,
                        cosine_loss_function=cosine_loss_function,
                        device=device,
                        epochs=epochs,
                        patience=3)

# Loss and Accuracy Curves

In [None]:
epochs = range(len(model_1_results['training_loss']))

plt.figure(figsize=(8, 5))
plt.plot(epochs, model_1_results['training_loss'], label='Training Loss')
plt.plot(epochs, model_1_results['testing_loss'], label='Testing Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training VS Testing Loss')
plt.legend()
plt.show()

In [None]:
X_train['tags']

In [None]:
;klnbhjgvcfds

# Recommending 5 Top Clients (Based on Loyalty)

In [None]:
company_pref = {
    'Age' : 22
}

In [None]:
company_pref_df = pd.DataFrame([company_pref])
type(company_pref_df)

In [None]:
company_pref_transformed = preprocessing_pipeline.transform(company_pref_df)
type(company_pref_transformed)

In [None]:
company_pref_tensor = torch.tensor(company_pref_transformed, torch.float32)
type(company_pref_tensor)

In [None]:
company_pref_tensor.shape, X_test_tensor.shape