In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from sqlalchemy import create_engine

# Torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Data Encoding and Scaling
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Pipeline
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import set_config

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Read the SQLAlchemy connection URL from the environment so that database
# credentials are never stored in the notebook or its version history.
# Expected form:
#   mysql+pymysql://<user>:<url-encoded password>@<host>:<port>/<database>
_db_url = os.environ.get("JOBMA_DATABASE_URL")
if not _db_url:
    raise RuntimeError(
        "Set the JOBMA_DATABASE_URL environment variable to the SQLAlchemy "
        "connection URL before running this notebook."
    )
engine = create_engine(_db_url)

In [4]:
# Load each source table in full, one DataFrame per table.

# Catcher account details.
catcher_df = pd.read_sql(sql='Select * FROM jobma_catcher', con=engine)
# Subscription type / plan type.
wallet_df = pd.read_sql(sql='Select * FROM wallet', con=engine)
# Number and amount of subscriptions bought by the catcher.
subscription_df = pd.read_sql(sql='Select * FROM subscription_history', con=engine)
# Invitations sent by the catcher.
invitation_df = pd.read_sql(sql='Select * FROM jobma_pitcher_invitations', con=engine)
# Jobs posted by the catcher.
job_posting_df = pd.read_sql(sql='Select * FROM jobma_employer_job_posting', con=engine)
# Assessment kits created by the catcher.
kit_df = pd.read_sql(sql='Select * FROM job_assessment_kit', con=engine)
# Recorded interviews created by the catcher.
recorded_interview_df = pd.read_sql(sql='Select * FROM jobma_interviews', con=engine)
# Live interviews created by the catcher.
live_interview_df = pd.read_sql(sql='Select * FROM jobma_interviews_online', con=engine)
# Login records, used to find how long ago the catcher last logged in.
login_df = pd.read_sql(sql='Select * FROM jobma_login', con=engine)

# Specific Methods

**Fetch the required columns from each table, wrapped in functions so that they can be plugged into a Pipeline**

In [6]:
''' catcher_df '''

def fetching_catcher_df(df):
    """Build one row per parent catcher account with its sub-account count.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw catcher table. Must contain ``jobma_catcher_id``,
        ``jobma_catcher_parent``, ``is_premium``, ``subscription_status``
        and ``company_size``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``jobma_catcher_id``, ``is_premium``, ``subscription_status``,
        ``company_size`` and ``total_sub``, restricted to rows whose
        ``jobma_catcher_parent`` is 0 and whose ``subscription_status`` is
        not the string ``'0'``.
    """
    keep_cols = ['jobma_catcher_id', 'is_premium', 'subscription_status', 'company_size', 'total_sub']
    is_parent = df['jobma_catcher_parent'] == 0

    # Non-parent rows carry their parent's id, so counting those ids gives the
    # number of sub-accounts attached to each parent.
    sub_counts = df.loc[~is_parent, 'jobma_catcher_parent'].value_counts()

    parents_df = df.loc[is_parent].copy()
    # Parents without any sub-account are absent from sub_counts -> 0.
    parents_df['total_sub'] = parents_df['jobma_catcher_id'].map(sub_counts).fillna(0).astype(int)

    # Keep only the modelling columns (the original used `-` instead of `=`
    # here, so the selection never took effect).
    parents_df = parents_df[keep_cols]

    # Drop accounts whose subscription_status is the string '0'.
    parents_df = parents_df[parents_df['subscription_status'] != '0'].copy()
    parents_df['is_premium'] = parents_df['is_premium'].replace({'0': 0, '1': 1})

    return parents_df

In [7]:
''' wallet_df '''

def fetching_wallet_df(df):
    """Extract the de-duplicated ``is_unlimited`` flag per catcher.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw wallet table with an ``is_unlimited`` column and a catcher id
        column named ``catcher_id`` (or already ``jobma_catcher_id``).
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``jobma_catcher_id`` and ``is_unlimited``, with rows whose
        flag is the empty string removed, the strings ``'0'``/``'1'``
        mapped to ``0``/``1`` and exact duplicate rows dropped.
    """
    # Work on the function argument: the original assigned to the global's
    # name inside the function, which raised UnboundLocalError on first use.
    wallet = df.rename(columns={'catcher_id': 'jobma_catcher_id'})
    wallet = wallet[['jobma_catcher_id', 'is_unlimited']]

    # Empty-string flags carry no information, so drop those rows.
    wallet = wallet[wallet['is_unlimited'] != ''].copy()
    wallet['is_unlimited'] = wallet['is_unlimited'].replace({'0': 0, '1': 1})

    return wallet.drop_duplicates()

In [8]:
''' subscription_df '''

def fetching_subscription_df(df, conversion_rate=85.23):
    """Aggregate subscription purchases per catcher.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw subscription history with ``currency``, ``subscription_amount``
        and a catcher id column named ``catcher_id`` (or already
        ``jobma_catcher_id``). The input frame is not modified.
    conversion_rate : float, default 85.23
        Divisor applied to amounts whose ``currency`` is the string ``'1'``
        so that all amounts share one unit.
        NOTE(review): 85.23 looks like an INR-per-USD rate — confirm.

    Returns
    -------
    pandas.DataFrame
        One row per ``jobma_catcher_id`` with
        ``subscription_amount_in_dollars`` (sum of converted amounts) and
        ``number_of_subscriptions`` (count of non-null amounts).
    """
    # rename() returns a new frame, so the caller's data is left untouched.
    # (The original reassigned the global's name locally -> UnboundLocalError.)
    subs = df.rename(columns={'catcher_id': 'jobma_catcher_id'})

    # Cast first so the in-place division below cannot fail on integer or
    # Decimal-valued columns.
    subs['subscription_amount'] = subs['subscription_amount'].astype(float)
    subs.loc[subs['currency'] == '1', 'subscription_amount'] /= conversion_rate

    # groupby already yields one row per catcher, so no de-duplication is needed.
    return subs.groupby('jobma_catcher_id').agg(
        subscription_amount_in_dollars=('subscription_amount', 'sum'),
        number_of_subscriptions=('subscription_amount', 'count'),
    ).reset_index()

In [9]:
''' login_df '''

def fetching_login_df(df, reference_date=None):
    """Bucket each catcher's time since last login into ordered labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw login table with ``jobma_role_id``, ``jobma_user_id`` and
        ``jobma_last_login``. Only rows with ``jobma_role_id == 3`` are
        used (presumably the catcher role — confirm). Not modified.
    reference_date : datetime-like, optional
        Date the gap is measured against. Defaults to
        ``pd.Timestamp('today')``; pass a fixed value for reproducible runs.

    Returns
    -------
    pandas.DataFrame
        Columns ``jobma_catcher_id`` and ``days_since_last_login``, the
        latter a categorical with labels from 'Less than 1 Week' to
        'More than 1 Year'.
    """
    if reference_date is None:
        reference_date = pd.Timestamp('today')
    else:
        reference_date = pd.Timestamp(reference_date)

    # Filter + rename produce a new frame, so the caller's data is untouched.
    # (The original reassigned the global's name locally -> UnboundLocalError.)
    logins = df[df['jobma_role_id'] == 3].rename(columns={'jobma_user_id': 'jobma_catcher_id'})

    # Unparseable or missing timestamps become NaT and are treated as a very
    # old login (9999 days), which lands in the last bucket.
    last_login = pd.to_datetime(logins['jobma_last_login'], errors='coerce')
    days = (reference_date - last_login).dt.days
    # Clip at 0 so a timestamp slightly in the future (e.g. clock skew) is
    # binned as a recent login instead of falling outside every bin.
    days = days.fillna(9999).clip(lower=0).astype(int)

    # Binning
    bins = [-1, 7, 30, 90, 180, 365, float('inf')]
    labels = ['Less than 1 Week', '1-4 Weeks', '1-3 Months', '3-6 Months', '6-12 Months', 'More than 1 Year']
    logins['days_since_last_login'] = pd.cut(days, bins=bins, labels=labels)

    return logins[['jobma_catcher_id', 'days_since_last_login']]

In [10]:
def _count_per_catcher(df, count_col):
    """Return one row per catcher id with the number of rows it has in ``df``."""
    # rename() returns a new frame and ignores a missing 'catcher_id' column,
    # so the caller's frame is never modified.
    ids = df.rename(columns={'catcher_id': 'jobma_catcher_id'})['jobma_catcher_id']
    counts = pd.DataFrame({
        'jobma_catcher_id': ids,
        count_col: ids.map(ids.value_counts()),
    })
    return counts.drop_duplicates()


def fetching_features(invitation_df, job_posting_df, kit_df, recorded_interview_df, live_interview_df):
    """Count rows per catcher in each of the five activity tables.

    Each input must hold a catcher id column named either ``catcher_id`` or
    ``jobma_catcher_id``. The inputs are not modified (the previous
    implementation renamed and added columns on the caller's frames).

    Returns
    -------
    tuple of pandas.DataFrame
        Five frames, in argument order, each with ``jobma_catcher_id`` plus
        one count column: ``number_of_invitations``, ``job_posted``,
        ``number_of_kits``, ``number_of_recorded_interviews`` and
        ``number_of_live_interviews``.
    """
    return (
        _count_per_catcher(invitation_df, 'number_of_invitations'),
        _count_per_catcher(job_posting_df, 'job_posted'),
        _count_per_catcher(kit_df, 'number_of_kits'),
        _count_per_catcher(recorded_interview_df, 'number_of_recorded_interviews'),
        _count_per_catcher(live_interview_df, 'number_of_live_interviews'),
    )

In [11]:
def merging_df(df, feature_frames=None):
    """Left-join the per-catcher feature tables onto the base frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Base frame with one row per catcher and a ``jobma_catcher_id``
        column (the parent-catcher frame). It is not modified. The previous
        implementation ignored this argument and read a global
        ``parents_df`` instead.
    feature_frames : iterable of pandas.DataFrame, optional
        Frames to join, in order, each keyed by ``jobma_catcher_id``.
        When omitted, the notebook-level frames ``wallet_df``,
        ``subscription_df``, ``invitation_df``, ``job_posting_df``,
        ``kit_df``, ``recorded_interview_df``, ``live_interview_df`` and
        ``login_df`` are used, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns of every feature frame appended; catchers
        missing from a feature frame get NaN in that frame's columns.
    """
    if feature_frames is None:
        # Backward-compatible default: the prepared notebook-level frames.
        feature_frames = [
            wallet_df,
            subscription_df,
            invitation_df,
            job_posting_df,
            kit_df,
            recorded_interview_df,
            live_interview_df,
            login_df,
        ]

    final_df = df.copy()

    # Left join each table one by one so every base row is preserved.
    for frame in feature_frames:
        final_df = final_df.merge(frame, on='jobma_catcher_id', how='left')
    final_df = final_df.drop_duplicates()

    print(f"Final merged df shape is {final_df.shape}")

    return final_df

In [12]:
''' This Function is to fill all missing values '''

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values replaced by per-column defaults.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged feature frame. It is not modified. Columns listed below that
        are absent from ``df`` are simply skipped; columns not listed are
        left as they are.

    Returns
    -------
    pandas.DataFrame
        A new frame where NaNs are filled column by column: count and amount
        columns with 0, ``is_premium`` with 0, ``subscription_status`` and
        ``is_unlimited`` with 1, ``company_size`` with 'More than 1000' and
        ``days_since_last_login`` with 'More than 1 Year'.
    """
    fill_values = {
        'is_premium': 0,
        'subscription_status': 1,
        'company_size': 'More than 1000',
        'total_sub': 0,
        'is_unlimited': 1,
        'subscription_amount_in_dollars': 0,
        'number_of_subscriptions': 0,
        'number_of_invitations': 0,
        'job_posted': 0,
        'number_of_kits': 0,
        'number_of_recorded_interviews': 0,
        'number_of_live_interviews': 0,
        'days_since_last_login': 'More than 1 Year'
    }
    # Use the argument: the original read `final_df` before assigning it,
    # which raised UnboundLocalError. fillna() already returns a new frame,
    # so no explicit copy is needed.
    return df.fillna(fill_values)

In [13]:
''' Data Encoding '''

def ordinal_encoder(df):
    """Replace the two ordered categorical columns with 1-based ordinal codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``company_size`` and ``days_since_last_login``
        with no missing values (values are cast to ``str`` before encoding,
        so a NaN would become the unknown category ``'nan'``). Not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the two source columns and with ``company_size_ord``
        and ``days_since_last_login_ord`` appended. Codes start at 1 and
        follow the category orders defined below.

    Raises
    ------
    ValueError
        Raised by ``OrdinalEncoder`` when a value is not in the declared
        category lists.
    """
    ordinal_col = ['company_size', 'days_since_last_login']
    company_size_order = ['1-25', '26-100', '101-500', '500-1000', 'More than 1000']
    login_days_order = ['Less than 1 Week', '1-4 Weeks', '1-3 Months', '3-6 Months', '6-12 Months', 'More than 1 Year']

    total_order = [company_size_order, login_days_order]
    ordinal = OrdinalEncoder(categories=total_order)

    # OrdinalEncoder codes start at 0; shift so the lowest category is 1.
    encoded = ordinal.fit_transform(df[ordinal_col].astype(str)) + 1

    # `columns=` was misspelled `columms=` (TypeError), and the generated
    # names carried a stray leading space.
    encoded_df = pd.DataFrame(encoded, columns=[f'{col}_ord' for col in ordinal_col], index=df.index)

    # Operate on the argument rather than a global, and drop without
    # mutating the caller's frame.
    return pd.concat([df.drop(columns=ordinal_col), encoded_df], axis=1)

In [14]:
''' Log Transformation '''

def log_transform(df):
    """Apply ``log1p`` to the count/amount columns that exist in ``df``.

    Missing values are treated as 0 and negative values are raised to 0
    before the transform. A new frame is returned; ``df`` is left untouched.
    """
    candidate_columns = (
        'total_sub',
        'subscription_amount_in_dollars',
        'number_of_subscriptions',
        'number_of_invitations',
        'job_posted',
        'number_of_kits',
        'number_of_recorded_interviews',
        'number_of_live_interviews',
        'days_since_last_login',
    )

    transformed = df.copy()
    present = [name for name in candidate_columns if name in transformed.columns]
    for name in present:
        # NaN -> 0, negatives -> 0, then log(1 + x) so zero stays zero.
        non_negative = transformed[name].fillna(0).clip(lower=0)
        transformed[name] = np.log1p(non_negative)

    return transformed

In [15]:
# '''  '''
# compare_df = df.copy()
# df.drop('jobma_catcher_id', axis=1, inplace=True)

# Pipeline

# Fit Pipeline

In [None]:
# NOTE(review): no cell above assigns a module-level `df` (the name only appears as a
# function parameter/local) — confirm which prepared frame is meant to be used here.
X = df

In [None]:
type(X)

In [None]:
len(X)

In [None]:
# NOTE(review): `pipeline` is not defined in any cell above this one (the "Pipeline"
# section contains no code) — it must be built before this cell can run.
X_transformed = pipeline.fit_transform(X)

In [None]:
type(X_transformed)

In [None]:
len(X_transformed)

In [None]:
# Wrap the transformed features in a DataFrame so the feature count can be read from its shape.
X_df = pd.DataFrame(X_transformed)

In [None]:
X_df.shape[1]

# Convert into Tensor

In [None]:
# Choose the compute device explicitly so this cell runs on a fresh kernel
# (`device` was used here without being defined by any earlier cell).
# Falls back to the CPU when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_tensor = torch.tensor(X_transformed, dtype=torch.float32).to(device)

In [None]:
type(X_tensor)

# Dataset

In [53]:
class CustomDataset(Dataset):
    """In-memory dataset that serves samples from a single float32 tensor.

    Parameters
    ----------
    data : torch.Tensor or array-like
        Samples indexed along the first dimension. The values are always
        copied and stored as ``torch.float32``; a tensor input keeps its
        device.
    """

    def __init__(self, data):
        if isinstance(data, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # detach and copy explicitly instead (same result, no warning).
            self.data = data.detach().to(dtype=torch.float32, copy=True)
        else:
            self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        """Number of samples (size of the first dimension)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

In [None]:
# Wrap the feature tensor in CustomDataset so it can be indexed sample by sample.
data = CustomDataset(X_tensor)

# DataLoader

In [57]:
# Number of samples per mini-batch.
BATCH_SIZE = 16

In [None]:
# `DataLoader` is the class imported from torch.utils.data; the previous
# spelling `Dataloader` raised NameError.
# shuffle=True reorders the samples on every pass over the data.
dataloader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
len(data)

In [None]:
len(dataloader)

# Define a Model (AutoEncoder in this case)

In [65]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder.

    Parameters
    ----------
    input_shape : int
        Number of input features (also the width of the reconstruction).
    hidden_dims : sequence of int, default (256, 128, 64)
        Widths of the encoder's hidden layers, in order. The decoder uses
        the same widths in reverse.
    latent_dim : int, default 32
        Width of the bottleneck (encoded) representation.
    dropout : float, default 0.3
        Dropout probability applied after every hidden encoder layer.
        The decoder has no dropout.

    With the default arguments the layer layout, and therefore the
    ``state_dict`` keys, match the original fixed 256-128-64-32 network.
    """

    def __init__(self, input_shape, hidden_dims=(256, 128, 64), latent_dim=32, dropout=0.3):
        super().__init__()
        hidden_dims = tuple(hidden_dims)

        # Encoder: (Linear -> ReLU -> Dropout) per hidden width, then a plain
        # Linear projection to the latent space (no activation on the code).
        encoder_layers = []
        in_features = input_shape
        for width in hidden_dims:
            encoder_layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        encoder_layers.append(nn.Linear(in_features, latent_dim))
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: mirror of the encoder without dropout, ending in a Linear
        # layer that restores the input width (no output activation).
        decoder_layers = []
        in_features = latent_dim
        for width in reversed(hidden_dims):
            decoder_layers += [nn.Linear(in_features, width), nn.ReLU()]
            in_features = width
        decoder_layers.append(nn.Linear(in_features, input_shape))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, X):
        """Return ``(encoded, decoded)`` for the input batch ``X``."""
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
''' Initializing the Model '''

# The network's input width is the number of columns in X_df.
input_shape = X_df.shape[1]
model_1 = AutoEncoder(input_shape)
# Requires `device` to have been set by an earlier cell. Moves the model's parameters
# to that device; as the cell's last expression it also displays the module layout.
model_1.to(device)