In [1]:
import pandas as pd
import numpy as np
import os

# Torch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Scikit-Learn
# Data Encoding and Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Natural Language Processing (NLP)
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Word Embedding
import gensim
from gensim.models import Word2Vec

In [2]:
# Fetch the NLTK resources needed by the text preprocessing cells.
nltk.download('punkt')      # For tokenization (word_tokenize)
nltk.download('stopwords')  # For stopword removal (stopwords.words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

'cpu'

In [4]:
# Load the interviewee dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('IntervieweeDataset.csv')

In [5]:
# Show the first row to inspect the available columns.
df.head(1)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Confidence based on Introduction (English),Confidence based on the topic given,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not\n
0,parida,25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Impactful - Good confidence throughout the Int...,Guarded Confidence - Confident in some areas a...,...,3,2,1,1.0,1.0,11,7,3,42,No


In [6]:
# Drop every row that contains a missing value (modifies df in place).
df.dropna(inplace=True)

In [7]:
# Remove the ten columns at positions 8..17 by passing their labels explicitly.
# NOTE: positional, so re-running this cell removes a further ten columns.
df.drop(columns=df.columns[8:18], inplace=True)

In [8]:
# Remove two further columns by name.
df.drop(
    columns=[
        'Does the candidate has mother tongue influence while speaking english.',
        'How many slides candidate have submitted in PPT?',
    ],
    inplace=True,
)

In [9]:
# (rows, columns) remaining after the row and column drops.
df.shape

(18681, 40)

In [10]:
# Strip the stray trailing newline / tab characters from two column names.
df.rename(
    columns={
        'Whether joined the company or not\n': 'Whether joined the company or not',
        'What was the type of Role?\t': 'What was the type of Role?',
    },
    inplace=True,
)

In [11]:
# Columns that are label-encoded to integer codes.
categorical_cols = [
    'Gender',
    'Type of Graduation/Post Graduation',
    'Marital status',
    'Mode of interview given by candidate?',
    'Has acquaintance in Company and has spoken to him/her before applying?',
]

# Score columns that are standardised with the scaler.
numerical_cols = [
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]

# Free-text column that is turned into an embedding.
text_col = ['Comments']

# Label column.
target_col = 'Whether joined the company or not'

In [12]:
# Fit one LabelEncoder per categorical column (values cast to str first) and
# keep each fitted encoder, keyed by column name.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

In [13]:
# Inspect the distinct raw target labels.
df[target_col].unique()

array(['No', 'Yes', 'Not Joined', 'Joined'], dtype=object)

In [14]:
# Collapse the two alternative spellings onto the 'Yes' / 'No' labels.
df[target_col] = df[target_col].replace(
    {
        'Joined': 'Yes',
        'Not Joined': 'No',
    }
)

In [15]:
# Distinct target labels at this point in the notebook.
df[target_col].unique()

array(['No', 'Yes'], dtype=object)

In [16]:
# Encode the target variable as integer class ids.
# The fitted encoder is kept (like the per-column categorical encoders) so the
# ids can be mapped back to their labels later via target_encoder.classes_ or
# target_encoder.inverse_transform(...).
target_encoder = LabelEncoder()
df[target_col] = target_encoder.fit_transform(df[target_col].astype(str))

In [17]:
# Preview the first four rows of the frame in its current state.
df.head(4)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Has acquaintance in Company and has spoken to him/her before applying?,Candidate Status,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not
0,parida,25,0,134,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Experienced in non client facing(equal to or m...,...,3,2,1,1.0,1.0,11,7,3,42,0
1,shreej,29,0,60,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Lateral(2021 and before with (less than 6 mont...,...,3,3,3,3.0,3.0,12,9,9,60,0
2,ms6744,27,0,13,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,1,Fresher(only 2022 grad),...,3,3,3,1.0,3.0,10,9,7,52,1
3,aswalu,22,1,13,2,0,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Fresher(only 2022 grad),...,3,3,3,3.0,3.0,12,9,9,60,1


In [18]:
# Scaler applied to the numerical score columns.
scaler = StandardScaler()
scaler

In [19]:
# Fit the scaler on the numerical columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fit on every row; if a train/validation split is
# introduced later, fit it on the training rows only — confirm.
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [20]:
# Preview the first four rows to check the scaled score columns.
df.head(4)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Has acquaintance in Company and has spoken to him/her before applying?,Candidate Status,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not
0,parida,25,0,134,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Experienced in non client facing(equal to or m...,...,3,2,1,1.0,1.0,0.95149,-0.022276,-1.320677,-0.198863,0
1,shreej,29,0,60,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Lateral(2021 and before with (less than 6 mont...,...,3,3,3,3.0,3.0,1.415087,1.256178,1.264403,1.509342,0
2,ms6744,27,0,13,2,1,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,1,Fresher(only 2022 grad),...,3,3,3,1.0,3.0,0.487893,1.256178,0.402709,0.750139,1
3,aswalu,22,1,13,2,0,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,0,Fresher(only 2022 grad),...,3,3,3,3.0,3.0,1.415087,1.256178,1.264403,1.509342,1


In [21]:
# Text preprocessing resources: English stopwords (as a set for fast membership
# tests) and a Porter stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [22]:
def formatting(text):
    """Tokenise ``text`` and return its stemmed, alphabetic, non-stopword tokens.

    The input is converted with ``str()`` and lower-cased before tokenisation,
    so any value is accepted. Tokens that are not purely alphabetic or that
    appear in ``stop_words`` are discarded; the rest are stemmed.
    """
    lowered = str(text).lower()
    stems = []
    for token in word_tokenize(lowered):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

In [23]:
# Replace each comment with the token list returned by formatting.
# NOTE: this overwrites the column, so re-running the cell would pass token
# lists (not raw text) back into formatting.
df['Comments'] = df['Comments'].apply(formatting)

In [24]:
# Train a Word2Vec model on the tokenised comments.
w2v_model = Word2Vec(
    sentences=df['Comments'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x1c7deda4050>

In [25]:
def get_text_embedding(text):
    """Return the mean Word2Vec vector of a comment.

    Parameters
    ----------
    text : str or list/tuple of str
        Either raw comment text, which is tokenised with ``formatting``, or a
        comment that has already been tokenised, which is used as-is.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2v_model.wv.vector_size``: the element-wise
        mean of the vectors of all tokens present in the model vocabulary,
        or an all-zero vector when no token is in the vocabulary.
    """
    if isinstance(text, (list, tuple)):
        # Already tokenised. Passing it through `formatting` again would
        # tokenise the str() of the list and stem the stems a second time, so
        # the resulting tokens could no longer be relied on to match the
        # vocabulary the model was trained on.
        words = list(text)
    else:
        words = formatting(text)

    vectors = w2v_model.wv
    word_vectors = [vectors[word] for word in words if word in vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Size the fallback from the model instead of hard-coding the dimension.
    return np.zeros(vectors.vector_size)

In [26]:
# Replace each entry of the Comments column with the vector returned by get_text_embedding.
df['Comments'] = df['Comments'].apply(get_text_embedding)

In [27]:
# Stack the per-row embedding vectors into a single 2-D array (one row per comment).
df_embeddings = np.vstack(df['Comments'].tolist())

## Convert Data into Tensors

In [33]:
# One tensor per feature group, plus the label tensor.
X_categorical = torch.tensor(df[categorical_cols].to_numpy(), dtype=torch.long)
X_numerical = torch.tensor(df[numerical_cols].to_numpy(), dtype=torch.float32)
X_text = torch.tensor(df_embeddings, dtype=torch.float32)
y = torch.tensor(df[target_col].to_numpy(), dtype=torch.long)

## Dataset

In [34]:
class CustomDataset(Dataset):
    """Dataset that serves aligned categorical, numerical, text and label items.

    The four containers are stored as given and indexed with the same ``idx``;
    the dataset length is the length of ``y``.
    """

    def __init__(self, X_categorical, X_numerical, X_text, y):
        """Store the three feature containers and the labels."""
        self.X_categorical, self.X_numerical = X_categorical, X_numerical
        self.X_text, self.y = X_text, y

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(categorical, numerical, text, label)`` for position ``idx``."""
        categorical_item = self.X_categorical[idx]
        numerical_item = self.X_numerical[idx]
        text_item = self.X_text[idx]
        label = self.y[idx]
        return categorical_item, numerical_item, text_item, label

In [35]:
# Wrap the feature tensors and the label tensor in a single Dataset.
dataset = CustomDataset(X_categorical, X_numerical, X_text, y)

## DataLoader

In [36]:
# CPU count reported by the OS (os.cpu_count() returns None when it cannot be determined).
workers = os.cpu_count()
workers

8

In [37]:
# Number of samples per mini-batch.
BATCH_SIZE = 32

In [38]:
# Load batches in the main process (num_workers=0).
# The dataset only indexes tensors that are already in memory, so worker
# processes add start-up and inter-process copy overhead without speeding
# anything up. On Windows, workers are started with "spawn" and must re-import
# the Dataset class, which fails or hangs for a class defined inside a notebook.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [39]:
# Dataset object and its number of samples.
dataset, len(dataset)

(<__main__.CustomDataset at 0x1c7df175100>, 18681)

In [40]:
# DataLoader object and its number of batches.
dataloader, len(dataloader)

(<torch.utils.data.dataloader.DataLoader at 0x1c7df167b30>, 584)

## Build a Neural Network

In [None]:
class HybridRecommender(nn.Module):
    """Network module for the categorical, numerical and text features.

    Only the constructor skeleton exists so far; no layers are defined yet.
    """

    def __init__(self, num_categorical, embedding_dim, num_numerical, text_embedding_dim):
        """Initialise the module.

        The constructor must be named ``__init__`` (it was spelled ``__init``);
        otherwise Python never calls it and ``nn.Module`` is not set up with
        these arguments.

        NOTE(review): the four size arguments are accepted but not used yet —
        presumably they will size the layers; confirm when the layers are added.
        """
        super().__init__()
        
        