In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Torch 
import torch
import torch.nn as nn
from torch.utils.data import random_split

# Scikit-Learn
# Data Encoding and Scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Natural Language Processing(NLP)
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Word Embedding
import gensim
from gensim.models import Word2Vec

In [2]:
nltk.download('punkt')      # For tokenization
nltk.download('stopwords')  # For stopword removal

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
df = pd.read_csv('IntervieweeDataset.csv')

In [5]:
df.head(1)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Confidence based on Introduction (English),Confidence based on the topic given,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not\n
0,parida,25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Impactful - Good confidence throughout the Int...,Guarded Confidence - Confident in some areas a...,...,3,2,1,1.0,1.0,11,7,3,42,No


In [6]:
df.shape

(21256, 52)

In [7]:
df.dropna(inplace=True)

In [8]:
df.shape

(18681, 52)

In [9]:
df.drop(df.iloc[:,8:18], axis=1, inplace=True)

In [10]:
df.drop(['Does the candidate has mother tongue influence while speaking english.', 'How many slides candidate have submitted in PPT?'], axis=1, inplace=True)

In [11]:
df.shape

(18681, 40)

In [12]:
df.rename(columns={'Type of Graduation/Post Graduation':'Education', 'Mode of interview given by candidate?':'Mode of Interview', 'Has acquaintance in Company and has spoken to him/her before applying?':'Acquaintance and Referral'}, inplace=True)

In [13]:
df.rename(columns={'Whether joined the company or not\n': 'Whether joined the company or not', 'What was the type of Role?\t': 'What was the type of Role?'}, inplace=True)

In [14]:
df.rename(columns={"But, my child's exam are going on now, so we will keep the counselling session after the exams get over.(Time: Favourable pitch: Counsellor hype)":"But, my child's exam are going on now, so we will keep the counselling session after the exams get over"}, inplace=True)

In [15]:
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [16]:
df['Whether joined the company or not'].unique()

array(['No', 'Yes', 'Not Joined', 'Joined'], dtype=object)

In [17]:
df['Whether joined the company or not'] = df['Whether joined the company or not'].replace({'Joined':'Yes', 'Not Joined':'No'})

In [18]:
df['Whether joined the company or not'] = df['Whether joined the company or not'].map({'Yes': 1, 'No': 0}).astype(int)

In [19]:
df['Whether joined the company or not'].unique()

array([0, 1])

In [20]:
df['Age'].unique()

array(['25', '29', '27', '22', '23', '24', '26', '21', '30', '20', '31',
       '28', '32+', '32'], dtype=object)

In [21]:
df['Age'] = df['Age'].astype(str).str.replace('+', '', regex=False).astype(int)

In [22]:
df['Age'].unique()

array([25, 29, 27, 22, 23, 24, 26, 21, 30, 20, 31, 28, 32])

In [23]:
bins = [18, 22, 25, 28, 32, 35, float('inf')]
labels = ['18-22', '23-25', '26-28', '29-32', '33-35', '35+']

In [24]:
df['Age'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

In [25]:
df['Age'].unique()

['23-25', '29-32', '26-28', '18-22']
Categories (6, object): ['18-22' < '23-25' < '26-28' < '29-32' < '33-35' < '35+']

In [26]:
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [27]:
df.shape

(18681, 40)

In [28]:
df['id'] = df.index

In [29]:
df.head(2)

Unnamed: 0,Name,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,...,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not,id
0,parida,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,...,2,1,1.0,1.0,11,7,3,42,0,0
1,shreej,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,...,3,3,3.0,3.0,12,9,9,60,0,1


# New Dataframe for only Numerical Operations (excluding Name column from it)

In [30]:
numerical_df = df.copy()
numerical_df.drop(['id',
                   'Name',
                   'Comments',
                   'RedFlags Comments in Interview',
                   'Call-pitch Elements used during the call Sales Scenario',
                   "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
                  'Let me discuss it with my child',
                   'Whether joined the company or not',
                   "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",'Role Location to be given to the candidate'],
                  axis=1,inplace=True)

# Data Scaling
The StandardScaler from sklearn.preprocessing transforms data to have:

Mean = 0 ,
Standard Deviation = 1

In [31]:
scaler = StandardScaler()
scaler

In [32]:
temp_list = ['Confidence based on Introduction (English).1',
       'Confidence based on the topic given  .1',
       'Confidence Based on the PPT Question.1',
       'Confidence based on the sales scenario.1',
       'Structured Thinking (In regional only).1',
       'Structured Thinking Based on the PPT Question.1',
       'Structured Thinking( Call pitch).1',
       'Regional fluency based on the topic given  .1',
       'Regional fluency Based on the PPT Question.1',
       'Regional fluency based on the  sales scenario.1', 'Confidence Score',
       'Structured Thinking Score', 'Regional Fluency Score', 'Total Score']

In [33]:
numerical_df[temp_list] = scaler.fit_transform(numerical_df[temp_list])
print("Means after scaling:\n", numerical_df[temp_list].mean().round(5))
print("Standard deviations after scaling:\n", numerical_df[temp_list].std().round(5))

Means after scaling:
 Confidence based on Introduction (English).1      -0.0
Confidence based on the topic given  .1           -0.0
Confidence Based on the PPT Question.1            -0.0
Confidence based on the sales scenario.1          -0.0
Structured Thinking (In regional only).1           0.0
Structured Thinking Based on the PPT Question.1   -0.0
Structured Thinking( Call pitch).1                -0.0
Regional fluency based on the topic given  .1      0.0
Regional fluency Based on the PPT Question.1      -0.0
Regional fluency based on the  sales scenario.1   -0.0
Confidence Score                                   0.0
Structured Thinking Score                         -0.0
Regional Fluency Score                             0.0
Total Score                                        0.0
dtype: float64
Standard deviations after scaling:
 Confidence based on Introduction (English).1       1.00003
Confidence based on the topic given  .1            1.00003
Confidence Based on the PPT Question.1 

In [34]:
numerical_df.head()

Unnamed: 0,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,Last Fixed CTC (lakhs),...,Structured Thinking (In regional only).1,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score
0,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,5-5.99,...,-0.618598,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863
1,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,Fresher,...,0.985514,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342
2,26-28,Female,B.E / B-Tech,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Yes,Fresher(only 2022 grad),Fresher,...,0.985514,1.020654,1.006713,0.979581,-1.063625,1.060405,0.487893,1.256178,0.402709,0.750139
3,18-22,Male,B.E / B-Tech,Unmarried,Laptop,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Fresher(only 2022 grad),Fresher,...,0.985514,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342
4,18-22,Male,BA/MA,Unmarried,Mobile,Proceed with the Interview,Taking gaps while speaking due to lack of cont...,No,Fresher(only 2022 grad),Fresher,...,-0.618598,-0.55603,-1.855535,0.979581,-1.063625,1.060405,0.487893,-1.300729,0.402709,-0.009063


In [35]:
numerical_df.shape

(18681, 31)

# Data Encoding

In [36]:
numerical_df['Role acceptance'].value_counts()

Role acceptance
Emphatic Yes                                        12456
Yes : Think and says yes.(Shows some hesitation)     5475
No                                                    750
Name: count, dtype: int64

In [37]:
transformer = ColumnTransformer(transformers = [
    ('t1', OneHotEncoder(sparse_output=False, drop='first'), ['Gender', 'Experienced Candidate (Nature of work)', 'What was the type of Role?', 'Currently Employed', 'Marital status', 'Mode of Interview', 'Pre Interview Check', 'Fluency in English based on introduction', 'Acquaintance and Referral', 'Candidate Status', 'Education']),
    ('t2', OrdinalEncoder(categories=[['Fresher','0-1.99','2-2.99','3-3.99','4-4.99','5-5.99','6-6.99','7+']]), ['Last Fixed CTC (lakhs) ']),
    ('t3', OrdinalEncoder(categories=[['No - Want Specific Centre Location Only','Yes - Anywhere Within a City','Yes - Anywhere Within a State','Yes - Anywhere in PAN India']]), ['Candidate is willing to relocate']),
    ('t4', OrdinalEncoder(categories=[['Reject','Borderline Reject','Borderline Select','Select','Premium Select']]), ['Interview Verdict']),
    ('t5', OrdinalEncoder(categories=[['No','Yes : Think and says yes.(Shows some hesitation)','Emphatic Yes']]), ['Role acceptance']),
    ('t6', OrdinalEncoder(categories=[['Fresher(<6 months)','6-11.99 Months','12-17.99 Months','18-23.99 Months','24-29.99 Months','30-35.99 Months','36-47.99 Months','48+ Months']]), ['Experienced candidate - (Experience in months)']),
    ('t7', OrdinalEncoder(categories=[['18-22', '23-25', '26-28', '29-32', '33-35', '35+']]), ['Age'])
], remainder='passthrough')

In [38]:
numerical_df = transformer.fit_transform(numerical_df)

In [39]:
type(numerical_df)

numpy.ndarray

In [40]:
features_names = transformer.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [41]:
numerical_df.shape

(18681, 1897)

In [42]:
numerical_df.head(2)

Unnamed: 0,t1__Gender_Male,t1__Experienced Candidate (Nature of work)_ DISPATCHER IN US COMPANY,t1__Experienced Candidate (Nature of work)_ Laboratory Attendant,t1__Experienced Candidate (Nature of work)_ Network Engineer,t1__Experienced Candidate (Nature of work)_ no,t1__Experienced Candidate (Nature of work)_ proctor,t1__Experienced Candidate (Nature of work)_-,t1__Experienced Candidate (Nature of work)_.,t1__Experienced Candidate (Nature of work)_/,t1__Experienced Candidate (Nature of work)_10 lakhs lone,...,remainder__Structured Thinking (In regional only).1,remainder__Structured Thinking Based on the PPT Question.1,remainder__Structured Thinking( Call pitch).1,remainder__Regional fluency based on the topic given .1,remainder__Regional fluency Based on the PPT Question.1,remainder__Regional fluency based on the sales scenario.1,remainder__Confidence Score,remainder__Structured Thinking Score,remainder__Regional Fluency Score,remainder__Total Score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.618598,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.985514,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342


# Principal Component Analysis(PCA) for Dimensionality Reduction
**Required to reduce the number of features**

In [43]:
pca = PCA(n_components=0.95)
numerical_pca = pca.fit_transform(numerical_df)

In [44]:
type(numerical_pca)

numpy.ndarray

In [45]:
features_names = pca.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_pca, columns=features_names)

In [46]:
type(numerical_df)

pandas.core.frame.DataFrame

In [47]:
numerical_df.shape

(18681, 22)

In [48]:
numerical_df.head()

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca12,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21
0,-0.041491,1.821674,-1.018671,-1.629676,2.121754,-0.930258,-3.278468,2.380039,1.188027,1.186568,...,-1.022521,-0.207322,1.095852,0.59949,0.066089,-0.435492,0.144108,-0.229863,-0.576524,-0.727757
1,2.934609,-2.274258,1.599145,0.531347,1.407868,1.304165,-1.878531,-1.936549,2.169131,0.738025,...,-0.076328,-0.851957,0.278085,-0.018831,0.463886,-0.000188,-0.03203,0.022952,0.04318,-0.381693
2,2.152953,-2.07201,-1.080818,-0.274634,-0.959109,0.319263,1.688017,-0.449569,1.279477,0.48779,...,-1.096375,0.497548,-0.120831,0.538293,0.180238,-0.3136,-0.807497,0.636418,0.757597,-0.503597
3,3.894837,-3.404232,-0.093392,1.141162,0.336833,-0.232041,0.178308,0.197593,-0.383692,-0.771922,...,0.007924,0.755882,-0.086455,-0.036561,0.26953,-0.189405,-0.086062,-0.346854,0.054016,0.15745
4,-0.945086,-1.526782,1.592172,-1.561385,0.47781,-1.553965,1.63108,-0.181753,-0.040788,-0.144976,...,-1.072113,-0.128938,0.135667,0.187743,-0.484645,0.297314,-0.579843,-0.016847,-0.168577,0.153979


In [49]:
numerical_df.isnull().sum()

pca0     0
pca1     0
pca2     0
pca3     0
pca4     0
pca5     0
pca6     0
pca7     0
pca8     0
pca9     0
pca10    0
pca11    0
pca12    0
pca13    0
pca14    0
pca15    0
pca16    0
pca17    0
pca18    0
pca19    0
pca20    0
pca21    0
dtype: int64

In [50]:
df = df[['id',
                   'Name',
                   'Comments',
                   'RedFlags Comments in Interview',
                   'Call-pitch Elements used during the call Sales Scenario',
                   "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
                  'Let me discuss it with my child',
                'Whether joined the company or not',
                   "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",'Role Location to be given to the candidate']]

In [51]:
df['Whether joined the company or not'].unique()

array([0, 1])

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18681 entries, 0 to 21255
Data columns (total 10 columns):
 #   Column                                                                                                                   Non-Null Count  Dtype 
---  ------                                                                                                                   --------------  ----- 
 0   id                                                                                                                       18681 non-null  int64 
 1   Name                                                                                                                     18681 non-null  object
 2   Comments                                                                                                                 18681 non-null  object
 3   RedFlags Comments in Interview                                                                                           18681 non-null  object
 4

In [53]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(lambda x:x.split())

In [54]:
df.head()

Unnamed: 0,id,Name,Comments,RedFlags Comments in Interview,Call-pitch Elements used during the call Sales Scenario,"But, my child's exam are going on now, so we will keep the counselling session after the exams get over",Let me discuss it with my child,Whether joined the company or not,Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.,Role Location to be given to the candidate
0,0,[parida],"[Lipsa, is, 25, female, from, Orissa,, Family,...","[At, least, Graduated, (, not, 12th, Pass, or,...","[Purpose, of, Call, (Book, a, Counselling, Ses...","[Urgency, using, Time]","[None, of, the, above]",0,"[None, of, the, above]",[Bangalore]
1,1,[shreej],"[29, yo, /, female, /, unmarried, /, MSc, Fina...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Asking, Questions]","[None, of, the, above]",0,"[Non, chargeable, session]",[Delhi]
2,2,[ms6744],[nm],"[Not, working, currently, and, ready, to, join...","[Introduction, (Self, Intro,Company, Name)]","[Urgency, using, Time]","[Decision, Making:, Major, decision, of, child...",1,"[Non, chargeable, session]",[Chennai]
3,3,[aswalu],"[Final, Interview, Done, 22YRS, //UTTARAKHAND,...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Urgency, using, Time,, Urgency, using, situat...","[None, of, the, above]",1,"[Value, creation(Nothing, comes, for, free)]",[Delhi]
4,4,[aniket],"[Aniket, is, 22, male, from, Bhopal,, BA, Grad...","[Age, is, below, 32.11,, At, least, Graduated,...","[None, of, the, Above]","[Urgency, by, creating, counsellor, Hype]","[Anticipation, of, objection, from, child's, e...",0,"[Value, creation(Nothing, comes, for, free)]",[Bangalore]


In [55]:
df.columns

Index(['id', 'Name', 'Comments', 'RedFlags Comments in Interview',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child', 'Whether joined the company or not',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role Location to be given to the candidate'],
      dtype='object')

In [56]:
df['tags'] = df['Name'] + df['Comments'] + df['RedFlags Comments in Interview'] + df['Call-pitch Elements used during the call Sales Scenario'] + df["But, my child's exam are going on now, so we will keep the counselling session after the exams get over"] + df['Let me discuss it with my child'] + df["Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app."] + df['Role Location to be given to the candidate']

In [57]:
df.head()

Unnamed: 0,id,Name,Comments,RedFlags Comments in Interview,Call-pitch Elements used during the call Sales Scenario,"But, my child's exam are going on now, so we will keep the counselling session after the exams get over",Let me discuss it with my child,Whether joined the company or not,Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.,Role Location to be given to the candidate,tags
0,0,[parida],"[Lipsa, is, 25, female, from, Orissa,, Family,...","[At, least, Graduated, (, not, 12th, Pass, or,...","[Purpose, of, Call, (Book, a, Counselling, Ses...","[Urgency, using, Time]","[None, of, the, above]",0,"[None, of, the, above]",[Bangalore],"[parida, Lipsa, is, 25, female, from, Orissa,,..."
1,1,[shreej],"[29, yo, /, female, /, unmarried, /, MSc, Fina...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Asking, Questions]","[None, of, the, above]",0,"[Non, chargeable, session]",[Delhi],"[shreej, 29, yo, /, female, /, unmarried, /, M..."
2,2,[ms6744],[nm],"[Not, working, currently, and, ready, to, join...","[Introduction, (Self, Intro,Company, Name)]","[Urgency, using, Time]","[Decision, Making:, Major, decision, of, child...",1,"[Non, chargeable, session]",[Chennai],"[ms6744, nm, Not, working, currently, and, rea..."
3,3,[aswalu],"[Final, Interview, Done, 22YRS, //UTTARAKHAND,...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Urgency, using, Time,, Urgency, using, situat...","[None, of, the, above]",1,"[Value, creation(Nothing, comes, for, free)]",[Delhi],"[aswalu, Final, Interview, Done, 22YRS, //UTTA..."
4,4,[aniket],"[Aniket, is, 22, male, from, Bhopal,, BA, Grad...","[Age, is, below, 32.11,, At, least, Graduated,...","[None, of, the, Above]","[Urgency, by, creating, counsellor, Hype]","[Anticipation, of, objection, from, child's, e...",0,"[Value, creation(Nothing, comes, for, free)]",[Bangalore],"[aniket, Aniket, is, 22, male, from, Bhopal,, ..."


In [58]:
df.loc[1,'tags']

['shreej',
 '29',
 'yo',
 '/',
 'female',
 '/',
 'unmarried',
 '/',
 'MSc',
 'Finance',
 'UK',
 '2022',
 '/',
 'recently',
 'relocated',
 'to',
 'India',
 'just',
 '20',
 'days',
 'ago',
 '/',
 'Odissa',
 '/',
 'father',
 '-',
 'doctor',
 '/',
 'bro',
 '-',
 'software',
 'engineer',
 '/',
 'pursuing',
 'internship',
 'in',
 'investment',
 'banking',
 '-',
 'ends',
 'in',
 'next',
 'week',
 '/',
 'worked',
 'as',
 'a',
 'teacher',
 '/',
 'good',
 'story',
 'teller',
 '-',
 'fluent',
 'in',
 'hindi',
 'and',
 'english',
 '-',
 'good',
 'energy',
 '/',
 'also',
 'prepared',
 'for',
 'UPSC',
 'and',
 'GMAT',
 '-',
 'purchased',
 'a',
 'course',
 'from',
 "byju's",
 'earlier',
 '/',
 'cracked',
 'GMAT',
 'and',
 'got',
 'business',
 'school',
 'as',
 'well/asking',
 'ques',
 'in',
 'sales',
 'scenario',
 '-',
 '2/5',
 'Not',
 'Rehire',
 '(CDT',
 'have',
 'not',
 'joined',
 'byjus',
 'in',
 'sales',
 'before),',
 'Not',
 'Interviewed',
 'in',
 'Byjus',
 'in',
 'sales',
 'role',
 'in',
 'last

In [59]:
df = df[['id','Name','tags','Whether joined the company or not']]

In [60]:
df.head(2)

Unnamed: 0,id,Name,tags,Whether joined the company or not
0,0,[parida],"[parida, Lipsa, is, 25, female, from, Orissa,,...",0
1,1,[shreej],"[shreej, 29, yo, /, female, /, unmarried, /, M...",0


## Stemming
**Note: Use Lemmatization for more accuracy**

To normalize words and reduce them to their root forms, we will apply **stemming**. This helps in handling variations of words and improves text processing efficiency for machine learning models.  
(e.g., "running" → "run")

In [61]:
ps = PorterStemmer()
ps

<PorterStemmer>

In [62]:
stop_words = set(stopwords.words('english'))

In [63]:
def stem(text):
    y = []

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)    

In [64]:
''' Applying Stemming '''

df['tags'] = df['tags'].apply(stem)

In [65]:
df.head()

Unnamed: 0,id,Name,tags,Whether joined the company or not
0,0,[parida],"parida lipsa is 25 femal from orissa, famili b...",0
1,1,[shreej],shreej 29 yo / femal / unmarri / msc financ uk...,0
2,2,[ms6744],ms6744 nm not work current and readi to join t...,1
3,3,[aswalu],aswalu final interview done 22yr //uttarakhand...,1
4,4,[aniket],"aniket aniket is 22 male from bhopal, ba grad ...",0


In [66]:
df.loc[1,'tags']

"shreej 29 yo / femal / unmarri / msc financ uk 2022 / recent reloc to india just 20 day ago / odissa / father - doctor / bro - softwar engin / pursu internship in invest bank - end in next week / work as a teacher / good stori teller - fluent in hindi and english - good energi / also prepar for upsc and gmat - purchas a cours from byju' earlier / crack gmat and got busi school as well/ask que in sale scenario - 2/5 not rehir (cdt have not join byju in sale before), not interview in byju in sale role in last 90days, laptop and wifi are available, age is below 32.11, at least graduat ( not 12th pass or diploma or final year student), will to reloc at given locat for ssp, comfort with the stipend & allow dure training, not work current and readi to join the r3 process introduct (self intro,compani name), purpos of call (book a counsel session), need gener - by ask que like student class, perform etc.. ask question none of the abov non chargeabl session delhi"

In [67]:
df.isnull().sum()

id                                   0
Name                                 0
tags                                 0
Whether joined the company or not    0
dtype: int64

In [68]:
def stopwords_removal(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [69]:
''' Applying Stopwords Removal '''

df['tags'] = df['tags'].apply(stopwords_removal)

In [70]:
df.head(2)

Unnamed: 0,id,Name,tags,Whether joined the company or not
0,0,[parida],"parida lipsa 25 femal orissa , famili bg - fat...",0
1,1,[shreej],shreej 29 yo / femal / unmarri / msc financ uk...,0


In [71]:
df.loc[1,'tags']

"shreej 29 yo / femal / unmarri / msc financ uk 2022 / recent reloc india 20 day ago / odissa / father - doctor / bro - softwar engin / pursu internship invest bank - end next week / work teacher / good stori teller - fluent hindi english - good energi / also prepar upsc gmat - purchas cours byju ' earlier / crack gmat got busi school well/ask que sale scenario - 2/5 rehir ( cdt join byju sale ) , interview byju sale role last 90days , laptop wifi available , age 32.11 , least graduat ( 12th pass diploma final year student ) , reloc given locat ssp , comfort stipend & allow dure training , work current readi join r3 process introduct ( self intro , compani name ) , purpos call ( book counsel session ) , need gener - ask que like student class , perform etc .. ask question none abov non chargeabl session delhi"

In [72]:
def remove_duplicates(text):
    words = text.split()
    seen = set()
    unique_words = []

    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)

    return " ".join(unique_words)        

In [73]:
df['tags'] = df['tags'].apply(remove_duplicates)

In [74]:
df['tags'] = df['tags'].str.replace('/', '', regex=False)

In [75]:
df.loc[1,'tags']

"shreej 29 yo  femal unmarri msc financ uk 2022 recent reloc india 20 day ago odissa father - doctor bro softwar engin pursu internship invest bank end next week work teacher good stori teller fluent hindi english energi also prepar upsc gmat purchas cours byju ' earlier crack got busi school wellask que sale scenario 25 rehir ( cdt join ) , interview role last 90days laptop wifi available age 32.11 least graduat 12th pass diploma final year student given locat ssp comfort stipend & allow dure training current readi r3 process introduct self intro compani name purpos call book counsel session need gener ask like class perform etc .. question none abov non chargeabl delhi"

In [76]:
df = df.join(numerical_df, how='inner')

In [77]:
df.shape

(16386, 26)

In [78]:
df.head(5)

Unnamed: 0,id,Name,tags,Whether joined the company or not,pca0,pca1,pca2,pca3,pca4,pca5,...,pca12,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21
0,0,[parida],"parida lipsa 25 femal orissa , famili bg - fat...",0,-0.041491,1.821674,-1.018671,-1.629676,2.121754,-0.930258,...,-1.022521,-0.207322,1.095852,0.59949,0.066089,-0.435492,0.144108,-0.229863,-0.576524,-0.727757
1,1,[shreej],shreej 29 yo femal unmarri msc financ uk 2022...,0,2.934609,-2.274258,1.599145,0.531347,1.407868,1.304165,...,-0.076328,-0.851957,0.278085,-0.018831,0.463886,-0.000188,-0.03203,0.022952,0.04318,-0.381693
2,2,[ms6744],ms6744 nm work current readi join r3 process i...,1,2.152953,-2.07201,-1.080818,-0.274634,-0.959109,0.319263,...,-1.096375,0.497548,-0.120831,0.538293,0.180238,-0.3136,-0.807497,0.636418,0.757597,-0.503597
3,3,[aswalu],aswalu final interview done 22yr uttarakhand ...,1,3.894837,-3.404232,-0.093392,1.141162,0.336833,-0.232041,...,0.007924,0.755882,-0.086455,-0.036561,0.26953,-0.189405,-0.086062,-0.346854,0.054016,0.15745
4,4,[aniket],"aniket 22 male bhopal , ba grad 21 famili bg -...",0,-0.945086,-1.526782,1.592172,-1.561385,0.47781,-1.553965,...,-1.072113,-0.128938,0.135667,0.187743,-0.484645,0.297314,-0.579843,-0.016847,-0.168577,0.153979


In [79]:
df.loc[1,'tags']

"shreej 29 yo  femal unmarri msc financ uk 2022 recent reloc india 20 day ago odissa father - doctor bro softwar engin pursu internship invest bank end next week work teacher good stori teller fluent hindi english energi also prepar upsc gmat purchas cours byju ' earlier crack got busi school wellask que sale scenario 25 rehir ( cdt join ) , interview role last 90days laptop wifi available age 32.11 least graduat 12th pass diploma final year student given locat ssp comfort stipend & allow dure training current readi r3 process introduct self intro compani name purpos call book counsel session need gener ask like class perform etc .. question none abov non chargeabl delhi"

In [80]:
experimental_df = df.copy()
experimental_df.drop('Name', axis=1, inplace=True)

In [81]:
experimental_df.head(2)

Unnamed: 0,id,tags,Whether joined the company or not,pca0,pca1,pca2,pca3,pca4,pca5,pca6,...,pca12,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21
0,0,"parida lipsa 25 femal orissa , famili bg - fat...",0,-0.041491,1.821674,-1.018671,-1.629676,2.121754,-0.930258,-3.278468,...,-1.022521,-0.207322,1.095852,0.59949,0.066089,-0.435492,0.144108,-0.229863,-0.576524,-0.727757
1,1,shreej 29 yo femal unmarri msc financ uk 2022...,0,2.934609,-2.274258,1.599145,0.531347,1.407868,1.304165,-1.878531,...,-0.076328,-0.851957,0.278085,-0.018831,0.463886,-0.000188,-0.03203,0.022952,0.04318,-0.381693


In [82]:
experimental_df.columns

Index(['id', 'tags', 'Whether joined the company or not', 'pca0', 'pca1',
       'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9', 'pca10',
       'pca11', 'pca12', 'pca13', 'pca14', 'pca15', 'pca16', 'pca17', 'pca18',
       'pca19', 'pca20', 'pca21'],
      dtype='object')

In [83]:
y = experimental_df['Whether joined the company or not']
X = experimental_df
X.drop('Whether joined the company or not', axis=1, inplace=True)

## Data Splitting

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
print(f'Length of X_train: {len(X_train)}')
print(f'Length of X_test: {len(X_test)}')
print(f'Length of y_train: {len(y_train)}')
print(f'Length of y_test: {len(y_test)}')

Length of X_train: 13108
Length of X_test: 3278
Length of y_train: 13108
Length of y_test: 3278


In [89]:
type(X_test)

pandas.core.frame.DataFrame

## Word Embeddings
**Note: Use Contextual Embeddings for More Accuracy**

To represent words in a numerical format while preserving their meaning and relationships, we will apply **word embeddings**. This helps in capturing semantic similarities and improving machine learning model performance.
(e.g., "king" → similar to "queen" but different from "apple").

In [None]:
# sentences = [tag.split() for tag in experimental_df['tags']]
# print(sentences)

In [None]:
# # Train Word2Vec model
# w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# w2v_model

In [None]:
# Function to convert text into an average word embedding
def text_to_embedding(text):
    words = text.split()
    embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(100)  # Handle empty strings

text_embeddings = np.array([text_to_embedding(text) for text in df['tags']])

print(text_embeddings.shape)

In [None]:
experimental_df.head()

In [None]:
numeric_df = experimental_df
numeric_df.drop(['id', 'tags', 'Whether joined the company or not'], axis=1, inplace=True)

In [None]:
numeric_arr = numeric_df.to_numpy()

In [None]:
text_tensor = torch.tensor(text_embeddings, dtype=torch.float32)
numeric_tensor = torch.tensor(numeric_arr, dtype=torch.float32)

In [None]:
text_tensor, text_tensor.shape

In [None]:
numeric_tensor, numeric_tensor.shape

In [None]:
final_tensor = torch.cat((text_tensor, numeric_tensor), dim=1)
print(final_tensor.shape)

## Data Splitting

In [None]:
train_size = int(0.8 * len(final_tensor))
test_size = len(final_tensor) - train_size

In [None]:
train_size

In [None]:
test_size

In [None]:
train_size + test_size

In [None]:
''' Dataset '''
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])