In [81]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Torch 
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

# Scikit-Learn
# Data Encoding and Scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

# Natural Language Processing(NLP)
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')      # For tokenization
nltk.download('stopwords')  # For stopword removal

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dushyant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
df = pd.read_csv('IntervieweeDataset.csv')

In [5]:
df.head(1)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Confidence based on Introduction (English),Confidence based on the topic given,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not\n
0,parida,25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Impactful - Good confidence throughout the Int...,Guarded Confidence - Confident in some areas a...,...,3,2,1,1.0,1.0,11,7,3,42,No


In [6]:
df.shape

(21256, 52)

In [7]:
df.dropna(inplace=True)

In [8]:
df.shape

(18681, 52)

In [9]:
df.drop(df.iloc[:,8:18], axis=1, inplace=True)

In [10]:
df.drop(['Does the candidate has mother tongue influence while speaking english.', 'How many slides candidate have submitted in PPT?'], axis=1, inplace=True)

In [11]:
df.shape

(18681, 40)

In [12]:
df.rename(columns={'Type of Graduation/Post Graduation':'Education', 'Mode of interview given by candidate?':'Mode of Interview', 'Has acquaintance in Company and has spoken to him/her before applying?':'Acquaintance and Referral'}, inplace=True)

In [13]:
df.rename(columns={'Whether joined the company or not\n': 'Whether joined the company or not', 'What was the type of Role?\t': 'What was the type of Role?'}, inplace=True)

In [14]:
df.rename(columns={"But, my child's exam are going on now, so we will keep the counselling session after the exams get over.(Time: Favourable pitch: Counsellor hype)":"But, my child's exam are going on now, so we will keep the counselling session after the exams get over"}, inplace=True)

In [15]:
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [16]:
df['Age'].unique()

array(['25', '29', '27', '22', '23', '24', '26', '21', '30', '20', '31',
       '28', '32+', '32'], dtype=object)

In [17]:
df['Age'] = df['Age'].astype(str).str.replace('+', '', regex=False).astype(int)

In [18]:
df['Age'].unique()

array([25, 29, 27, 22, 23, 24, 26, 21, 30, 20, 31, 28, 32])

In [19]:
bins = [18, 22, 25, 28, 32, 35, float('inf')]
labels = ['18-22', '23-25', '26-28', '29-32', '33-35', '35+']

In [20]:
df['Age'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

In [21]:
df.isnull().sum()

Name                                                                                                                       0
Age                                                                                                                        0
Gender                                                                                                                     0
Education                                                                                                                  0
Marital status                                                                                                             0
Mode of Interview                                                                                                          0
Pre Interview Check                                                                                                        0
Fluency in English based on introduction                                                                                   0


In [22]:
df['Age'].unique()

['23-25', '29-32', '26-28', '18-22']
Categories (6, object): ['18-22' < '23-25' < '26-28' < '29-32' < '33-35' < '35+']

In [23]:
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [24]:
new_df = df[['Confidence based on Introduction (English).1',
       'Confidence based on the topic given  .1',
       'Confidence Based on the PPT Question.1',
       'Confidence based on the sales scenario.1',
       'Structured Thinking (In regional only).1',
       'Structured Thinking Based on the PPT Question.1',
       'Structured Thinking( Call pitch).1',
       'Regional fluency based on the topic given  .1',
       'Regional fluency Based on the PPT Question.1',
       'Regional fluency based on the  sales scenario.1', 'Confidence Score',
       'Structured Thinking Score', 'Regional Fluency Score', 'Total Score',]]

In [25]:
new_df.head(1)

Unnamed: 0,Confidence based on Introduction (English).1,Confidence based on the topic given .1,Confidence Based on the PPT Question.1,Confidence based on the sales scenario.1,Structured Thinking (In regional only).1,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score
0,3,2,3,3,2,3,2,1,1.0,1.0,11,7,3,42


new_df.info()

In [26]:
df.shape

(18681, 40)

In [27]:
df['id'] = df.index

In [28]:
df.head()

Unnamed: 0,Name,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,...,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not,id
0,parida,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,...,2,1,1.0,1.0,11,7,3,42,No,0
1,shreej,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,...,3,3,3.0,3.0,12,9,9,60,No,1
2,ms6744,26-28,Female,B.E / B-Tech,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Yes,Fresher(only 2022 grad),...,3,3,1.0,3.0,10,9,7,52,Yes,2
3,aswalu,18-22,Male,B.E / B-Tech,Unmarried,Laptop,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Fresher(only 2022 grad),...,3,3,3.0,3.0,12,9,9,60,Yes,3
4,aniket,18-22,Male,BA/MA,Unmarried,Mobile,Proceed with the Interview,Taking gaps while speaking due to lack of cont...,No,Fresher(only 2022 grad),...,1,3,1.0,3.0,10,5,7,44,Not Joined,4


# New Dataframe for only Numerical Operations (excluding Name column from it)

In [29]:
numerical_df = df.copy()
numerical_df.drop(['id',
                   'Name',
                   'Comments',
                   'RedFlags Comments in Interview',
                   'Call-pitch Elements used during the call Sales Scenario',
                   "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
                  'Let me discuss it with my child',
                   "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",'Role Location to be given to the candidate'],
                  axis=1,inplace=True)

# Data Scaling
The StandardScaler from sklearn.preprocessing transforms data to have:

Mean = 0 ,
Standard Deviation = 1

In [30]:
scaler = StandardScaler()
scaler

In [31]:
temp_list = ['Confidence based on Introduction (English).1',
       'Confidence based on the topic given  .1',
       'Confidence Based on the PPT Question.1',
       'Confidence based on the sales scenario.1',
       'Structured Thinking (In regional only).1',
       'Structured Thinking Based on the PPT Question.1',
       'Structured Thinking( Call pitch).1',
       'Regional fluency based on the topic given  .1',
       'Regional fluency Based on the PPT Question.1',
       'Regional fluency based on the  sales scenario.1', 'Confidence Score',
       'Structured Thinking Score', 'Regional Fluency Score', 'Total Score']

In [32]:
numerical_df[temp_list] = scaler.fit_transform(numerical_df[temp_list])
print("Means after scaling:\n", numerical_df[temp_list].mean().round(5))
print("Standard deviations after scaling:\n", numerical_df[temp_list].std().round(5))

Means after scaling:
 Confidence based on Introduction (English).1      -0.0
Confidence based on the topic given  .1           -0.0
Confidence Based on the PPT Question.1            -0.0
Confidence based on the sales scenario.1          -0.0
Structured Thinking (In regional only).1           0.0
Structured Thinking Based on the PPT Question.1   -0.0
Structured Thinking( Call pitch).1                -0.0
Regional fluency based on the topic given  .1      0.0
Regional fluency Based on the PPT Question.1      -0.0
Regional fluency based on the  sales scenario.1   -0.0
Confidence Score                                   0.0
Structured Thinking Score                         -0.0
Regional Fluency Score                             0.0
Total Score                                        0.0
dtype: float64
Standard deviations after scaling:
 Confidence based on Introduction (English).1       1.00003
Confidence based on the topic given  .1            1.00003
Confidence Based on the PPT Question.1 

In [33]:
numerical_df.head()

Unnamed: 0,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,Last Fixed CTC (lakhs),...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not
0,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,5-5.99,...,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863,No
1,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,Fresher,...,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342,No
2,26-28,Female,B.E / B-Tech,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Yes,Fresher(only 2022 grad),Fresher,...,1.020654,1.006713,0.979581,-1.063625,1.060405,0.487893,1.256178,0.402709,0.750139,Yes
3,18-22,Male,B.E / B-Tech,Unmarried,Laptop,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Fresher(only 2022 grad),Fresher,...,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342,Yes
4,18-22,Male,BA/MA,Unmarried,Mobile,Proceed with the Interview,Taking gaps while speaking due to lack of cont...,No,Fresher(only 2022 grad),Fresher,...,-0.55603,-1.855535,0.979581,-1.063625,1.060405,0.487893,-1.300729,0.402709,-0.009063,Not Joined


In [34]:
numerical_df.shape

(18681, 32)

# Data Encoding

In [35]:
numerical_df['Role acceptance'].value_counts()

Role acceptance
Emphatic Yes                                        12456
Yes : Think and says yes.(Shows some hesitation)     5475
No                                                    750
Name: count, dtype: int64

In [36]:
transformer = ColumnTransformer(transformers = [
    ('t1', OneHotEncoder(sparse_output=False, drop='first'), ['Gender', 'Experienced Candidate (Nature of work)', 'What was the type of Role?', 'Whether joined the company or not', 'Currently Employed', 'Marital status', 'Mode of Interview', 'Pre Interview Check', 'Fluency in English based on introduction', 'Acquaintance and Referral', 'Candidate Status', 'Education']),
    ('t2', OrdinalEncoder(categories=[['Fresher','0-1.99','2-2.99','3-3.99','4-4.99','5-5.99','6-6.99','7+']]), ['Last Fixed CTC (lakhs) ']),
    ('t3', OrdinalEncoder(categories=[['No - Want Specific Centre Location Only','Yes - Anywhere Within a City','Yes - Anywhere Within a State','Yes - Anywhere in PAN India']]), ['Candidate is willing to relocate']),
    ('t4', OrdinalEncoder(categories=[['Reject','Borderline Reject','Borderline Select','Select','Premium Select']]), ['Interview Verdict']),
    ('t5', OrdinalEncoder(categories=[['No','Yes : Think and says yes.(Shows some hesitation)','Emphatic Yes']]), ['Role acceptance']),
    ('t6', OrdinalEncoder(categories=[['Fresher(<6 months)','6-11.99 Months','12-17.99 Months','18-23.99 Months','24-29.99 Months','30-35.99 Months','36-47.99 Months','48+ Months']]), ['Experienced candidate - (Experience in months)']),
    ('t7', OrdinalEncoder(categories=[['18-22', '23-25', '26-28', '29-32', '33-35', '35+']]), ['Age'])
], remainder='passthrough')

In [37]:
numerical_df = transformer.fit_transform(numerical_df)

In [38]:
type(numerical_df)

numpy.ndarray

In [39]:
features_names = transformer.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [40]:
numerical_df.shape

(18681, 1900)

In [41]:
numerical_df.head(2)

Unnamed: 0,t1__Gender_Male,t1__Experienced Candidate (Nature of work)_ DISPATCHER IN US COMPANY,t1__Experienced Candidate (Nature of work)_ Laboratory Attendant,t1__Experienced Candidate (Nature of work)_ Network Engineer,t1__Experienced Candidate (Nature of work)_ no,t1__Experienced Candidate (Nature of work)_ proctor,t1__Experienced Candidate (Nature of work)_-,t1__Experienced Candidate (Nature of work)_.,t1__Experienced Candidate (Nature of work)_/,t1__Experienced Candidate (Nature of work)_10 lakhs lone,...,remainder__Structured Thinking (In regional only).1,remainder__Structured Thinking Based on the PPT Question.1,remainder__Structured Thinking( Call pitch).1,remainder__Regional fluency based on the topic given .1,remainder__Regional fluency Based on the PPT Question.1,remainder__Regional fluency based on the sales scenario.1,remainder__Confidence Score,remainder__Structured Thinking Score,remainder__Regional Fluency Score,remainder__Total Score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.618598,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.985514,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342


# Principal Component Analysis(PCA) for Dimensionality Reduction
**Required to reduce the number of features**

In [42]:
pca = PCA(n_components=0.95)
numerical_df = pca.fit_transform(numerical_df)

In [43]:
type(numerical_df)

numpy.ndarray

In [44]:
 features_names = pca.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [45]:
type(numerical_df)

pandas.core.frame.DataFrame

In [46]:
numerical_df.shape

(18681, 23)

In [47]:
numerical_df.head()

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21,pca22
0,-0.052781,1.82304,-1.000861,-1.640338,2.153899,-0.90488,-3.322327,2.383088,1.147654,1.216667,...,-1.002317,-0.186995,1.102233,0.585393,0.070028,-0.435301,0.148319,-0.219776,-0.66465,-0.621672
1,2.923109,-2.27258,1.616185,0.522408,1.446041,1.329056,-1.950287,-1.941437,2.152077,0.779589,...,-0.057405,-0.829778,0.279894,-0.026051,0.470935,0.000236,-0.025792,0.042607,-0.017964,-0.339394
2,2.164853,-2.073269,-1.094714,-0.267596,-0.98576,0.299652,1.724536,-0.45322,1.248881,0.539528,...,-1.112322,0.481271,-0.121442,0.54224,0.17653,-0.313344,-0.809657,0.663573,0.696053,-0.599852
3,3.906563,-3.405313,-0.106782,1.147585,0.3109,-0.251753,0.245191,0.203074,-0.415105,-0.740198,...,-0.019635,0.724503,-0.090785,-0.022621,0.262717,-0.189576,-0.089993,-0.359328,0.102563,0.083939
4,-0.944235,-1.526629,1.590621,-1.558568,0.460823,-1.567873,1.634695,-0.184716,-0.024793,-0.146011,...,-1.064223,-0.125709,0.136476,0.183655,-0.484256,0.297118,-0.582925,-0.026695,-0.152585,0.173361


In [48]:
numerical_df.isnull().sum()

pca0     0
pca1     0
pca2     0
pca3     0
pca4     0
pca5     0
pca6     0
pca7     0
pca8     0
pca9     0
pca10    0
pca11    0
pca12    0
pca13    0
pca14    0
pca15    0
pca16    0
pca17    0
pca18    0
pca19    0
pca20    0
pca21    0
pca22    0
dtype: int64

In [49]:
df = df[['id',
                   'Name',
                   'Comments',
                   'RedFlags Comments in Interview',
                   'Call-pitch Elements used during the call Sales Scenario',
                   "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
                  'Let me discuss it with my child',
                   "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",'Role Location to be given to the candidate']]

In [50]:
df.head()

Unnamed: 0,id,Name,Comments,RedFlags Comments in Interview,Call-pitch Elements used during the call Sales Scenario,"But, my child's exam are going on now, so we will keep the counselling session after the exams get over",Let me discuss it with my child,Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.,Role Location to be given to the candidate
0,0,parida,"Lipsa is 25 female from Orissa, Family BG - Fa...",At least Graduated ( not 12th Pass or diploma ...,Purpose of Call (Book a Counselling Session),Urgency using Time,None of the above,None of the above,Bangalore
1,1,shreej,29 yo / female / unmarried / MSc Finance UK 20...,Not Rehire (CDT have not joined byjus in sales...,"Introduction (Self Intro,Company Name), Purpos...",Asking Questions,None of the above,Non chargeable session,Delhi
2,2,ms6744,nm,Not working currently and ready to join the R3...,"Introduction (Self Intro,Company Name)",Urgency using Time,Decision Making: Major decision of child futur...,Non chargeable session,Chennai
3,3,aswalu,Final Interview Done 22YRS //UTTARAKHAND // F...,Not Rehire (CDT have not joined byjus in sales...,"Introduction (Self Intro,Company Name), Purpos...","Urgency using Time, Urgency using situation",None of the above,Value creation(Nothing comes for free),Delhi
4,4,aniket,"Aniket is 22 male from Bhopal, BA Grad 21, Fam...","Age is below 32.11, At least Graduated ( not 1...",None of the Above,Urgency by creating counsellor Hype,Anticipation of objection from child's end:,Value creation(Nothing comes for free),Bangalore


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18681 entries, 0 to 21255
Data columns (total 9 columns):
 #   Column                                                                                                                   Non-Null Count  Dtype 
---  ------                                                                                                                   --------------  ----- 
 0   id                                                                                                                       18681 non-null  int64 
 1   Name                                                                                                                     18681 non-null  object
 2   Comments                                                                                                                 18681 non-null  object
 3   RedFlags Comments in Interview                                                                                           18681 non-null  object
 4 

In [52]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(lambda x:x.split())

In [53]:
df.head()

Unnamed: 0,id,Name,Comments,RedFlags Comments in Interview,Call-pitch Elements used during the call Sales Scenario,"But, my child's exam are going on now, so we will keep the counselling session after the exams get over",Let me discuss it with my child,Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.,Role Location to be given to the candidate
0,0,[parida],"[Lipsa, is, 25, female, from, Orissa,, Family,...","[At, least, Graduated, (, not, 12th, Pass, or,...","[Purpose, of, Call, (Book, a, Counselling, Ses...","[Urgency, using, Time]","[None, of, the, above]","[None, of, the, above]",[Bangalore]
1,1,[shreej],"[29, yo, /, female, /, unmarried, /, MSc, Fina...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Asking, Questions]","[None, of, the, above]","[Non, chargeable, session]",[Delhi]
2,2,[ms6744],[nm],"[Not, working, currently, and, ready, to, join...","[Introduction, (Self, Intro,Company, Name)]","[Urgency, using, Time]","[Decision, Making:, Major, decision, of, child...","[Non, chargeable, session]",[Chennai]
3,3,[aswalu],"[Final, Interview, Done, 22YRS, //UTTARAKHAND,...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Urgency, using, Time,, Urgency, using, situat...","[None, of, the, above]","[Value, creation(Nothing, comes, for, free)]",[Delhi]
4,4,[aniket],"[Aniket, is, 22, male, from, Bhopal,, BA, Grad...","[Age, is, below, 32.11,, At, least, Graduated,...","[None, of, the, Above]","[Urgency, by, creating, counsellor, Hype]","[Anticipation, of, objection, from, child's, e...","[Value, creation(Nothing, comes, for, free)]",[Bangalore]


In [54]:
df.columns

Index(['id', 'Name', 'Comments', 'RedFlags Comments in Interview',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role Location to be given to the candidate'],
      dtype='object')

In [55]:
df['tags'] = df['Name'] + df['Comments'] + df['RedFlags Comments in Interview'] + df['Call-pitch Elements used during the call Sales Scenario'] + df["But, my child's exam are going on now, so we will keep the counselling session after the exams get over"] + df['Let me discuss it with my child'] + df["Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app."] + df['Role Location to be given to the candidate']

In [56]:
df.head()

Unnamed: 0,id,Name,Comments,RedFlags Comments in Interview,Call-pitch Elements used during the call Sales Scenario,"But, my child's exam are going on now, so we will keep the counselling session after the exams get over",Let me discuss it with my child,Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.,Role Location to be given to the candidate,tags
0,0,[parida],"[Lipsa, is, 25, female, from, Orissa,, Family,...","[At, least, Graduated, (, not, 12th, Pass, or,...","[Purpose, of, Call, (Book, a, Counselling, Ses...","[Urgency, using, Time]","[None, of, the, above]","[None, of, the, above]",[Bangalore],"[parida, Lipsa, is, 25, female, from, Orissa,,..."
1,1,[shreej],"[29, yo, /, female, /, unmarried, /, MSc, Fina...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Asking, Questions]","[None, of, the, above]","[Non, chargeable, session]",[Delhi],"[shreej, 29, yo, /, female, /, unmarried, /, M..."
2,2,[ms6744],[nm],"[Not, working, currently, and, ready, to, join...","[Introduction, (Self, Intro,Company, Name)]","[Urgency, using, Time]","[Decision, Making:, Major, decision, of, child...","[Non, chargeable, session]",[Chennai],"[ms6744, nm, Not, working, currently, and, rea..."
3,3,[aswalu],"[Final, Interview, Done, 22YRS, //UTTARAKHAND,...","[Not, Rehire, (CDT, have, not, joined, byjus, ...","[Introduction, (Self, Intro,Company, Name),, P...","[Urgency, using, Time,, Urgency, using, situat...","[None, of, the, above]","[Value, creation(Nothing, comes, for, free)]",[Delhi],"[aswalu, Final, Interview, Done, 22YRS, //UTTA..."
4,4,[aniket],"[Aniket, is, 22, male, from, Bhopal,, BA, Grad...","[Age, is, below, 32.11,, At, least, Graduated,...","[None, of, the, Above]","[Urgency, by, creating, counsellor, Hype]","[Anticipation, of, objection, from, child's, e...","[Value, creation(Nothing, comes, for, free)]",[Bangalore],"[aniket, Aniket, is, 22, male, from, Bhopal,, ..."


In [57]:
df.loc[1,'tags']

['shreej',
 '29',
 'yo',
 '/',
 'female',
 '/',
 'unmarried',
 '/',
 'MSc',
 'Finance',
 'UK',
 '2022',
 '/',
 'recently',
 'relocated',
 'to',
 'India',
 'just',
 '20',
 'days',
 'ago',
 '/',
 'Odissa',
 '/',
 'father',
 '-',
 'doctor',
 '/',
 'bro',
 '-',
 'software',
 'engineer',
 '/',
 'pursuing',
 'internship',
 'in',
 'investment',
 'banking',
 '-',
 'ends',
 'in',
 'next',
 'week',
 '/',
 'worked',
 'as',
 'a',
 'teacher',
 '/',
 'good',
 'story',
 'teller',
 '-',
 'fluent',
 'in',
 'hindi',
 'and',
 'english',
 '-',
 'good',
 'energy',
 '/',
 'also',
 'prepared',
 'for',
 'UPSC',
 'and',
 'GMAT',
 '-',
 'purchased',
 'a',
 'course',
 'from',
 "byju's",
 'earlier',
 '/',
 'cracked',
 'GMAT',
 'and',
 'got',
 'business',
 'school',
 'as',
 'well/asking',
 'ques',
 'in',
 'sales',
 'scenario',
 '-',
 '2/5',
 'Not',
 'Rehire',
 '(CDT',
 'have',
 'not',
 'joined',
 'byjus',
 'in',
 'sales',
 'before),',
 'Not',
 'Interviewed',
 'in',
 'Byjus',
 'in',
 'sales',
 'role',
 'in',
 'last

In [58]:
df = df[['id','Name','tags']]

In [59]:
df.head(1)

Unnamed: 0,id,Name,tags
0,0,[parida],"[parida, Lipsa, is, 25, female, from, Orissa,,..."


## Stemming
**Note: Use Lemmatization for more accuracy**

To normalize words and reduce them to their root forms, we will apply **stemming**. This helps in handling variations of words and improves text processing efficiency for machine learning models.  
(e.g., "running" → "run")

In [60]:
ps = PorterStemmer()
ps

<PorterStemmer>

In [61]:
stop_words = set(stopwords.words('english'))

In [62]:
def stem(text):
    y = []

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)    

In [63]:
''' Applying Stemming '''

df['tags'] = df['tags'].apply(stem)

In [64]:
df.head()

Unnamed: 0,id,Name,tags
0,0,[parida],"parida lipsa is 25 femal from orissa, famili b..."
1,1,[shreej],shreej 29 yo / femal / unmarri / msc financ uk...
2,2,[ms6744],ms6744 nm not work current and readi to join t...
3,3,[aswalu],aswalu final interview done 22yr //uttarakhand...
4,4,[aniket],"aniket aniket is 22 male from bhopal, ba grad ..."


In [65]:
df.loc[1,'tags']

"shreej 29 yo / femal / unmarri / msc financ uk 2022 / recent reloc to india just 20 day ago / odissa / father - doctor / bro - softwar engin / pursu internship in invest bank - end in next week / work as a teacher / good stori teller - fluent in hindi and english - good energi / also prepar for upsc and gmat - purchas a cours from byju' earlier / crack gmat and got busi school as well/ask que in sale scenario - 2/5 not rehir (cdt have not join byju in sale before), not interview in byju in sale role in last 90days, laptop and wifi are available, age is below 32.11, at least graduat ( not 12th pass or diploma or final year student), will to reloc at given locat for ssp, comfort with the stipend & allow dure training, not work current and readi to join the r3 process introduct (self intro,compani name), purpos of call (book a counsel session), need gener - by ask que like student class, perform etc.. ask question none of the abov non chargeabl session delhi"

In [66]:
df.isnull().sum()

id      0
Name    0
tags    0
dtype: int64

In [67]:
def stopwords_removal(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [68]:
''' Applying Stopwords Removal '''

df['tags'] = df['tags'].apply(stopwords_removal)

In [69]:
df.head(2)

Unnamed: 0,id,Name,tags
0,0,[parida],"parida lipsa 25 femal orissa , famili bg - fat..."
1,1,[shreej],shreej 29 yo / femal / unmarri / msc financ uk...


In [70]:
df.loc[1,'tags']

"shreej 29 yo / femal / unmarri / msc financ uk 2022 / recent reloc india 20 day ago / odissa / father - doctor / bro - softwar engin / pursu internship invest bank - end next week / work teacher / good stori teller - fluent hindi english - good energi / also prepar upsc gmat - purchas cours byju ' earlier / crack gmat got busi school well/ask que sale scenario - 2/5 rehir ( cdt join byju sale ) , interview byju sale role last 90days , laptop wifi available , age 32.11 , least graduat ( 12th pass diploma final year student ) , reloc given locat ssp , comfort stipend & allow dure training , work current readi join r3 process introduct ( self intro , compani name ) , purpos call ( book counsel session ) , need gener - ask que like student class , perform etc .. ask question none abov non chargeabl session delhi"

In [71]:
def remove_duplicates(text):
    words = text.split()
    seen = set()
    unique_words = []

    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)

    return " ".join(unique_words)        

In [72]:
df['tags'] = df['tags'].apply(remove_duplicates)

In [73]:
df['tags'] = df['tags'].str.replace('/', '', regex=False)

In [74]:
df.loc[1,'tags']

"shreej 29 yo  femal unmarri msc financ uk 2022 recent reloc india 20 day ago odissa father - doctor bro softwar engin pursu internship invest bank end next week work teacher good stori teller fluent hindi english energi also prepar upsc gmat purchas cours byju ' earlier crack got busi school wellask que sale scenario 25 rehir ( cdt join ) , interview role last 90days laptop wifi available age 32.11 least graduat 12th pass diploma final year student given locat ssp comfort stipend & allow dure training current readi r3 process introduct self intro compani name purpos call book counsel session need gener ask like class perform etc .. question none abov non chargeabl delhi"

In [75]:
df = df.join(numerical_df, how='inner')

In [76]:
df.shape

(16386, 26)

In [77]:
df.head(30)

Unnamed: 0,id,Name,tags,pca0,pca1,pca2,pca3,pca4,pca5,pca6,...,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21,pca22
0,0,[parida],"parida lipsa 25 femal orissa , famili bg - fat...",-0.052781,1.82304,-1.000861,-1.640338,2.153899,-0.90488,-3.322327,...,-1.002317,-0.186995,1.102233,0.585393,0.070028,-0.435301,0.148319,-0.219776,-0.66465,-0.621672
1,1,[shreej],shreej 29 yo femal unmarri msc financ uk 2022...,2.923109,-2.27258,1.616185,0.522408,1.446041,1.329056,-1.950287,...,-0.057405,-0.829778,0.279894,-0.026051,0.470935,0.000236,-0.025792,0.042607,-0.017964,-0.339394
2,2,[ms6744],ms6744 nm work current readi join r3 process i...,2.164853,-2.073269,-1.094714,-0.267596,-0.98576,0.299652,1.724536,...,-1.112322,0.481271,-0.121442,0.54224,0.17653,-0.313344,-0.809657,0.663573,0.696053,-0.599852
3,3,[aswalu],aswalu final interview done 22yr uttarakhand ...,3.906563,-3.405313,-0.106782,1.147585,0.3109,-0.251753,0.245191,...,-0.019635,0.724503,-0.090785,-0.022621,0.262717,-0.189576,-0.089993,-0.359328,0.102563,0.083939
4,4,[aniket],"aniket 22 male bhopal , ba grad 21 famili bg -...",-0.944235,-1.526629,1.590621,-1.558568,0.460823,-1.567873,1.634695,...,-1.064223,-0.125709,0.136476,0.183655,-0.484256,0.297118,-0.582925,-0.026695,-0.152585,0.173361
5,5,[faizal],faizal hd =25 ba - currenlti mba distanc say ...,-5.579654,1.342818,1.802116,-1.125968,-0.23168,-1.348032,0.860554,...,-0.929862,-0.408741,-0.051392,0.069074,0.323925,-1.018994,-0.63174,-0.394689,-0.914739,0.016413
6,6,[ravatn],ravatn bed 2021 bsc 2019 bglr - dono place it...,0.357986,-1.874188,0.130395,1.832301,-0.384434,-2.59292,-0.195161,...,1.08893,-0.580183,0.476081,-0.588453,-0.091987,-0.341109,-0.51278,0.039848,-0.247332,-0.548941
7,7,[gornal],gornal m.tech - 2022- fresher dad- farmer mom ...,-4.107792,-0.312133,-0.042394,0.061637,0.591363,-0.86136,-1.175065,...,0.377448,0.421239,-0.386704,0.76121,0.204665,-0.886725,0.201377,-0.709418,0.223291,0.346213
8,8,[Upkarr],"upkarr intro - eng comm guard gap , littl mti ...",-4.127144,-0.148558,0.721339,-0.615416,0.788534,0.103495,-0.389031,...,-0.334093,-0.089614,-0.952717,-0.831651,0.820648,0.396035,0.249185,0.217918,-0.493086,-0.559958
9,9,[moizju],moizju 25-bcom -2019- 1 yr exp sale -3.5lpa hy...,1.064749,1.544065,-1.415297,0.295471,0.678079,0.813304,1.556018,...,0.043823,-0.319144,-0.313713,0.065939,0.110292,0.432163,0.177648,-0.755487,0.143091,0.323624


In [78]:
df['pca0'].unique()

array([-0.05278134,  2.92310863,  2.16485291, ...,  6.97187506,
        4.13991125,  1.47508752], shape=(16144,))

In [79]:
df.isnull().sum()

id       0
Name     0
tags     0
pca0     0
pca1     0
pca2     0
pca3     0
pca4     0
pca5     0
pca6     0
pca7     0
pca8     0
pca9     0
pca10    0
pca11    0
pca12    0
pca13    0
pca14    0
pca15    0
pca16    0
pca17    0
pca18    0
pca19    0
pca20    0
pca21    0
pca22    0
dtype: int64

<!-- ## Word Embeddings
**Note: Use Contextual Embeddings for More Accuracy**

To represent words in a numerical format while preserving their meaning and relationships, we will apply **word embeddings**. This helps in capturing semantic similarities and improving machine learning model performance.
(e.g., "king" → similar to "queen" but different from "apple").

['stemming', 'reduces', 'words', 'to', 'root']  
[34402, 10838, 2616, 2000, 7771] -->

## Hugging Face Transformers

In [87]:
neural_df = df.copy()
neural_df.drop('Name', axis=1, inplace=True)
neural_df.head(1)

Unnamed: 0,id,tags,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,...,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21,pca22
0,0,"parida lipsa 25 femal orissa , famili bg - fat...",-0.052781,1.82304,-1.000861,-1.640338,2.153899,-0.90488,-3.322327,2.383088,...,-1.002317,-0.186995,1.102233,0.585393,0.070028,-0.435301,0.148319,-0.219776,-0.66465,-0.621672


In [84]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [88]:
def get_text_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1)
    
    return embeddings.squeeze().numpy()    

In [None]:
neural_df['tags'] = neural_df['tags'].apply(get_text_embedding)