In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Torch
import torch
import torch.nn as nn

# Scikit-Learn
# Data Encoding and Scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

# Natural Language Processing (NLP)
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [3]:
# Load the interviewee dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('IntervieweeDataset.csv')

In [4]:
# Peek at a single row to see the column layout.
df.head(1)

Unnamed: 0,Name,Age,Gender,Type of Graduation/Post Graduation,Marital status,Mode of interview given by candidate?,Pre Interview Check,Fluency in English based on introduction,Confidence based on Introduction (English),Confidence based on the topic given,...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not\n
0,parida,25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Impactful - Good confidence throughout the Int...,Guarded Confidence - Confident in some areas a...,...,3,2,1,1.0,1.0,11,7,3,42,No


In [5]:
# (rows, columns) of the raw file, before any cleaning.
df.shape

(21256, 52)

In [6]:
# Discard every row that has a missing value in any column (modifies df in place).
df.dropna(axis=0, how='any', inplace=True)

In [7]:
# Shape after dropping the rows with missing values.
df.shape

(18681, 52)

In [8]:
# Remove the ten columns at positions 8..17 (modifies df in place).
# The labels are taken from df.columns directly; passing a DataFrame slice as
# `labels` only worked because iterating a DataFrame yields its column names.
df.drop(columns=df.columns[8:18], inplace=True)

In [9]:
# Drop two further columns by name (modifies df in place).
df.drop(
    columns=[
        'Does the candidate has mother tongue influence while speaking english.',
        'How many slides candidate have submitted in PPT?',
    ],
    inplace=True,
)

In [10]:
# Shape after removing the twelve columns dropped above (52 -> 40).
df.shape

(18681, 40)

In [11]:
# Replace three long question-style headers with short names (modifies df in place).
df.rename(
    columns={
        'Type of Graduation/Post Graduation': 'Education',
        'Mode of interview given by candidate?': 'Mode of Interview',
        'Has acquaintance in Company and has spoken to him/her before applying?': 'Acquaintance and Referral',
    },
    inplace=True,
)

In [12]:
# Strip the stray trailing newline / tab characters from two headers (modifies df in place).
df.rename(
    columns={
        'Whether joined the company or not\n': 'Whether joined the company or not',
        'What was the type of Role?\t': 'What was the type of Role?',
    },
    inplace=True,
)

In [13]:
# Drop the parenthesised suffix from this sales-scenario header (modifies df in place).
df.rename(
    columns={
        "But, my child's exam are going on now, so we will keep the counselling session after the exams get over.(Time: Favourable pitch: Counsellor hype)":
            "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
    },
    inplace=True,
)

In [14]:
# List the column labels as they stand after the renames.
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [15]:
# Inspect the raw Age values: they are strings and include the open-ended '32+'.
df['Age'].unique()

array(['25', '29', '27', '22', '23', '24', '26', '21', '30', '20', '31',
       '28', '32+', '32'], dtype=object)

In [16]:
# Remove the literal '+' (as in '32+') so every age can be parsed as an integer.
df['Age'] = (
    df['Age']
    .astype(str)
    .str.replace('+', '', regex=False)
    .astype(int)
)

In [17]:
# Confirm Age now holds integers only.
df['Age'].unique()

array([25, 29, 27, 22, 23, 24, 26, 21, 30, 20, 31, 28, 32])

In [18]:
# Edges of the age brackets; the final bracket is open-ended.
bins = [
    18, 22, 25, 28, 32, 35,
    float('inf'),
]
# One label per consecutive pair of edges.
labels = [
    '18-22',
    '23-25',
    '26-28',
    '29-32',
    '33-35',
    '35+',
]

In [19]:
# Replace the numeric age with its bracket label.
# Intervals are right-closed, e.g. (22, 25]; include_lowest=True also closes the
# first interval on the left so that an age of exactly 18 lands in '18-22'
# instead of silently becoming NaN.
df['Age'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

In [20]:
# Count missing values per column (binning would have produced NaN for out-of-range ages).
df.isna().sum()

Name                                                                                                                       0
Age                                                                                                                        0
Gender                                                                                                                     0
Education                                                                                                                  0
Marital status                                                                                                             0
Mode of Interview                                                                                                          0
Pre Interview Check                                                                                                        0
Fluency in English based on introduction                                                                                   0


In [21]:
# See which age brackets actually occur after binning.
df['Age'].unique()

['23-25', '29-32', '26-28', '18-22']
Categories (6, object): ['18-22' < '23-25' < '26-28' < '29-32' < '33-35' < '35+']

In [22]:
# Re-list the columns; the exact labels (including doubled spaces) are needed for the selection below.
df.columns

Index(['Name', 'Age', 'Gender', 'Education', 'Marital status',
       'Mode of Interview', 'Pre Interview Check',
       'Fluency in English based on introduction', 'Acquaintance and Referral',
       'Candidate Status', 'Last Fixed CTC (lakhs) ', 'Currently Employed',
       'Experienced candidate - (Experience in months)',
       'Experienced Candidate (Nature of work)', 'What was the type of Role?',
       'Call-pitch Elements used during the call Sales Scenario',
       'But, my child's exam are going on now, so we will keep the counselling session after the exams get over',
       'Let me discuss it with my child',
       'Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.',
       'Role acceptance', 'Interview Verdict',
       'Candidate is willing to relocate',
       'Role Location to be given to the candidate', 'Comments',
       'RedFlags Comments in Interview',
       'Confidence based on Introduction (Engl

In [23]:
# Score-only view of the data: the ten per-question scores followed by the
# four aggregate scores. Labels are copied verbatim, doubled spaces included.
new_df = df[[
    'Confidence based on Introduction (English).1',
    'Confidence based on the topic given  .1',
    'Confidence Based on the PPT Question.1',
    'Confidence based on the sales scenario.1',
    'Structured Thinking (In regional only).1',
    'Structured Thinking Based on the PPT Question.1',
    'Structured Thinking( Call pitch).1',
    'Regional fluency based on the topic given  .1',
    'Regional fluency Based on the PPT Question.1',
    'Regional fluency based on the  sales scenario.1',
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]]

In [24]:
# Preview the first row of the score-only frame.
new_df.head(1)

Unnamed: 0,Confidence based on Introduction (English).1,Confidence based on the topic given .1,Confidence Based on the PPT Question.1,Confidence based on the sales scenario.1,Structured Thinking (In regional only).1,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score
0,3,2,3,3,2,3,2,1,1.0,1.0,11,7,3,42


new_df.info()

In [25]:
# df itself is unaffected by the selection into new_df.
df.shape

(18681, 40)

In [26]:
# Keep each row's index label as an explicit column.
# NOTE: the index was not reset after dropna, so these ids are the labels from
# the loaded file and may have gaps rather than running 0..len(df)-1.
df['id'] = df.index

In [27]:
# Preview df with the binned Age and the new id column.
df.head()

Unnamed: 0,Name,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,...,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not,id
0,parida,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,...,2,1,1.0,1.0,11,7,3,42,No,0
1,shreej,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,...,3,3,3.0,3.0,12,9,9,60,No,1
2,ms6744,26-28,Female,B.E / B-Tech,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Yes,Fresher(only 2022 grad),...,3,3,1.0,3.0,10,9,7,52,Yes,2
3,aswalu,18-22,Male,B.E / B-Tech,Unmarried,Laptop,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Fresher(only 2022 grad),...,3,3,3.0,3.0,12,9,9,60,Yes,3
4,aniket,18-22,Male,BA/MA,Unmarried,Mobile,Proceed with the Interview,Taking gaps while speaking due to lack of cont...,No,Fresher(only 2022 grad),...,1,3,1.0,3.0,10,5,7,44,Not Joined,4


# New DataFrame for numerical operations only (excludes the id, Name, comment and free-text response columns)

In [28]:
# Work on an independent copy of df, without the identifier, name, comment and
# sales-scenario response columns listed below.
numerical_df = df.copy().drop(
    columns=[
        'id',
        'Name',
        'Comments',
        'RedFlags Comments in Interview',
        'Call-pitch Elements used during the call Sales Scenario',
        "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
        'Let me discuss it with my child',
        "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",
        'Role Location to be given to the candidate',
    ],
)

# Data Scaling
`StandardScaler` from `sklearn.preprocessing` rescales each column so that it has:

- Mean = 0
- Standard deviation = 1

In [29]:
# Standardiser with default settings; it is fitted on the score columns further down.
scaler = StandardScaler()
scaler  # display the estimator

In [30]:
# Columns to standardise. Labels are copied verbatim, doubled spaces included.
temp_list = [
    # per-question confidence scores
    'Confidence based on Introduction (English).1',
    'Confidence based on the topic given  .1',
    'Confidence Based on the PPT Question.1',
    'Confidence based on the sales scenario.1',
    # per-question structured-thinking scores
    'Structured Thinking (In regional only).1',
    'Structured Thinking Based on the PPT Question.1',
    'Structured Thinking( Call pitch).1',
    # per-question regional-fluency scores
    'Regional fluency based on the topic given  .1',
    'Regional fluency Based on the PPT Question.1',
    'Regional fluency based on the  sales scenario.1',
    # aggregate scores
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]

In [31]:
# Standardise the score columns in place of their raw values.
numerical_df[temp_list] = scaler.fit_transform(numerical_df[temp_list])

# Sanity check. pandas' .std() uses the sample estimator (ddof=1) whereas
# StandardScaler divides by the population standard deviation (ddof=0), so the
# printed deviations come out marginally above 1 rather than exactly 1.
scaled_means = numerical_df[temp_list].mean().round(5)
scaled_stds = numerical_df[temp_list].std().round(5)
print("Means after scaling:\n", scaled_means)
print("Standard deviations after scaling:\n", scaled_stds)

Means after scaling:
 Confidence based on Introduction (English).1      -0.0
Confidence based on the topic given  .1           -0.0
Confidence Based on the PPT Question.1            -0.0
Confidence based on the sales scenario.1          -0.0
Structured Thinking (In regional only).1           0.0
Structured Thinking Based on the PPT Question.1   -0.0
Structured Thinking( Call pitch).1                -0.0
Regional fluency based on the topic given  .1      0.0
Regional fluency Based on the PPT Question.1      -0.0
Regional fluency based on the  sales scenario.1   -0.0
Confidence Score                                   0.0
Structured Thinking Score                         -0.0
Regional Fluency Score                             0.0
Total Score                                        0.0
dtype: float64
Standard deviations after scaling:
 Confidence based on Introduction (English).1       1.00003
Confidence based on the topic given  .1            1.00003
Confidence Based on the PPT Question.1 

In [32]:
# Preview: score columns are now standardised, categorical columns are still text.
numerical_df.head()

Unnamed: 0,Age,Gender,Education,Marital status,Mode of Interview,Pre Interview Check,Fluency in English based on introduction,Acquaintance and Referral,Candidate Status,Last Fixed CTC (lakhs),...,Structured Thinking Based on the PPT Question.1,Structured Thinking( Call pitch).1,Regional fluency based on the topic given .1,Regional fluency Based on the PPT Question.1,Regional fluency based on the sales scenario.1,Confidence Score,Structured Thinking Score,Regional Fluency Score,Total Score,Whether joined the company or not
0,23-25,Female,Masters in data science,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Experienced in non client facing(equal to or m...,5-5.99,...,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863,No
1,29-32,Female,BSc or MSc,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Lateral(2021 and before with (less than 6 mont...,Fresher,...,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342,No
2,26-28,Female,B.E / B-Tech,Unmarried,Mobile,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,Yes,Fresher(only 2022 grad),Fresher,...,1.020654,1.006713,0.979581,-1.063625,1.060405,0.487893,1.256178,0.402709,0.750139,Yes
3,18-22,Male,B.E / B-Tech,Unmarried,Laptop,Proceed with the Interview,Able to speak sentences in a clear/coherent wa...,No,Fresher(only 2022 grad),Fresher,...,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342,Yes
4,18-22,Male,BA/MA,Unmarried,Mobile,Proceed with the Interview,Taking gaps while speaking due to lack of cont...,No,Fresher(only 2022 grad),Fresher,...,-0.55603,-1.855535,0.979581,-1.063625,1.060405,0.487893,-1.300729,0.402709,-0.009063,Not Joined


In [33]:
# Shape before encoding.
numerical_df.shape

(18681, 32)

# Data Encoding

In [34]:
# Inspect the distinct answers and their frequencies; the ordinal encoder below
# must list every one of these values in its category order.
numerical_df['Role acceptance'].value_counts()

Role acceptance
Emphatic Yes                                        12456
Yes : Think and says yes.(Shows some hesitation)     5475
No                                                    750
Name: count, dtype: int64

In [35]:
# Encoding plan (the step names t1..t7 prefix the generated feature names):
#   t1      -> one-hot encoding, dropping the first level of each column
#   t2..t7  -> ordinal codes following the explicit order given for each column
# Columns not listed in any step are passed through unchanged.
# NOTE(review): 'Experienced Candidate (Nature of work)' looks like free text, and
# 'Whether joined the company or not' looks like the outcome column - confirm that
# one-hot encoding both alongside the features is intended.
transformer = ColumnTransformer(
    transformers=[
        (
            't1',
            OneHotEncoder(sparse_output=False, drop='first'),
            [
                'Gender',
                'Experienced Candidate (Nature of work)',
                'What was the type of Role?',
                'Whether joined the company or not',
                'Currently Employed',
                'Marital status',
                'Mode of Interview',
                'Pre Interview Check',
                'Fluency in English based on introduction',
                'Acquaintance and Referral',
                'Candidate Status',
                'Education',
            ],
        ),
        (
            't2',
            OrdinalEncoder(categories=[[
                'Fresher', '0-1.99', '2-2.99', '3-3.99',
                '4-4.99', '5-5.99', '6-6.99', '7+',
            ]]),
            ['Last Fixed CTC (lakhs) '],
        ),
        (
            't3',
            OrdinalEncoder(categories=[[
                'No - Want Specific Centre Location Only',
                'Yes - Anywhere Within a City',
                'Yes - Anywhere Within a State',
                'Yes - Anywhere in PAN India',
            ]]),
            ['Candidate is willing to relocate'],
        ),
        (
            't4',
            OrdinalEncoder(categories=[[
                'Reject', 'Borderline Reject', 'Borderline Select', 'Select', 'Premium Select',
            ]]),
            ['Interview Verdict'],
        ),
        (
            't5',
            OrdinalEncoder(categories=[[
                'No',
                'Yes : Think and says yes.(Shows some hesitation)',
                'Emphatic Yes',
            ]]),
            ['Role acceptance'],
        ),
        (
            't6',
            OrdinalEncoder(categories=[[
                'Fresher(<6 months)',
                '6-11.99 Months',
                '12-17.99 Months',
                '18-23.99 Months',
                '24-29.99 Months',
                '30-35.99 Months',
                '36-47.99 Months',
                '48+ Months',
            ]]),
            ['Experienced candidate - (Experience in months)'],
        ),
        (
            't7',
            OrdinalEncoder(categories=[['18-22', '23-25', '26-28', '29-32', '33-35', '35+']]),
            ['Age'],
        ),
    ],
    remainder='passthrough',
)

In [36]:
# Fit the encoders and encode in one pass.
# NOTE: the name numerical_df is rebound to the transformer's output here; it is
# no longer a DataFrame until it is rebuilt with column names further down.
numerical_df = transformer.fit_transform(numerical_df)

In [37]:
# Check what kind of object fit_transform returned.
type(numerical_df)

numpy.ndarray

In [38]:
# Rebuild a labelled DataFrame from the encoded array, using the names the
# transformer generates (prefixed with the step name, e.g. 't1__', 'remainder__').
features_names = transformer.get_feature_names_out()
numerical_df = pd.DataFrame(data=numerical_df, columns=features_names)

In [39]:
# Column count after encoding.
numerical_df.shape

(18681, 1900)

In [40]:
# Preview the first two encoded rows.
numerical_df.head(2)

Unnamed: 0,t1__Gender_Male,t1__Experienced Candidate (Nature of work)_ DISPATCHER IN US COMPANY,t1__Experienced Candidate (Nature of work)_ Laboratory Attendant,t1__Experienced Candidate (Nature of work)_ Network Engineer,t1__Experienced Candidate (Nature of work)_ no,t1__Experienced Candidate (Nature of work)_ proctor,t1__Experienced Candidate (Nature of work)_-,t1__Experienced Candidate (Nature of work)_.,t1__Experienced Candidate (Nature of work)_/,t1__Experienced Candidate (Nature of work)_10 lakhs lone,...,remainder__Structured Thinking (In regional only).1,remainder__Structured Thinking Based on the PPT Question.1,remainder__Structured Thinking( Call pitch).1,remainder__Regional fluency based on the topic given .1,remainder__Regional fluency Based on the PPT Question.1,remainder__Regional fluency based on the sales scenario.1,remainder__Confidence Score,remainder__Structured Thinking Score,remainder__Regional Fluency Score,remainder__Total Score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.618598,1.020654,-0.424411,-1.092966,-1.063625,-1.043289,0.95149,-0.022276,-1.320677,-0.198863
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.985514,1.020654,1.006713,0.979581,1.024503,1.060405,1.415087,1.256178,1.264403,1.509342


# Principal Component Analysis (PCA) for Dimensionality Reduction
**Needed to reduce the large number of features produced by the encoding step.**

In [41]:
# A fractional n_components asks PCA to keep as many components as are needed
# to retain that share of the variance (95% here) rather than a fixed count.
pca = PCA(n_components=0.95)
# numerical_df is rebound to the projected data returned by fit_transform.
numerical_df = pca.fit_transform(numerical_df)

In [42]:
# Check what kind of object PCA returned.
type(numerical_df)

numpy.ndarray

In [44]:
 features_names = pca.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [45]:
# Confirm numerical_df is a DataFrame again.
type(numerical_df)

pandas.core.frame.DataFrame

In [46]:
# Number of components PCA kept.
numerical_df.shape

(18681, 23)

In [47]:
# Preview the principal-component representation.
numerical_df.head()

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca13,pca14,pca15,pca16,pca17,pca18,pca19,pca20,pca21,pca22
0,-0.052781,1.82304,-1.000861,-1.640338,2.153899,-0.90488,-3.322327,2.383088,1.147654,1.216667,...,-1.002317,-0.186995,1.102233,0.585393,0.070028,-0.435301,0.148319,-0.219776,-0.66465,-0.621672
1,2.923109,-2.27258,1.616185,0.522408,1.446041,1.329056,-1.950287,-1.941437,2.152077,0.779589,...,-0.057405,-0.829778,0.279894,-0.026051,0.470935,0.000236,-0.025792,0.042607,-0.017964,-0.339394
2,2.164853,-2.073269,-1.094714,-0.267596,-0.98576,0.299652,1.724536,-0.45322,1.248881,0.539528,...,-1.112322,0.481271,-0.121442,0.54224,0.17653,-0.313344,-0.809657,0.663573,0.696053,-0.599852
3,3.906563,-3.405313,-0.106782,1.147585,0.3109,-0.251753,0.245191,0.203074,-0.415105,-0.740198,...,-0.019635,0.724503,-0.090785,-0.022621,0.262717,-0.189576,-0.089993,-0.359328,0.102563,0.083939
4,-0.944235,-1.526629,1.590621,-1.558568,0.460823,-1.567873,1.634695,-0.184716,-0.024793,-0.146011,...,-1.064223,-0.125709,0.136476,0.183655,-0.484256,0.297118,-0.582925,-0.026695,-0.152585,0.173361


## Stemming  

To normalize words and reduce them to their root forms, we will apply **stemming**. This helps in handling variations of words and improves text processing efficiency for machine learning models.  
(e.g., "running" → "run")

In [48]:
# Shared Porter stemmer instance used by stem() below.
ps = PorterStemmer()
ps  # display the stemmer

<PorterStemmer>

In [49]:
def stem(text):
    """Lower-case, tokenize and Porter-stem a piece of text.

    Parameters
    ----------
    text : object
        Value to normalise. Only ``str`` values are processed.

    Returns
    -------
    object
        The stemmed tokens joined by single spaces when ``text`` is a ``str``;
        any other value (e.g. a number or a missing value) is returned unchanged.
    """
    # Leave non-text cells untouched so the function can be applied to mixed columns.
    if not isinstance(text, str):
        return text
    # NOTE: word_tokenize relies on NLTK's tokenizer data being installed
    # (e.g. via nltk.download('punkt')).
    tokens = word_tokenize(text.lower())
    return " ".join(ps.stem(token) for token in tokens)

In [None]:
# Narrow df down to the per-question and aggregate score columns.
# Selecting several columns needs a list of labels (double brackets);
# df['a', 'b', ...] is looked up as a single tuple key and raises KeyError.
# NOTE(review): this rebinds df and discards every other column, including the
# text columns - confirm that is intended at this point in the notebook.
df = df[[
    'Confidence based on Introduction (English).1',
    'Confidence based on the topic given  .1',
    'Confidence Based on the PPT Question.1',
    'Confidence based on the sales scenario.1',
    'Structured Thinking (In regional only).1',
    'Structured Thinking Based on the PPT Question.1',
    'Structured Thinking( Call pitch).1',
    'Regional fluency based on the topic given  .1',
    'Regional fluency Based on the PPT Question.1',
    'Regional fluency based on the  sales scenario.1',
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]]