In [None]:
import gensim

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Torch 
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

# Scikit-Learn
# Data Encoding and Scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

# Natural Language Processing(NLP)
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK resources needed by the tokenisation and stop-word cells.
nltk.download('punkt')      # Tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # Tokenizer tables that newer NLTK releases look up instead of 'punkt'
nltk.download('stopwords')  # For stopword removal

In [None]:
# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Load the raw interviewee records; the path is relative to the working directory.
df = pd.read_csv('IntervieweeDataset.csv')

In [None]:
df.head(1)

In [None]:
df.shape

In [None]:
# Discard every row that contains at least one missing value.
# The surviving rows keep their original index labels.
df = df.dropna()

In [None]:
df.shape

In [None]:
# Remove the ten columns sitting at positions 8 through 17.
# The selection is positional, so re-running this cell would remove ten more columns.
df = df.drop(columns=df.columns[8:18])

In [None]:
# Remove two survey columns by name.
df = df.drop(columns=[
    'Does the candidate has mother tongue influence while speaking english.',
    'How many slides candidate have submitted in PPT?',
])

In [None]:
df.shape

In [None]:
# Give three verbose survey headers shorter names.
df = df.rename(columns={
    'Type of Graduation/Post Graduation': 'Education',
    'Mode of interview given by candidate?': 'Mode of Interview',
    'Has acquaintance in Company and has spoken to him/her before applying?': 'Acquaintance and Referral',
})

In [None]:
# Strip the trailing newline / tab characters embedded in two column headers.
df = df.rename(columns={
    'Whether joined the company or not\n': 'Whether joined the company or not',
    'What was the type of Role?\t': 'What was the type of Role?',
})

In [None]:
# Drop the trailing ".(Time: ...)" annotation from this long column header.
df = df.rename(columns={
    "But, my child's exam are going on now, so we will keep the counselling session after the exams get over.(Time: Favourable pitch: Counsellor hype)":
        "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
})

In [None]:
df.columns

In [None]:
df['Age'].unique()

In [None]:
# Convert Age to integers: work on the string form, remove any literal '+'
# character, then cast.
age_as_text = df['Age'].astype(str)
df['Age'] = age_as_text.str.replace('+', '', regex=False).astype(int)

In [None]:
df['Age'].unique()

In [None]:
# Edges of the age groups, and the label for each consecutive pair of edges
# (one label fewer than there are edges).
bins = [
    18,
    22,
    25,
    28,
    32,
    35,
    float('inf'),
]
labels = [
    '18-22',
    '23-25',
    '26-28',
    '29-32',
    '33-35',
    '35+',
]

In [None]:
# Bucket the integer ages into the labelled groups. Intervals are closed on the
# right, so without include_lowest=True the smallest edge (age 18) would fall
# outside the first interval and become NaN.
df['Age'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

In [None]:
df.isnull().sum()

In [None]:
df['Age'].unique()

In [None]:
df.columns

In [None]:
# Separate frame holding only the per-question ".1" columns and the aggregate scores.
new_df = df.loc[:, [
    'Confidence based on Introduction (English).1',
    'Confidence based on the topic given  .1',
    'Confidence Based on the PPT Question.1',
    'Confidence based on the sales scenario.1',
    'Structured Thinking (In regional only).1',
    'Structured Thinking Based on the PPT Question.1',
    'Structured Thinking( Call pitch).1',
    'Regional fluency based on the topic given  .1',
    'Regional fluency Based on the PPT Question.1',
    'Regional fluency based on the  sales scenario.1',
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]]

In [None]:
# A notebook cell renders only its last expression, so the one-row preview is
# displayed explicitly before the column summary is printed.
display(new_df.head(1))

new_df.info()

In [None]:
df.shape

In [None]:
# Store each row's index label in an explicit 'id' column. The index has not been
# reset since the dropna cell, so these labels are not guaranteed to be contiguous.
df['id'] = df.index

In [None]:
df.head()

# New DataFrame for numerical operations only (identifier, name and free-text columns excluded)

In [None]:
# Build the frame used for scaling/encoding: everything in `df` except the
# identifier, the name and the free-text columns listed here.
excluded_columns = [
    'id',
    'Name',
    'Comments',
    'RedFlags Comments in Interview',
    'Call-pitch Elements used during the call Sales Scenario',
    "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
    'Let me discuss it with my child',
    "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",
    'Role Location to be given to the candidate',
]
numerical_df = df.drop(columns=excluded_columns)

# Data Scaling
`StandardScaler` from `sklearn.preprocessing` rescales each selected column so that it has:

- Mean = 0
- Standard deviation = 1

In [None]:
# Unfitted scaler; it is fitted on the score columns in a later cell.
scaler = StandardScaler()
scaler

In [None]:
# Columns to standardise: the per-question ".1" columns plus the aggregate scores.
temp_list = [
    'Confidence based on Introduction (English).1',
    'Confidence based on the topic given  .1',
    'Confidence Based on the PPT Question.1',
    'Confidence based on the sales scenario.1',
    'Structured Thinking (In regional only).1',
    'Structured Thinking Based on the PPT Question.1',
    'Structured Thinking( Call pitch).1',
    'Regional fluency based on the topic given  .1',
    'Regional fluency Based on the PPT Question.1',
    'Regional fluency based on the  sales scenario.1',
    'Confidence Score',
    'Structured Thinking Score',
    'Regional Fluency Score',
    'Total Score',
]

In [None]:
# Standardise the score columns and print a sanity check of the result.
numerical_df[temp_list] = scaler.fit_transform(numerical_df[temp_list])
print("Means after scaling:\n", numerical_df[temp_list].mean().round(5))
# StandardScaler divides by the population standard deviation (ddof=0). pandas
# defaults to the sample estimate (ddof=1), which would print values slightly
# above 1, so the check asks for ddof=0 explicitly.
print("Standard deviations after scaling:\n", numerical_df[temp_list].std(ddof=0).round(5))

In [None]:
numerical_df.head()

In [None]:
numerical_df.shape

# Data Encoding

In [None]:
numerical_df['Role acceptance'].value_counts()

In [None]:
# Encoding plan for the categorical columns. Columns not named in any entry are
# passed through unchanged (remainder='passthrough').
transformer = ColumnTransformer(transformers = [
    # t1: one-hot encode the nominal columns, dropping the first level of each.
    ('t1', OneHotEncoder(sparse_output=False, drop='first'), ['Gender', 'Experienced Candidate (Nature of work)', 'What was the type of Role?', 'Whether joined the company or not', 'Currently Employed', 'Marital status', 'Mode of Interview', 'Pre Interview Check', 'Fluency in English based on introduction', 'Acquaintance and Referral', 'Candidate Status', 'Education']),
    # t2-t7: ordinal encodings; each category list spells out the order explicitly.
    # Note the trailing space in the 'Last Fixed CTC (lakhs) ' column name.
    ('t2', OrdinalEncoder(categories=[['Fresher','0-1.99','2-2.99','3-3.99','4-4.99','5-5.99','6-6.99','7+']]), ['Last Fixed CTC (lakhs) ']),
    ('t3', OrdinalEncoder(categories=[['No - Want Specific Centre Location Only','Yes - Anywhere Within a City','Yes - Anywhere Within a State','Yes - Anywhere in PAN India']]), ['Candidate is willing to relocate']),
    ('t4', OrdinalEncoder(categories=[['Reject','Borderline Reject','Borderline Select','Select','Premium Select']]), ['Interview Verdict']),
    ('t5', OrdinalEncoder(categories=[['No','Yes : Think and says yes.(Shows some hesitation)','Emphatic Yes']]), ['Role acceptance']),
    ('t6', OrdinalEncoder(categories=[['Fresher(<6 months)','6-11.99 Months','12-17.99 Months','18-23.99 Months','24-29.99 Months','30-35.99 Months','36-47.99 Months','48+ Months']]), ['Experienced candidate - (Experience in months)']),
    # The Age categories are the same labels that the pd.cut cell assigns.
    ('t7', OrdinalEncoder(categories=[['18-22', '23-25', '26-28', '29-32', '33-35', '35+']]), ['Age'])
], remainder='passthrough')

In [None]:
# Fit the encoders and rebind `numerical_df` to the encoded output.
# Re-running this cell would feed the encoded output back into the transformer.
numerical_df = transformer.fit_transform(numerical_df)

In [None]:
type(numerical_df)

In [None]:
# Rebuild a DataFrame labelled with the transformer's generated feature names.
# No index is passed, so the index labels of `df` are not carried over.
features_names = transformer.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [None]:
numerical_df.shape

In [None]:
numerical_df.head(2)

# Principal Component Analysis (PCA) for Dimensionality Reduction
**Required to reduce the number of features**

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed to
# explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)
numerical_df = pca.fit_transform(numerical_df)

In [None]:
type(numerical_df)

In [None]:
 features_names = pca.get_feature_names_out()
numerical_df = pd.DataFrame(numerical_df, columns=features_names)

In [None]:
type(numerical_df)

In [None]:
numerical_df.shape

In [None]:
numerical_df.head()

In [None]:
numerical_df.isnull().sum()

In [None]:
# Keep only the identifier and free-text columns. The explicit copy makes the
# result an independent frame, so assigning columns to it in later cells does
# not raise pandas' SettingWithCopyWarning.
df = df[[
    'id',
    'Name',
    'Comments',
    'RedFlags Comments in Interview',
    'Call-pitch Elements used during the call Sales Scenario',
    "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
    'Let me discuss it with my child',
    "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",
    'Role Location to be given to the candidate',
]].copy()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Turn every object-dtype column into lists of whitespace-separated tokens.
# Values that are not strings (for example lists left by an earlier run of this
# cell) are kept as they are, so the cell can be re-run without raising.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(lambda x: x.split() if isinstance(x, str) else x)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Concatenate the token lists of all text columns into a single list per row
# (adding two list-valued Series joins the lists element by element).
tag_source_columns = [
    'Name',
    'Comments',
    'RedFlags Comments in Interview',
    'Call-pitch Elements used during the call Sales Scenario',
    "But, my child's exam are going on now, so we will keep the counselling session after the exams get over",
    'Let me discuss it with my child',
    "Sir being in education industry I know this is a marketing gimmick and at the end of the day you'll be selling the app.",
    'Role Location to be given to the candidate',
]
combined_tokens = df[tag_source_columns[0]]
for column_name in tag_source_columns[1:]:
    combined_tokens = combined_tokens + df[column_name]
df['tags'] = combined_tokens

In [None]:
df.head()

In [None]:
df.loc[1,'tags']

In [None]:
# Keep only the identifier, the name and the combined tags. Copy the selection so
# later assignments to 'tags' act on an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[['id','Name','tags']].copy()

In [None]:
df.head(1)

## Stemming
**Note: lemmatization can be used instead of stemming for more accurate root forms.**

To normalize words and reduce them to their root forms, we will apply **stemming**. This helps in handling variations of words and improves text processing efficiency for machine learning models.  
(e.g., "running" → "run")

In [None]:
# Porter stemmer instance that the stem() helper calls for every token.
ps = PorterStemmer()
ps

In [None]:
# NLTK's English stop-word list, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def stem(text):
    """Stem every item of ``text`` and return the stems joined by single spaces.

    ``text`` is iterated item by item, so it should be a sequence of word tokens;
    a plain string would be processed one character at a time. Relies on the
    notebook-level Porter stemmer ``ps``.
    """
    return " ".join(ps.stem(token) for token in text)

In [None]:
# Applying Stemming: each row's token list becomes one space-joined string of stems.
stemmed_tags = df['tags'].apply(stem)
df['tags'] = stemmed_tags

In [None]:
df.head()

In [None]:
df.loc[1,'tags']

In [None]:
df.isnull().sum()

In [None]:
def stopwords_removal(text):
    """Lower-case and tokenise ``text``, then drop tokens present in ``stop_words``.

    Returns the remaining tokens joined by single spaces. Relies on the
    notebook-level ``stop_words`` set and NLTK's ``word_tokenize``.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Applying Stopwords Removal
# NOTE(review): the tags were already stemmed in an earlier cell, so stop words
# whose stem differs from the dictionary form (Porter turns 'this' into 'thi',
# for example) are not matched here. Consider removing stop words before stemming.
filtered_tags = df['tags'].apply(stopwords_removal)
df['tags'] = filtered_tags

In [None]:
df.head(2)

In [None]:
df.loc[1,'tags']

In [None]:
def remove_duplicates(text):
    """Return ``text`` with repeated whitespace-separated words removed.

    The first occurrence of each word is kept, the original order is preserved,
    and the surviving words are joined by single spaces. Matching is exact
    (case-sensitive).
    """
    # dict keys are unique and remember insertion order.
    return " ".join(dict.fromkeys(text.split()))

In [None]:
# Collapse repeated words inside each row's tags, keeping the first occurrence.
df['tags'] = df['tags'].apply(remove_duplicates)

In [None]:
# Delete literal '/' characters from the tags (plain substring match, no regex).
df['tags'] = df['tags'].str.replace('/', '', regex=False)

In [None]:
df.loc[1,'tags']

In [None]:
# `numerical_df` was rebuilt from arrays and carries a fresh 0..n-1 index, while
# `df` still has the index labels left over from dropna. Joining on those labels
# would pair rows with the wrong PCA features and silently drop rows. Both frames
# hold the same rows in the same order, so align them by position instead.
assert len(df) == len(numerical_df), "text and numeric frames must have the same number of rows"
df = df.reset_index(drop=True).join(numerical_df.reset_index(drop=True), how='inner')

In [None]:
df.shape

In [None]:
df.head(30)

In [None]:
df['pca0'].unique()

In [None]:
df.isnull().sum()

## Text Vectorization using Bag of Words  

Now that the tags are in text form, we will convert them into vector form using the **Bag of Words (BoW)** technique. This process transforms text into numerical representations, making it suitable for machine learning models.  

In [None]:
# Bag-of-words vectoriser whose vocabulary is capped at 5000 terms.
cv = CountVectorizer(max_features=5000)

In [None]:
# Learn the vocabulary from the tags and convert the resulting counts to a dense array.
vectors = cv.fit_transform(df['tags']).toarray()