In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the interview question/answer dataset; the path is relative to the working directory.
DATA_PATH = "new_interview_data.csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# Inspect column names, non-null counts and dtypes right after loading.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2151 entries, 0 to 2150
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   interview_type  2151 non-null   object
 1   question        2151 non-null   object
 2   answer          2151 non-null   object
 3   category        2151 non-null   object
dtypes: object(4)
memory usage: 67.3+ KB


In [5]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,interview_type,question,answer,category
0,behavioural,Describe a time when you had to meet a tight d...,I had a project that had a deadline approachin...,poor
1,situational,You are leading a team to implement a new soft...,I would first have a private conversation with...,average
2,situational,You are tasked with designing a new website fo...,I would schedule regular check-ins with the cl...,great
3,behavioural,Can you give an example of a time when you had...,I had a project that required using a new prog...,poor
4,behavioural,Describe a situation where you had to deal wit...,I had a project where different stakeholders h...,average


In [6]:
# Class balance of the target label (bracket access avoids any clash with DataFrame attributes).
data["category"].value_counts()

category
great      1238
average     484
poor        429
Name: count, dtype: int64

In [7]:
# NOTE(review): same call as the earlier data.info() cell with no transformation
# in between, so the output is identical; this cell can likely be removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2151 entries, 0 to 2150
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   interview_type  2151 non-null   object
 1   question        2151 non-null   object
 2   answer          2151 non-null   object
 3   category        2151 non-null   object
dtypes: object(4)
memory usage: 67.3+ KB


### Preprocessing

In [8]:
# Ordinal encoding of the answer-quality labels: poor < average < great.
category_mapping = {'poor': 0, 'average': 1, 'great': 2}

# Map the values of the 'category' column into a new numeric column.
data['mapped_category'] = data['category'].map(category_mapping)

# Series.map returns NaN for any label that is missing from the dict; fail loudly
# here instead of carrying silent NaNs (and a float column) into later steps.
unmapped_categories = data.loc[data['mapped_category'].isna(), 'category'].unique()
assert len(unmapped_categories) == 0, f"Unmapped category values: {unmapped_categories!r}"


In [9]:
# Confirm that the new mapped_category column lines up with the original category labels.
data.head()

Unnamed: 0,interview_type,question,answer,category,mapped_category
0,behavioural,Describe a time when you had to meet a tight d...,I had a project that had a deadline approachin...,poor,0
1,situational,You are leading a team to implement a new soft...,I would first have a private conversation with...,average,1
2,situational,You are tasked with designing a new website fo...,I would schedule regular check-ins with the cl...,great,2
3,behavioural,Can you give an example of a time when you had...,I had a project that required using a new prog...,poor,0
4,behavioural,Describe a situation where you had to deal wit...,I had a project where different stakeholders h...,average,1


In [10]:
# Fold the labels to lower case so variants that differ only by case collapse together.
interview_type_lower = data["interview_type"].str.lower()
data["interview_type"] = interview_type_lower

In [11]:
# Spelling normalisation for interview_type: American spelling -> British spelling.
corrections = dict(behavioral='behavioural')

In [12]:
# Replace whole-cell values that match a key in `corrections` with the corrected spelling.
data["interview_type"] = data["interview_type"].replace(to_replace=corrections)

In [13]:
# Index labels of rows whose interview_type is the literal string "answer".
is_stray_row = data['interview_type'] == "answer"
row_idx = data.index[is_stray_row]

In [14]:
# dropped = data.drop(data[data['interview_type']== "answer"].index)
# dropped.interview_type.value_counts()

In [15]:
#new_data = data.drop([row_idx[0]])
#new_data.interview_type.value_counts()

In [17]:
# Check which interview_type labels remain after lower-casing and spelling correction.
data['interview_type'].value_counts()

interview_type
behavioural    1174
situational     976
answer            1
Name: count, dtype: int64

In [20]:
# Remove the stray row(s) whose interview_type is the literal "answer".
# Filtering with a mask instead of dropping `row_idx[0]` in place removes every
# match (not only the first), does not raise IndexError when nothing matches,
# and can be re-run without a KeyError. The original index labels are kept;
# .copy() makes the result an independent frame for the column assignments below.
data = data.loc[data['interview_type'] != "answer"].copy()

In [21]:
# Re-check the label counts after removing the stray "answer" row.
data['interview_type'].value_counts()

interview_type
behavioural    1174
situational     976
Name: count, dtype: int64

In [22]:
# Numeric encoding of the interview type.
interview_type_map = {'behavioural' : 1, 'situational' : 2}

data['interview_map'] = data['interview_type'].map(interview_type_map)

# Series.map returns NaN for labels missing from the dict (for example the stray
# "answer" label if the clean-up cell above was not run); fail loudly instead of
# silently producing NaNs and a float column.
unmapped_types = data.loc[data['interview_map'].isna(), 'interview_type'].unique()
assert len(unmapped_types) == 0, f"Unmapped interview_type values: {unmapped_types!r}"

In [28]:
# Confirm that interview_map lines up with the interview_type labels.
data.head()

Unnamed: 0,interview_type,question,answer,category,mapped_category,interview_map
0,behavioural,Describe a time when you had to meet a tight d...,I had a project that had a deadline approachin...,poor,0,1
1,situational,You are leading a team to implement a new soft...,I would first have a private conversation with...,average,1,2
2,situational,You are tasked with designing a new website fo...,I would schedule regular check-ins with the cl...,great,2,2
3,behavioural,Can you give an example of a time when you had...,I had a project that required using a new prog...,poor,0,1
4,behavioural,Describe a situation where you had to deal wit...,I had a project where different stakeholders h...,average,1,1


In [42]:
# The encoded columns replace the raw labels, so drop the originals.
# `columns=` states the axis explicitly, and rebinding the result avoids
# `inplace=True`, which pandas documentation discourages and which prevents chaining.
data = data.drop(columns=["interview_type", "category"])

In [44]:
# Preview the frame after dropping the raw label columns.
data.head()

Unnamed: 0,question,answer,mapped_category,interview_map
0,Describe a time when you had to meet a tight d...,I had a project that had a deadline approachin...,0,1
1,You are leading a team to implement a new soft...,I would first have a private conversation with...,1,2
2,You are tasked with designing a new website fo...,I would schedule regular check-ins with the cl...,2,2
3,Can you give an example of a time when you had...,I had a project that required using a new prog...,0,1
4,Describe a situation where you had to deal wit...,I had a project where different stakeholders h...,1,1


In this transformed dataset, the answer categories are encoded as:

- poor = 0
- average = 1
- great = 2

and the interview types are encoded as:

- behavioural questions = 1
- situational questions = 2

In [46]:
# Normalise the question text to lower case.
question_lower = data['question'].str.lower()
data['question'] = question_lower

In [47]:
# Normalise the answer text to lower case.
answer_lower = data['answer'].str.lower()
data['answer'] = answer_lower

In [48]:
# Preview the text columns after lower-casing.
data.head()

Unnamed: 0,question,answer,mapped_category,interview_map
0,describe a time when you had to meet a tight d...,i had a project that had a deadline approachin...,0,1
1,you are leading a team to implement a new soft...,i would first have a private conversation with...,1,2
2,you are tasked with designing a new website fo...,i would schedule regular check-ins with the cl...,2,2
3,can you give an example of a time when you had...,i had a project that required using a new prog...,0,1
4,describe a situation where you had to deal wit...,i had a project where different stakeholders h...,1,1


#### Punctuation removal

In [52]:
import string
# Show the set of characters that remove_punc (defined below) strips from the text.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [54]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, because remove_punc is applied
# to each row of the text columns.
_PUNC_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so hyphenated words are
    joined (e.g. ``"self-study"`` becomes ``"selfstudy"``).

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input without ASCII punctuation characters.
    """
    return text.translate(_PUNC_TABLE)

In [55]:
# Strip punctuation from every answer; the function is passed directly, no lambda wrapper needed.
data['answer'] = data['answer'].apply(remove_punc)

In [56]:
# Strip punctuation from every question; the function is passed directly, no lambda wrapper needed.
data['question'] = data['question'].apply(remove_punc)

In [58]:
# Spot-check one cleaned answer: the row with index label 3.
data.loc[3, "answer"]

'i had a project that required using a new programming language that i was unfamiliar with i dedicated extra time to selfstudy utilized online resources and sought guidance from colleagues to quickly grasp the basics and successfully complete the project on time'

#### System with stopword removal

In [64]:
import nltk
from nltk.corpus import stopwords

In [65]:
# Download the NLTK stop-word corpus that stopwords.words() reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayoyinka/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [66]:
# Base English stop-word list shipped with NLTK.
nltk_stop_words = stopwords.words('english')

# Punctuation has already been deleted from the text columns, so a contraction
# such as "don't" now appears as "dont" and would never match its apostrophe
# form in the NLTK list. Add the apostrophe-free spelling of every entry so
# those tokens are filtered as well.
# Trade-off: if the list contains forms like "we'll", their stripped spelling
# ("well") is also treated as a stop word, because the cleaned text can no
# longer tell the two apart.
stop_words = set(nltk_stop_words) | {word.replace("'", "") for word in nltk_stop_words}

In [67]:
def remove_stopwords(txt):
    """Drop stop words from a whitespace-separated string.

    The text is split on whitespace, tokens found in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are joined
    back together with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, txt.split())
    return " ".join(kept_tokens)

In [68]:
# Filter stop words out of every answer; the function is passed directly, no lambda wrapper needed.
data['answer'] = data['answer'].apply(remove_stopwords)

In [69]:
# Filter stop words out of every question; the function is passed directly, no lambda wrapper needed.
data['question'] = data['question'].apply(remove_stopwords)

In [73]:
# Preview the text columns after stop-word removal.
data.head()

Unnamed: 0,question,answer,mapped_category,interview_map
0,describe time meet tight deadline project prio...,project deadline approaching quickly focused b...,0,1
1,leading team implement new software system tea...,would first private conversation team member u...,1,2
2,tasked designing new website client keep chang...,would schedule regular checkins client ensure ...,2,2
3,give example time learn new technology quickly...,project required using new programming languag...,0,1
4,describe situation deal conflicting priorities...,project different stakeholders conflicting pri...,1,1
