# Textual preprocessing and simple feature extraction
***
## Workflow
1. Imports
2. Data cleaning
3. Simple feature creation

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
import os
import nltk
import re
import librosa
import string

In [3]:
# Path of the transcript CSV, built from a reusable prefix.
name_prefix = 'data/interview_transcripts_by_turkers'
dataset_csv_name = f'{name_prefix}.csv'

# header=None: the first row is read as data, so columns get integer labels (0, 1, ...).
df = pd.read_csv(dataset_csv_name, header=None)

## 2. Data cleaning

In [4]:
def remove_interview_indicators(string):
    """Strip speaker markup from a transcript, keeping the interviewee's words.

    The patterns below target text of the form
    ``Interviewer: ...|Interviewee: ...|Interviewer: ...``.

    Args:
        string: Raw transcript text. The parameter name shadows the ``string``
            module inside this function only; it is kept so existing callers
            are unaffected.

    Returns:
        The text with interviewer turns, ``Interviewee:`` labels, ``|``
        separators and apostrophes removed, and runs of spaces collapsed to a
        single space. Leading/trailing whitespace is not stripped.
    """
    # NOTE(review): an interviewer turn is only matched while it consists of
    # letters, digits, '.', '?', '!', spaces and straight apostrophes. Any other
    # character (e.g. a comma) ends the match early and the rest of that turn
    # stays in the text -- confirm this is acceptable for the data.
    interviewer_turn = r"Interviewer:[a-zA-Z0-9.?! ']*"

    out = re.sub(interviewer_turn + r"\|", "", string)  # interviewer turns followed by a separator
    out = re.sub(interviewer_turn, "", out)  # interviewer turn with no separator after it (e.g. the last one)
    out = re.sub(r"Interviewee:|\|Interviewee:", "", out)  # drop the Interviewee label
    out = re.sub(r"\|", "", out)  # drop any remaining separators
    out = re.sub(r" {2,}", " ", out)  # collapse every run of spaces, not just pairs
    out = re.sub("’|'", "", out)  # drop curly and straight apostrophes ("I'm" -> "Im")
    return out

In [102]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Nothing is inserted in place of a removed character, so whitespace that
    surrounded it is left untouched.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [99]:
# get the duration of the audio clips

def get_duration_lib(name):
    """Return the length in seconds of the audio clip ``Audio/<NAME>.wav``.

    Args:
        name: Clip identifier; it is upper-cased to build the file name.

    Returns:
        The duration reported by ``librosa.get_duration``, or ``0`` when the
        clip cannot be read. In that case a ``RuntimeWarning`` naming the file
        and the underlying error is emitted, so missing audio is visible
        instead of silently turning into a zero duration.
    """
    import warnings

    wav_path = f'Audio/{name.upper()}.wav'
    try:
        try:
            sec = librosa.get_duration(path=wav_path)
        except TypeError:
            # Older librosa releases only know the former keyword `filename`.
            sec = librosa.get_duration(filename=wav_path)
    except Exception as exc:
        # Best effort: keep processing the remaining clips, but say what failed.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        warnings.warn(
            f'Could not read duration of {wav_path!r} ({exc!r}); using 0',
            RuntimeWarning,
        )
        sec = 0

    return sec

In [5]:
# Work on a copy so the frame loaded from disk (`df`) keeps the raw transcripts.
transformed_interviews = df.copy()
# Column 1 holds the transcript text; replace it with the cleaned version.
transformed_interviews[1] = df[1].apply(remove_interview_indicators)

Features used in related work: the average number of words spoken per minute, the average number of unique words per minute, the count of unique words in the transcript, and the number of filler words used per minute (*Leveraging Multimodal Behavioral Analytics for Automated Job Interview Performance Assessment and Feedback*).

`wps`: words per second; `uwps`: unique words per second; `fwps`: filler words per second; `wc`: total number of words; `uwc`: total number of unique words (*Automated Analysis and Prediction of Job Interview Performance*).

In [None]:
# Column 2: clip duration in seconds, looked up from the identifier in column 0.
transformed_interviews[2] = transformed_interviews[0].apply(get_duration_lib)
# Punctuation-free transcript, then its NLTK word tokens.
transformed_interviews['remove_punctation'] = transformed_interviews[1].apply(remove_punctuations)
transformed_interviews['tokenize'] = transformed_interviews['remove_punctation'].apply(nltk.word_tokenize)

Number of filler words and non-fluencies (LIWC).

In [None]:
# wps: tokens per second of audio (column 2 is the clip duration); wc: total token count
transformed_interviews['wps'] = transformed_interviews['tokenize'].map(len).div(transformed_interviews[2])
transformed_interviews['wc'] = transformed_interviews['tokenize'].map(len)

In [105]:
# unique words: stem every token so inflected forms count as the same word
sno = nltk.stem.SnowballStemmer('english')

transformed_interviews['stemming'] = transformed_interviews['tokenize'].transform(
    lambda words: [sno.stem(word) for word in words]
)

# uwc: number of distinct stems; uwps: distinct stems per second (column 2 is the clip duration)
transformed_interviews['uwc'] = transformed_interviews['stemming'].transform(lambda words: len(set(words)))
transformed_interviews['uwps'] = transformed_interviews['uwc'] / transformed_interviews[2]
transformed_interviews.head()

Unnamed: 0,0,1,2,remove_punctation,tokenize,wps,wc,stemming,uwc,uwps
0,p1,Im pretty good. ok uhm so have you looked at ...,196.992,Im pretty good ok uhm so have you looked at m...,"[Im, pretty, good, ok, uhm, so, have, you, loo...",2.695541,531,"[im, pretti, good, ok, uhm, so, have, you, loo...",231,1.172636
1,p10,Great how about you? Im a little [???] by the...,426.0,Great how about you Im a little by the resur...,"[Great, how, about, you, Im, a, little, by, th...",2.389671,1018,"[great, how, about, you, im, a, littl, by, the...",303,0.711268
2,p11,Uhh Im a junior at MIT uhh Im double majoring...,271.992,Uhh Im a junior at MIT uhh Im double majoring...,"[Uhh, Im, a, junior, at, MIT, uhh, Im, double,...",2.441248,664,"[uhh, im, a, junior, at, mit, uhh, im, doubl, ...",228,0.83826
3,p12,Im good how are you? Ok so Im a Junior at MIT...,204.984,Im good how are you Ok so Im a Junior at MIT ...,"[Im, good, how, are, you, Ok, so, Im, a, Junio...",2.995356,614,"[im, good, how, are, you, ok, so, im, a, junio...",215,1.048862
4,p13,Good. Ok umm Im currently a junior at M.I.T. ...,294.0,Good Ok umm Im currently a junior at MIT stud...,"[Good, Ok, umm, Im, currently, a, junior, at, ...",1.914966,563,"[good, ok, umm, im, current, a, junior, at, mi...",211,0.717687


In [113]:
# Give the positional columns descriptive names before exporting.
transformed_interviews.rename(
    columns={0: 'Person', 1: 'text_unprocessed', 2: 'interview_length'},
    inplace=True,
)
# Export the features only; the intermediate 'remove_punctation' column is left out.
output = transformed_interviews[
    ['Person', 'text_unprocessed', 'tokenize', 'stemming', 'interview_length',
     'wc', 'wps', 'uwc', 'uwps']
]
output.to_csv('wordcount_uniquewordcount.csv', index=False)