In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from nltk import word_tokenize, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tests as t
import re
import nltk
from sklearn.preprocessing import OneHotEncoder
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords');

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dnuho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dnuho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [2]:
data = pd.read_csv('requests.csv')

In [3]:
data

Unnamed: 0,Request Description,Category
0,I have noticed I did not get the necessary pho...,Remuneration
1,I would like to enter a leave for last week bu...,Leave
2,I need to get my payslip.,Payroll
3,I have questions regarding my performance.,Performance and Talent
4,"I have decided to leave the company, I am tryi...",Leaving the company
5,I was expecting a higher annual salary increas...,Annual Salary Increase
6,I am facing technical issues with Success Fact...,Technical Issues
7,MY TECHNICAL PROBLEM IS STILL NOT RESOLVED,Technical Issues
8,I would like to switch to monthly remuneration.,Remuneration
9,"I ran out of personal leave, but I am sick, ho...",Leave


### Data Cleanup

#### Stemming, Lemmatization and Feature Extraction with CountVectorizer and TFIDF Vectorizer
#### Using custom preprocessor with lower_case, remove punctuation
#### Using custom tokenizer with Word_Tokenize, PorterStemmer, WordNetLemmatizer

In [4]:
## Define Preprocessor converting into lower case, removing punctuation
def preprocessor(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]"," ", text)
    text = re.sub(r"'","", text)
    return text

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

## Define tokenizer, splitting text into tokens, while also removing stopwords, applying stemming and lemmatization as well
def tokenizer(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
    return tokens

CV = CountVectorizer(stop_words='english', max_features=1000, tokenizer=tokenizer, preprocessor=preprocessor)
TFIDF = TfidfVectorizer(stop_words='english', tokenizer = tokenizer, preprocessor=preprocessor)
training_data = CV.fit_transform(data['Request Description'])
training_data_tfidf = TFIDF.fit_transform(data['Request Description'])
training_output = data['Category']
CV.vocabulary_

{'notic': 20,
 'necessari': 18,
 'phone': 27,
 'reimburs': 35,
 'like': 15,
 'enter': 5,
 'leav': 14,
 'week': 47,
 'need': 19,
 'payslip': 23,
 'question': 30,
 'regard': 34,
 'perform': 24,
 'decid': 3,
 'compani': 1,
 'tri': 44,
 'long': 16,
 'period': 25,
 'expect': 6,
 'higher': 11,
 'annual': 0,
 'salari': 39,
 'increas': 12,
 'concern': 2,
 'reflect': 33,
 'face': 7,
 'technic': 43,
 'issu': 13,
 'success': 41,
 'factor': 8,
 'problem': 28,
 'resolv': 38,
 'switch': 42,
 'monthli': 17,
 'remuner': 36,
 'ran': 31,
 'person': 26,
 'sick': 40,
 'proceed': 29,
 'rate': 32,
 'unsatisfactori': 45,
 'way': 46,
 'object': 21,
 'half': 9,
 'pay': 22,
 'help': 10,
 'year': 49,
 'end': 4,
 'wowpeopl': 48,
 'request': 37}

In [5]:
# Initiate Classifiers for both CV and TFIDF methods
bag_mod = BaggingClassifier(n_estimators=200)
rf_mod = RandomForestClassifier(n_estimators=200)
bag_tfidf_mod = BaggingClassifier(n_estimators=200)
rf_tfidf_mod = RandomForestClassifier(n_estimators=200)

### Model Fitting to RandomForecastClassifier and Baggings Classifier

In [6]:
# Fit preprocessed and featurized data to random forest and baggings classifiers
rf_mod.fit(training_data, training_output)
bag_mod.fit(training_data, training_output)
rf_tfidf_mod.fit(training_data_tfidf, training_output)
bag_tfidf_mod.fit(training_data_tfidf, training_output);

In [7]:
test_data = ['Can you please let me know where my payslip is']
test_data2 = ['Facing technical problems regarding my salary']
test_data3 = ['I do not have enough leave days left']
test_data4 = ['I need help understanding my kpis are linked to how I perform']

#### Predict using test phrases

In [8]:
print(rf_mod.predict(CV.transform(test_data)))
print(rf_tfidf_mod.predict(TFIDF.transform(test_data)))

['Payroll']
['Payroll']


In [9]:
print(rf_mod.predict(CV.transform(test_data2)))
print(rf_tfidf_mod.predict(TFIDF.transform(test_data2)))

['Technical Issues']
['Technical Issues']


In [10]:
print(rf_mod.predict(CV.transform(test_data3)))
print(rf_tfidf_mod.predict(TFIDF.transform(test_data3)))

['Leave']
['Leave']


In [11]:
print(rf_mod.predict(CV.transform(test_data4)))
print(rf_tfidf_mod.predict(TFIDF.transform(test_data4)))

['Performance and Talent']
['Performance and Talent']
