## Import data and pre-processing

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('reviews_final_cleaned.csv')
df.head()

Unnamed: 0,id,reviews.val.date,overall_review,Location,Title,overall_rating,careerOpportunities_rating,compBenefits_rating,cultureValues_rating,seniorManagement_rating,worklifeBalance_rating,overview.industry,header.employerName,cleaned_review,cleaned_review2
0,44024,2020-01-01,Good benefits and flexible hours Micromanageme...,Singapore (Singapore),Consultant,3.0,3.0,3.0,3.0,3.0,4.0,Government Agencies,GovTech,"['good', 'benefit', 'flexible', 'hour', 'micro...","['good', 'benefit', 'flexibl', 'hour', 'microm..."
1,44024,2020-01-01,"Good culture, advanced skillsets to learn from...",Singapore (Singapore),Intern,5.0,4.0,3.0,5.0,4.0,4.0,Government Agencies,GovTech,"['good', 'culture', 'advanced', 'skillsets', '...","['good', 'cultur', 'advanc', 'skillset', 'lear..."
2,44024,2020-01-01,- Clean and bright offices \r\n- Friendly &amp...,Singapore (Singapore),Intern,4.0,4.0,4.0,5.0,4.0,5.0,Government Agencies,GovTech,"['clean', 'bright', 'office', 'friendly', 'amp...","['clean', 'bright', 'offic', 'friend', 'amp', ..."
3,44024,2019-09-21,there are many training program No management ...,Singapore (Singapore),Senior Manager,1.0,2.0,4.0,1.0,1.0,3.0,Government Agencies,GovTech,"['many', 'training', 'program', 'management', ...","['mani', 'train', 'program', 'manag', 'skill',..."
4,44054,2019-09-11,"no OT (work-life balance), just enough workloa...",Singapore (Singapore),IT Business Analyst,4.0,4.0,4.0,4.0,3.0,5.0,Healthcare Product Manufacturing,Essilor,"['ot', 'work', 'life', 'balance', 'enough', 'w...","['ot', 'work', 'life', 'balanc', 'enough', 'wo..."


In [2]:
#Review class proportion
print(df['overall_review'].count(),'\n\n',df['overall_rating'].value_counts(),'\n\n',
      df['overall_review'].count()==df['overall_rating'].count())

11742 

 5.0    4373
4.0    3691
3.0    1668
1.0    1272
2.0     738
Name: overall_rating, dtype: int64 

 True


In [3]:
#Remove NaN and blank 'review text' strings
df.dropna(inplace=True)

blanks=[]

for i in df.index:
    if type(df.iloc[i,2])==str:
        if df.iloc[i,2].isspace():
            blanks.append(i)

df.drop(blanks, inplace=True)

In [4]:
#Check if any NaN and blank strings were removed
print(df['overall_review'].count(),'\n\n',df['overall_rating'].value_counts(),'\n\n',
      df['overall_review'].count()==df['overall_rating'].count())

11742 

 5.0    4373
4.0    3691
3.0    1668
1.0    1272
2.0     738
Name: overall_rating, dtype: int64 

 True


## Import NLTK and VADER

In [5]:
import nltk
#Download "Vader" - Optional
#nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

## Apply VADER

In [6]:
#Apply VADER to 'overall_review' text column
df['scores'] = df['overall_review'].apply(lambda review: sid.polarity_scores(review))

In [7]:
df.head().scores

0    {'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'comp...
1    {'neg': 0.0, 'neu': 0.786, 'pos': 0.214, 'comp...
2    {'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'comp...
3    {'neg': 0.197, 'neu': 0.673, 'pos': 0.13, 'com...
4    {'neg': 0.1, 'neu': 0.773, 'pos': 0.127, 'comp...
Name: scores, dtype: object

In [8]:
#Split the VADER sentiments scores into columns in the df
df['positive'] = df['scores'].apply(lambda score_dict: score_dict['pos'])
df['negative'] = df['scores'].apply(lambda score_dict: score_dict['neg'])
df['neutral'] = df['scores'].apply(lambda score_dict: score_dict['neu'])
df['compound']  =df['scores'].apply(lambda score_dict: score_dict['compound'])

In [9]:
df.head()

Unnamed: 0,id,reviews.val.date,overall_review,Location,Title,overall_rating,careerOpportunities_rating,compBenefits_rating,cultureValues_rating,seniorManagement_rating,worklifeBalance_rating,overview.industry,header.employerName,cleaned_review,cleaned_review2,scores,positive,negative,neutral,compound
0,44024,2020-01-01,Good benefits and flexible hours Micromanageme...,Singapore (Singapore),Consultant,3.0,3.0,3.0,3.0,3.0,4.0,Government Agencies,GovTech,"['good', 'benefit', 'flexible', 'hour', 'micro...","['good', 'benefit', 'flexibl', 'hour', 'microm...","{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'comp...",0.481,0.0,0.519,0.7506
1,44024,2020-01-01,"Good culture, advanced skillsets to learn from...",Singapore (Singapore),Intern,5.0,4.0,3.0,5.0,4.0,4.0,Government Agencies,GovTech,"['good', 'culture', 'advanced', 'skillsets', '...","['good', 'cultur', 'advanc', 'skillset', 'lear...","{'neg': 0.0, 'neu': 0.786, 'pos': 0.214, 'comp...",0.214,0.0,0.786,0.5994
2,44024,2020-01-01,- Clean and bright offices \r\n- Friendly &amp...,Singapore (Singapore),Intern,4.0,4.0,4.0,5.0,4.0,5.0,Government Agencies,GovTech,"['clean', 'bright', 'office', 'friendly', 'amp...","['clean', 'bright', 'offic', 'friend', 'amp', ...","{'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'comp...",0.423,0.0,0.577,0.8316
3,44024,2019-09-21,there are many training program No management ...,Singapore (Singapore),Senior Manager,1.0,2.0,4.0,1.0,1.0,3.0,Government Agencies,GovTech,"['many', 'training', 'program', 'management', ...","['mani', 'train', 'program', 'manag', 'skill',...","{'neg': 0.197, 'neu': 0.673, 'pos': 0.13, 'com...",0.13,0.197,0.673,-0.128
4,44054,2019-09-11,"no OT (work-life balance), just enough workloa...",Singapore (Singapore),IT Business Analyst,4.0,4.0,4.0,4.0,3.0,5.0,Healthcare Product Manufacturing,Essilor,"['ot', 'work', 'life', 'balance', 'enough', 'w...","['ot', 'work', 'life', 'balanc', 'enough', 'wo...","{'neg': 0.1, 'neu': 0.773, 'pos': 0.127, 'comp...",0.127,0.1,0.773,0.1531


In [10]:
#Check y-label data type
df['overall_rating'].dtype

dtype('float64')

In [11]:
# Create Feature (VADER scores - Predictor) and Label (Rating - Prediction) sets for rating classifier using ML
X = df[['positive','negative','neutral','compound']]
y = df['overall_rating']

## XGBoost Classifier (Search for params with CV)

We perform a rating classification with the VADER scores using a classifer: XGBoost. We first obtain the optimal parameters for the XGBoost classifier through hyperparameter search function and then apply those for final prediction.

In [12]:
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix, classification_report, matthews_corrcoef

We base the optimal parameter search using scoring: MCC (Matthews correlation coefficient). The coefficient takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. It can be argued that predicting rating (1) vs (3) vs (5) wrongly may be equally costly, hence the balance approach may be appropriate.

In [241]:
xgb_model_cv = xgb.XGBClassifier(objective = 'multi:softprob')

params = {'n_estimators': [1000],
              'learning_rate': stats.uniform(0.01, 0.07),
              'subsample': stats.uniform(0.3, 0.7),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.45),
              'min_child_weight': [1, 2, 3]
             }

MCC_scoring = make_scorer(matthews_corrcoef)
search = RandomizedSearchCV(xgb_model_cv, param_distributions=params, scoring=MCC_scoring,
                            random_state=0, n_iter=25, cv=5, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X, y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 49.3min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='multi:softprob',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, sca...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C453F98788>,
                                        'max

In [242]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

Best model iteration achieve a 5-fold CV score of mean 0.915 with standard deviation of 0.009.

In [243]:
#Obtain the best 5-fold cv model results
report_best_scores(search.cv_results_, 1)

Model with rank: 1
Mean validation score: 0.915 (std: 0.009)
Parameters: {'colsample_bytree': 0.8892961525874232, 'learning_rate': 0.04562782724669102, 'max_depth': 9, 'min_child_weight': 2, 'n_estimators': 1000, 'subsample': 0.9448103271660397}



In [271]:
#All hyperparameter search results
search.cv_results_

{'mean_fit_time': array([18.5461432 , 12.50150852, 27.07565579, 22.06939516, 16.76081247,
        12.15599484, 22.65410304, 11.37326388, 26.19000049, 15.07591791,
        24.48313818, 17.40272365, 25.49526124, 15.59579253, 18.03301096,
        17.48630133, 12.35990276, 15.49015164, 21.70415258, 25.29574175,
        11.96460919, 21.43225455, 24.15384965, 25.23164802, 20.4558188 ]),
 'std_fit_time': array([0.1772133 , 0.18150479, 0.09090052, 0.27783403, 0.16009472,
        0.1093701 , 0.21670458, 0.20740178, 0.11631261, 0.25245598,
        0.4368029 , 0.31646421, 0.26886075, 0.23364289, 0.20251664,
        0.26423259, 0.20295659, 0.05990511, 0.16506018, 0.13333402,
        0.2180399 , 0.20723501, 0.16057322, 0.36456187, 0.29832673]),
 'mean_score_time': array([0.83153381, 0.38442273, 1.58826389, 1.04600105, 0.63585463,
        0.37437644, 0.93984699, 0.34843755, 1.21667042, 0.45500469,
        0.98071642, 1.09135356, 1.3082612 , 0.62728653, 1.0156394 ,
        1.0960259 , 0.3958025 , 0.6

## XGBoost Classifier (Predict with optimised param)

We predict using the above optimised parameters. 

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

print('Training Data Shape:', X_train.shape)
print('Testing Data Shape: ', X_test.shape)

Training Data Shape: (10567, 4)
Testing Data Shape:  (1175, 4)


In [14]:
xgb_model = xgb.XGBClassifier(silent=False, 
                              scale_pos_weight=1,
                              learning_rate=0.04562782724669102,  
                              colsample_bytree = 0.8892961525874232,
                              subsample = 0.9448103271660397,
                              objective='multi:softprob', 
                              n_estimators=1000, 
                              reg_alpha = 0.3,
                              min_child_weight = 2,
                              max_depth=9, 
                              gamma=1)

In [15]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8892961525874232, gamma=1,
              learning_rate=0.04562782724669102, max_delta_step=0, max_depth=9,
              min_child_weight=2, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0.3, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=False, subsample=0.9448103271660397, verbosity=1)

In [16]:
y_pred = xgb_model.predict(X_test)

Achieved an MCC-score of 0.918.

In [17]:
#Print scores
print(confusion_matrix(y_test, y_pred),'\n')
print(classification_report(y_test,y_pred),'\n')
print('MCC-score:', matthews_corrcoef(y_test,y_pred))

[[112   0   2   1   3]
 [  2  67   2   2   2]
 [  1   1 134  13  11]
 [  0   3   4 361  12]
 [  0   0   3   7 432]] 

              precision    recall  f1-score   support

         1.0       0.97      0.95      0.96       118
         2.0       0.94      0.89      0.92        75
         3.0       0.92      0.84      0.88       160
         4.0       0.94      0.95      0.95       380
         5.0       0.94      0.98      0.96       442

    accuracy                           0.94      1175
   macro avg       0.94      0.92      0.93      1175
weighted avg       0.94      0.94      0.94      1175
 

MCC-score: 0.918342304757046
