In [1]:
import nltk
import spacy
nltk.download('punkt')

import pickle
# load model packages
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Load some metrics
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import classification_report

# Load other sklearn packages
from sklearn.model_selection import GridSearchCV

# Load other packages
from tqdm import tqdm

[nltk_data] Downloading package punkt to /Users/leon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the spaCy `en_core_web_lg` pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook;
# presumably helper.py uses its own pipeline -- confirm whether this load is still needed.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Load custom model/data
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load files that come from a trusted source.
# The model file is opened in a `with` block (like the data files below) so the
# handle is closed as soon as the model has been read.
with open('model_collection/final_xgb_v2.sav', 'rb') as f:
    xgb_model = pickle.load(f)

# Load data
with open('data_collection/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('data_collection/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open('data_collection/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('data_collection/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)


In [4]:
# Sanity-check the loaded xgb model on the test split
# Predict labels for the test features
xgb_result = xgb_model.predict(X_test)
# Print the classification report (true labels vs. predictions)
print(classification_report(y_true=y_test, y_pred=xgb_result))

              precision    recall  f1-score   support

         0.0       0.67      0.66      0.67      1992
         1.0       0.67      0.68      0.67      1992

    accuracy                           0.67      3984
   macro avg       0.67      0.67      0.67      3984
weighted avg       0.67      0.67      0.67      3984



OK, the XGBoost model loads and predicts correctly (about 0.67 accuracy on the test set), so it is ready to use. Let's try it on some sample sentences.

In [5]:
# Load from helper.py with two helper functions
from helper import *

In [6]:
# Function to loop through text files
def load_sentence(file_name, model = xgb_model, score_format = "prob"):
    """Score every non-blank line of a text file.

    Parameters
    ----------
    file_name : str
        Path of a text file containing one sample per line.
    model : object, optional
        Model forwarded to ``customize_score``. Defaults to ``xgb_model``
        (the default is bound when this cell is executed).
    score_format : str, optional
        Forwarded unchanged to ``customize_score``; defaults to ``"prob"``.

    Returns
    -------
    dict
        Maps each line (without its trailing newline) to the value returned
        by ``customize_score``. Repeated lines share a single entry.
    """
    result_dict = {}
    with open(file_name, 'r') as f:
        for line in f:
            # Drop the line terminator so that keys (and the text handed to
            # the model) do not depend on whether the line was the last one
            # in the file.
            sen = line.rstrip("\n")
            # Blank lines carry no sentence to score.
            if not sen.strip():
                continue
            cur_parse = format_doc(sen)
            result_dict[sen] = customize_score(cur_parse, model, score_format)
    return result_dict

In [7]:
def print_result(result_dict):
    """Print every sentence in ``result_dict`` followed by its politeness score.

    Entries are printed in the dictionary's iteration order, two lines per
    entry: the sentence (key) and then its score (value).
    """
    for sentence in result_dict:
        score = result_dict[sentence]
        print("current sentence: ", sentence)
        print("politeness score: ", score)

### Sample text for greetings

In [8]:
# Score each line of the greeting sample file, then print every sentence with its score.
greeting_dict = load_sentence('data_collection/greeting_sentence.txt')
print_result(greeting_dict)

current sentence:  hi this is leon calling from income ntuc. is it a good time to speak for a while?

politeness score:  {'polite': 0.92927027, 'impolite': 0.07072973}
current sentence:  ya hello good afternoon speak to leon please. afternoon mister leon my name is jimmy and im actually calling from income.

politeness score:  {'polite': 0.47284272, 'impolite': 0.5271573}
current sentence:  ya you see even just now he message me he said wah i wanna die.

politeness score:  {'polite': 0.5830819, 'impolite': 0.4169181}
current sentence:  currently we partnership with abcd. can i check with you is this a good time to speak with you right now?

politeness score:  {'polite': 0.65582114, 'impolite': 0.34417886}
current sentence:  good evening may i speak to mister leon please? my name is Leon and im calling from Income. may I speak with you for a few minutes?

politeness score:  {'polite': 0.90521586, 'impolite': 0.09478414}
current sentence:  thank you for calling income. this is leon how m

### Sample text for endings

In [9]:
# Score each line of the ending sample file, then print every sentence with its score.
ending_dict = load_sentence('data_collection/ending_sentence.txt')
print_result(ending_dict)

current sentence:  no problem i will send you the information shortly. then i can give you a follow call friday around five.

politeness score:  {'polite': 0.44172865, 'impolite': 0.55827135}
current sentence:  maybe i give you a call back. do you prefer to call around this one after lunch time.

politeness score:  {'polite': 0.4616409, 'impolite': 0.5383591}
current sentence:  okay thank you so much for your time.

politeness score:  {'polite': 0.9308717, 'impolite': 0.069128275}
current sentence:  then okay i will call again on tuesday bye bye.

politeness score:  {'polite': 0.42934066, 'impolite': 0.57065934}
current sentence:  sure I can call you back on next tuesday. thanks so much.
politeness score:  {'polite': 0.8740092, 'impolite': 0.12599081}


### Show some findings about the politeness model

1. A leading "ya" severely affects the model result: in the example below, removing it raises the polite probability from about 0.47 to about 0.75.

In [10]:
# The two inputs are identical except for the leading "ya ", so any difference
# between the two printed scores comes from that one word.
with_ya = "ya hello good afternoon speak to leon please. afternoon mister leon my name is jimmy and im actually calling from income"
without_ya = "hello good afternoon speak to leon please. afternoon mister leon my name is jimmy and im actually calling from income"
# Score both variants with the same model and output format.
print("with ya score: ", customize_score(format_doc(with_ya), xgb_model, "prob"))
print("without ya score: ", customize_score(format_doc(without_ya), xgb_model, "prob"))

with ya score:  {'polite': 0.47284272, 'impolite': 0.5271573}
without ya score:  {'polite': 0.7473273, 'impolite': 0.25267267}


2. Many of the ending sentences are scored as more impolite than the greeting sentences; see the results above under "Sample text for endings".

3. Many sentences are ambiguous between polite and impolite, and the model cannot capture that ambiguity.