In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.insert(0, '/Users/leon/Income/python files/politeness_code')
# Load customized packages
from helper.customized_pos import *
from helper.pos_helper import *
from helper.politeness_helper import *
from helper.sentiment_helper import *
from helper.lexicons import *

### Input transcripts

In [3]:
# Read the greeting and ending transcript files into lists of lines
# (each element keeps its trailing newline).
with open("../data_collection/greeting_transcripts.txt") as transcript_file:
    greeting_transcripts_ls = list(transcript_file)
with open("../data_collection/ending_transcripts.txt") as transcript_file:
    ending_transcripts_ls = list(transcript_file)

In [4]:
# Show both raw transcript lists; the extra newlines in `end` reproduce the
# blank separator lines between the two dumps.
print("greeting transcripts", greeting_transcripts_ls, end="\n\n\n")
print("ending transcripts", ending_transcripts_ls)

greeting transcripts ['ya hello good afternoon speak to nanny seah please afternoon miss nanny my name is jaguar shao and im actually calling from insurance company miss nanny good time to speak for a while this is regarding our partnership charter_plus members and as a result of that insurance company actually formulated a very special thirtieth anniversay insurance bundle right called the i thirty\n', 'hello good afternoon just speak to miss leon michael from income ntuc free for one_or_two minutes if you are not busy okay calling behalf of your adviser xiao guo okay because we having this anniversary plan for the family i just check again you are single_or_married\n']


ending transcripts ['You just reconfirm with your husband whether you already have an enhanced home insurance or not if already have then we can not cover for you oh ok I will call you back tomorrow thank you bye\n', 'So maybe if your friends or relatives or family members are interested you can call back at this num

### Load sentence transformer model

In [5]:
# Instantiate the sentence-embedding model from the
# 'sentence-transformers/all-MiniLM-L6-v2' identifier. Every later cell passes
# this object as the `model` argument.
# NOTE(review): presumably this fetches the weights on first use — confirm
# caching behaviour before relying on offline runs.
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [44]:
# NOTE(review): `torch` is not referenced by any cell visible in this notebook,
# and this cell's execution count (44) is out of sequence with its neighbours —
# confirm the import is needed and that the notebook survives Restart & Run All.
import torch
# Write the loaded model to the relative path '../../model_collection'.
sentence_model.save('../../model_collection')

### Encode sentences

In [6]:
# Embed all greeting transcripts in a single `encode` call; the printed length
# is the number of embeddings returned (one per transcript line).
greeting_encode = sentence_model.encode(greeting_transcripts_ls)
print(len(greeting_encode))

2


In [7]:
# Embed all ending transcripts in a single `encode` call; the printed length
# is the number of embeddings returned (one per transcript line).
ending_encode = sentence_model.encode(ending_transcripts_ls)
print(len(ending_encode))

2


In [8]:
# Display the greeting embeddings as the cell output (bare last expression).
# NOTE(review): this dumps the full array; consider showing only its shape or a slice.
greeting_encode

array([[-7.41549432e-02,  3.94416004e-02,  2.13413015e-02,
        -6.92394152e-02, -1.45531585e-02,  3.00129838e-02,
         1.50267273e-01,  5.60768368e-03, -4.80575487e-03,
        -2.07905266e-02,  5.15650250e-02, -1.47423018e-02,
         4.33658399e-02, -1.66828502e-02, -1.48129824e-03,
        -2.28996798e-02,  3.97074670e-02, -2.10626703e-02,
        -9.32612047e-02, -3.11144232e-03, -6.20206036e-02,
         5.37114032e-02,  5.55090047e-03,  5.22897637e-04,
        -9.04180035e-02, -1.14847431e-02, -3.60804088e-02,
         4.57358360e-02, -1.85318310e-02, -4.77426723e-02,
         2.75681354e-02,  3.55365239e-02,  7.70426542e-02,
        -2.97547281e-02,  4.53562997e-02,  1.30115291e-02,
        -7.21666077e-03, -1.78030394e-02, -7.94330686e-02,
        -5.75921163e-02, -8.98533966e-03, -8.10112283e-02,
        -8.00338238e-02, -1.28195519e-02, -3.43606435e-02,
        -1.90115348e-02,  3.15257022e-03,  9.09183733e-03,
         5.99541478e-02,  7.46275624e-03, -4.88756448e-0

### Calculate sentence similarity

In [9]:
def construct_sentence_vector(sentence, model):
    """Embed ``sentence`` with ``model``.

    Args:
        sentence: Text passed unchanged to ``model.encode``.
        model: Any object exposing an ``encode(sentence)`` method.

    Returns:
        Whatever ``model.encode(sentence)`` returns.
    """
    embedding = model.encode(sentence)
    return embedding

In [10]:
def construct_dim_vector(descriptive_words, model):
    """Build one vector for a lexicon aspect by averaging its phrase embeddings.

    Args:
        descriptive_words: Iterable of phrases describing the aspect.
        model: Embedding model forwarded to ``construct_sentence_vector``.

    Returns:
        The element-wise mean (along axis 0) of the per-phrase embeddings.
    """
    phrase_vectors = [
        construct_sentence_vector(phrase, model) for phrase in descriptive_words
    ]
    stacked = np.array(phrase_vectors)
    return stacked.mean(axis=0)

In [11]:
def euclidian_distance(X, Y):
    """Return the Euclidean (L2) distance between ``X`` and ``Y``.

    Computed as the square root of the sum of squared element-wise differences.
    """
    squared_diff = np.power(X - Y, 2)
    return np.sqrt(np.sum(squared_diff))

In [12]:
def cosine_sim(X, Y):
    """Return the cosine similarity between vectors ``X`` and ``Y``.

    Args:
        X: First vector (array-like accepted by ``np.dot`` / ``np.linalg.norm``).
        Y: Second vector, same length as ``X``.

    Returns:
        ``dot(X, Y) / (||X|| * ||Y||)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of ``nan`` so that
        callers comparing against a threshold treat it as "no similarity".
    """
    denominator = np.linalg.norm(X) * np.linalg.norm(Y)
    if denominator == 0:
        # Avoid division by zero: a nan would compare False against any
        # threshold and could be mistaken for a match downstream.
        return 0.0
    return np.dot(X, Y) / denominator

In [23]:
def match_category(input_sentence, model, lexicon_type = "greeting",
                  similarity_threshold = 0.4):
    """Label a sentence with the most similar aspect of a lexicon.

    The sentence embedding is compared (cosine similarity) with one averaged
    vector per aspect of the selected lexicon; the aspect with the highest
    similarity wins, provided it reaches ``similarity_threshold``.

    Args:
        input_sentence: Sentence to classify.
        model: Embedding model forwarded to the vector-construction helpers.
        lexicon_type: ``"greeting"`` selects ``greeting_lexicons``, ``"ending"``
            selects ``ending_lexicons``. Both are expected to map an aspect
            name to an iterable of descriptive phrases.
        similarity_threshold: Minimum similarity required to accept a match.

    Returns:
        ``[label, input_sentence]`` where ``label`` is the best-matching aspect,
        or ``"no_matching"`` when no aspect reaches the threshold (or the
        lexicon is empty). Always a list, so both outcomes have the same type.

    Raises:
        ValueError: If ``lexicon_type`` is neither ``"greeting"`` nor ``"ending"``.
    """
    if lexicon_type == "greeting":
        dic = greeting_lexicons
    elif lexicon_type == "ending":
        dic = ending_lexicons
    else:
        # Previously fell through and failed later on an unbound local.
        raise ValueError(
            "lexicon_type must be 'greeting' or 'ending', got {!r}".format(lexicon_type)
        )

    new_vector = construct_sentence_vector(input_sentence, model)
    classes = []
    similarity_ls = []
    for aspect, descriptive_words in dic.items():
        cur_vector = construct_dim_vector(descriptive_words, model)
        classes.append(aspect)
        similarity_ls.append(cosine_sim(new_vector, cur_vector))

    # An empty lexicon has nothing to match against; max() would raise.
    if not similarity_ls:
        return ["no_matching", input_sentence]

    max_similarity = max(similarity_ls)
    if max_similarity < similarity_threshold:
        return ["no_matching", input_sentence]
    # index() picks the first aspect when several share the maximum.
    return [classes[similarity_ls.index(max_similarity)], input_sentence]

In [24]:
def batch_match_category(sentence_ls, model, lexicon_type = "greeting"):
    """Run ``match_category`` over every sentence, preserving input order.

    Args:
        sentence_ls: Iterable of sentences to label.
        model: Embedding model forwarded to ``match_category``.
        lexicon_type: Lexicon selector forwarded to ``match_category``.

    Returns:
        A list with one ``match_category`` result per input sentence.
    """
    return [match_category(text, model, lexicon_type) for text in sentence_ls]

In [25]:
def nlp_aspect_matching(sentence_ls, model, lexicon_type = "greeting", bool_group = True):
    """Label each sentence and optionally merge neighbours with the same label.

    Args:
        sentence_ls: Sentences to label, in conversational order.
        model: Embedding model forwarded to ``batch_match_category``.
        lexicon_type: Lexicon selector forwarded to ``batch_match_category``.
        bool_group: When True, contiguous sentences that received the same
            label are merged into one entry via ``cluster_category``.

    Returns:
        The per-sentence results of ``batch_match_category`` when grouping is
        disabled or there are fewer than two results; otherwise the grouped
        list returned by ``cluster_category``.
    """
    # Generate a label for each sentence.
    result_ls = batch_match_category(sentence_ls, model, lexicon_type)
    # Nothing to merge for zero or one result. The empty case must be caught
    # here because cluster_category reads the first element unconditionally.
    if not bool_group or len(result_ls) <= 1:
        return result_ls
    return cluster_category(result_ls)

In [26]:
def cluster_category(match_result_ls):
    """Merge contiguous sentences that share the same category label.

    Args:
        match_result_ls: Sequence of ``(label, sentence)`` pairs (lists or
            tuples), in order.

    Returns:
        A list of ``[label, text]`` lists where each ``text`` is the
        space-joined run of consecutive sentences carrying ``label``.
        An empty input yields an empty list. The final run is appended only
        if its text is non-empty.
    """
    # Empty input: nothing to cluster (previously raised IndexError).
    if not match_result_ls:
        return []

    temp_label = match_result_ls[0][0]
    sen_store = match_result_ls[0][1]
    new_result_ls = []
    for entry in match_result_ls[1:]:
        cur_label, cur_sen = entry[0], entry[1]
        if cur_label == temp_label:
            # Same label as the running group: extend it.
            sen_store += " " + cur_sen
        else:
            # Label changed: flush the finished group and start a new one.
            new_result_ls.append([temp_label, sen_store])
            sen_store = cur_sen
            temp_label = cur_label
    if sen_store:
        new_result_ls.append([temp_label, sen_store])
    return new_result_ls

In [28]:
# test above function from mapping sentence to each category
greeting_ls = ['ya hello good afternoon speak to nanny seah', 'please afternoon miss nanny my name is jaguar shao and', 'i m actually calling from insurance company miss nanny good time to speak for a while this is regarding our partnership charter_plus members and', 'as a result of that insurance company actually formulated a very special thirtieth anniversay insurance bundle right called the i thirty \n']
category_ls = nlp_aspect_matching(greeting_ls, sentence_model, "greeting", True)
print(category_ls)

[['opening', 'ya hello good afternoon speak to nanny seah'], ['no_matching', 'please afternoon miss nanny my name is jaguar shao and'], ['purpose_of_call', 'i m actually calling from insurance company miss nanny good time to speak for a while this is regarding our partnership charter_plus members and as a result of that insurance company actually formulated a very special thirtieth anniversay insurance bundle right called the i thirty \n']]


In [30]:
# Second greeting example, pre-split into three segments.
greeting_ls2 = [
    'hello good afternoon just speak to miss leon michael from income ntuc free for one_or_two minutes if you are not busy',
    'okay calling behalf of your adviser xiao guo',
    'okay because we having this anniversary plan for the family i just check again you are single_or_married \n',
]

In [31]:
# Label and group the second greeting example.
category_ls2 = nlp_aspect_matching(
    greeting_ls2, sentence_model, lexicon_type="greeting", bool_group=True
)
print(category_ls2)

[['opening', 'hello good afternoon just speak to miss leon michael from income ntuc free for one_or_two minutes if you are not busy'], ['no_matching', 'okay calling behalf of your adviser xiao guo'], ['purpose_of_call', 'okay because we having this anniversary plan for the family i just check again you are single_or_married \n']]


In [36]:
# First ending example, pre-split into three segments.
ending_ls = [
    'You just reconfirm with your husband whether you already have an enhanced home insurance or',
    'not if already have then we can not cover for you oh ok I will call you back tomorrow',
    'thank you bye \n',
]

In [37]:
# Label and group the first ending example against the ending lexicon.
category_ls3 = nlp_aspect_matching(
    ending_ls, sentence_model, lexicon_type="ending", bool_group=True
)
print(category_ls3)

[['no_matching', 'You just reconfirm with your husband whether you already have an enhanced home insurance or'], ['follow-up', 'not if already have then we can not cover for you oh ok I will call you back tomorrow'], ['closing', 'thank you bye \n']]


In [38]:
# Second ending example, pre-split into three segments.
ending_ls2 = [
    'So maybe if your friends or relatives or family members are interested you',
    'can call back at this number lah this number you',
    'can see from your phone lah thank you bye bye \n',
]

In [39]:
# Label and group the second ending example against the ending lexicon.
category_ls4 = nlp_aspect_matching(
    ending_ls2, sentence_model, lexicon_type="ending", bool_group=True
)
print(category_ls4)

[['no_matching', 'So maybe if your friends or relatives or family members are interested you'], ['follow-up', 'can call back at this number lah this number you'], ['closing', 'can see from your phone lah thank you bye bye \n']]
