In [1]:
!pip install umap-learn



In [2]:
!pip install fast_hdbscan



In [3]:
!pip install hdbscan



In [4]:
!pip install spacy



In [5]:
import pandas as pd
import random
import umap
import numpy as np
import hdbscan
import spacy
import collections

In [6]:
df = pd.read_csv("banking_embedding.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,text,category,embedding
0,0,I am still waiting on my card?,card_arrival,[-3.53524163e-02 -4.21367399e-02 -2.75710248e-...
1,1,What can I do if my card still hasn't arrived ...,card_arrival,[ 2.25937311e-02 -1.35310916e-02 2.42646467e-...
2,2,I have been waiting over a week. Is the card s...,card_arrival,[-4.60113622e-02 -1.99005213e-02 -1.47550728e-...
3,3,Can I track my card while it is in the process...,card_arrival,[-8.46180320e-03 -1.86706576e-02 -3.45075577e-...
4,4,"How do I know if I will get my card, or if it ...",card_arrival,[-2.35078353e-02 -1.60232875e-02 8.56959447e-...
...,...,...,...,...
9998,9998,You provide support in what countries?,country_support,[-4.46434831e-03 1.89972986e-02 3.30669992e-...
9999,9999,What countries are you supporting?,country_support,[ 1.93479229e-02 -2.52135955e-02 2.79338993e-...
10000,10000,What countries are getting support?,country_support,[ 7.70830084e-03 -7.47324750e-02 4.63454723e-...
10001,10001,Are cards available in the EU?,country_support,[ 7.93154293e-04 2.72115413e-02 -4.71094325e-...


In [8]:
def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (umap.UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state=random_state)
                            .fit_transform(message_embeddings))

    clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters

In [9]:
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost

In [10]:
def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in range(num_evals):
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        print(f"Evaluation {i}: n_neighbors {n_neighbors} n_components {n_components} min_cluster_size {min_cluster_size}")
        
        clusters = generate_clusters(embeddings, 
                                     n_neighbors = n_neighbors, 
                                     n_components = n_components, 
                                     min_cluster_size = min_cluster_size, 
                                     random_state = space['random_state'])
    
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, 
                        label_count, cost])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'label_count', 'cost'])
    
    return result_df.sort_values(by='cost')

In [11]:
space = {
    "n_neighbors": range(100, 150),
    "n_components": range(2, 3),
    "min_cluster_size": range(100, 300),
    "random_state": None
}
df["embeddings_np"] = df['embedding'].apply(lambda x: 
                                   np.fromstring(
                                   x.replace('\n','')
                                    .replace('[','')
                                    .replace(']','')
                                    .replace('  ',' '), sep=' ').astype('float'))
# df["embeddings_np"] = df['embeddings_np'].apply(lambda x: np.array(x, dtype='float'))
# embeddings = [np.array(v, dtype='float') for v in embeddings]

random_use = random_search([v for v in df["embeddings_np"]], space, 10)

Evaluation 0: n_neighbors 140 n_components 2 min_cluster_size 267
Evaluation 1: n_neighbors 106 n_components 2 min_cluster_size 162
Evaluation 2: n_neighbors 149 n_components 2 min_cluster_size 203
Evaluation 3: n_neighbors 127 n_components 2 min_cluster_size 230
Evaluation 4: n_neighbors 119 n_components 2 min_cluster_size 294
Evaluation 5: n_neighbors 102 n_components 2 min_cluster_size 268
Evaluation 6: n_neighbors 123 n_components 2 min_cluster_size 158
Evaluation 7: n_neighbors 146 n_components 2 min_cluster_size 139
Evaluation 8: n_neighbors 118 n_components 2 min_cluster_size 286
Evaluation 9: n_neighbors 143 n_components 2 min_cluster_size 245


In [12]:
random_use

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost
1,1,106,2,162,8,0.044887
3,3,127,2,230,7,0.046386
6,6,123,2,158,8,0.053684
4,4,119,2,294,6,0.06748
7,7,146,2,139,9,0.081576
5,5,102,2,268,8,0.10037
2,2,149,2,203,8,0.104569
8,8,118,2,286,8,0.124363
0,0,140,2,267,8,0.126262
9,9,143,2,245,9,0.139958


In [13]:
best_clustering = random_use.iloc[0]
best_clustering

run_id                1.000000
n_neighbors         106.000000
n_components          2.000000
min_cluster_size    162.000000
label_count           8.000000
cost                  0.044887
Name: 1, dtype: float64

In [14]:
best_clusters = generate_clusters([v for v in df["embeddings_np"]],
                                   int(best_clustering['n_neighbors']),
                                   int(best_clustering['n_components']), 
                                   int(best_clustering['min_cluster_size']),
                                   random_state = None)

In [15]:
df['label'] = best_clusters.labels_
df

Unnamed: 0.1,Unnamed: 0,text,category,embedding,embeddings_np,label
0,0,I am still waiting on my card?,card_arrival,[-3.53524163e-02 -4.21367399e-02 -2.75710248e-...,"[-0.0353524163, -0.0421367399, -0.00275710248,...",7
1,1,What can I do if my card still hasn't arrived ...,card_arrival,[ 2.25937311e-02 -1.35310916e-02 2.42646467e-...,"[0.0225937311, -0.0135310916, 0.0242646467, 0....",7
2,2,I have been waiting over a week. Is the card s...,card_arrival,[-4.60113622e-02 -1.99005213e-02 -1.47550728e-...,"[-0.0460113622, -0.0199005213, -0.00147550728,...",7
3,3,Can I track my card while it is in the process...,card_arrival,[-8.46180320e-03 -1.86706576e-02 -3.45075577e-...,"[-0.0084618032, -0.0186706576, -0.0345075577, ...",7
4,4,"How do I know if I will get my card, or if it ...",card_arrival,[-2.35078353e-02 -1.60232875e-02 8.56959447e-...,"[-0.0235078353, -0.0160232875, 0.00856959447, ...",7
...,...,...,...,...,...,...
9998,9998,You provide support in what countries?,country_support,[-4.46434831e-03 1.89972986e-02 3.30669992e-...,"[-0.00446434831, 0.0189972986, 0.0330669992, -...",7
9999,9999,What countries are you supporting?,country_support,[ 1.93479229e-02 -2.52135955e-02 2.79338993e-...,"[0.0193479229, -0.0252135955, 0.0279338993, -0...",7
10000,10000,What countries are getting support?,country_support,[ 7.70830084e-03 -7.47324750e-02 4.63454723e-...,"[0.00770830084, -0.074732475, 0.0463454723, -0...",7
10001,10001,Are cards available in the EU?,country_support,[ 7.93154293e-04 2.72115413e-02 -4.71094325e-...,"[0.000793154293, 0.0272115413, -0.0471094325, ...",7


In [16]:
def most_common(lst, n_words):
        """
        Return most common n words in list of words

        Arguments:
            lst: list of words
            n_words: int, number of top words by frequency to return

        Returns:
            counter.most_common(n_words): a list of the n most common elements
                                          and their counts from the most
                                          common to the least

        """

        counter = collections.Counter(lst)

        return counter.most_common(n_words)

In [17]:
def extract_labels(category_docs):
    """
    Extract labels from documents in the same cluster by concatenating
    most common verbs, ojects, and nouns
    """
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading language model for the spaCy dependency parser\n"
              "(only required the first time this is run)\n")
        from spacy.cli import download
        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

    verbs = []
    dobjs = []
    nouns = []
    adjs = []
    
    verb = ''
    dobj = ''
    noun1 = ''
    noun2 = ''

    # for each document, append verbs, dobs, nouns, and adjectives to 
    # running lists for whole cluster
    for i in range(len(category_docs)):
        doc = nlp(category_docs[i])
        for token in doc:
            if token.is_stop==False:
                if token.dep_ == 'ROOT':
                    verbs.append(token.text.lower())

                elif token.dep_=='dobj':
                    dobjs.append(token.lemma_.lower())

                elif token.pos_=='NOUN':
                    nouns.append(token.lemma_.lower())
                    
                elif token.pos_=='ADJ':
                    adjs.append(token.lemma_.lower())
    
    # take most common words of each form
    if len(verbs) > 0:
        verb = most_common(verbs, 1)[0][0]
    
    if len(dobjs) > 0:
        dobj = most_common(dobjs, 1)[0][0]
    
    if len(nouns) > 0:
        noun1 = most_common(nouns, 1)[0][0]
    
    if len(set(nouns)) > 1:
        noun2 = most_common(nouns, 2)[1][0]
    
    # concatenate the most common verb-dobj-noun1-noun2 (if they exist)
    label_words = [verb, dobj]
    
    for word in [noun1, noun2]:
        if word not in label_words:
            label_words.append(word)
    
    if '' in label_words:
        label_words.remove('')
    
    label = '_'.join(label_words)
    
    return label

In [19]:
category_docs_ = {}
clusters = df['label'].unique()

for i in range(len(clusters)):
    try:
        category_docs_[clusters[i]] = df[df['label']==clusters[i]]['text'].values
    except:
        category_docs_[clusters[i]] = df[df['label']==clusters[i]]['Text'].values

labels = {}
for key, category_docs in category_docs_.items():
    label = extract_labels(category_docs)
    print(key, label)
    labels[key] = label

7 charged_card_payment
1 need_currency_exchange_rate
6 tried_money_transfer_account
-1 need_account_card
5 topped_account_auto_money
4 change_pin_time
0 tell_card_limit
3 want_refund_statement
2 need_identity_verification


In [20]:
print(labels)
new_dict = {}
labels_ = list(labels.keys())
labels_val = list(labels.values())
labels_df = pd.DataFrame.from_dict({"label": labels_, "labels_val": labels_val})

{7: 'charged_card_payment', 1: 'need_currency_exchange_rate', 6: 'tried_money_transfer_account', -1: 'need_account_card', 5: 'topped_account_auto_money', 4: 'change_pin_time', 0: 'tell_card_limit', 3: 'want_refund_statement', 2: 'need_identity_verification'}


In [21]:
df_label = df.join(labels_df.set_index('label'), on='label')
df_label

Unnamed: 0.1,Unnamed: 0,text,category,embedding,embeddings_np,label,labels_val
0,0,I am still waiting on my card?,card_arrival,[-3.53524163e-02 -4.21367399e-02 -2.75710248e-...,"[-0.0353524163, -0.0421367399, -0.00275710248,...",7,charged_card_payment
1,1,What can I do if my card still hasn't arrived ...,card_arrival,[ 2.25937311e-02 -1.35310916e-02 2.42646467e-...,"[0.0225937311, -0.0135310916, 0.0242646467, 0....",7,charged_card_payment
2,2,I have been waiting over a week. Is the card s...,card_arrival,[-4.60113622e-02 -1.99005213e-02 -1.47550728e-...,"[-0.0460113622, -0.0199005213, -0.00147550728,...",7,charged_card_payment
3,3,Can I track my card while it is in the process...,card_arrival,[-8.46180320e-03 -1.86706576e-02 -3.45075577e-...,"[-0.0084618032, -0.0186706576, -0.0345075577, ...",7,charged_card_payment
4,4,"How do I know if I will get my card, or if it ...",card_arrival,[-2.35078353e-02 -1.60232875e-02 8.56959447e-...,"[-0.0235078353, -0.0160232875, 0.00856959447, ...",7,charged_card_payment
...,...,...,...,...,...,...,...
9998,9998,You provide support in what countries?,country_support,[-4.46434831e-03 1.89972986e-02 3.30669992e-...,"[-0.00446434831, 0.0189972986, 0.0330669992, -...",7,charged_card_payment
9999,9999,What countries are you supporting?,country_support,[ 1.93479229e-02 -2.52135955e-02 2.79338993e-...,"[0.0193479229, -0.0252135955, 0.0279338993, -0...",7,charged_card_payment
10000,10000,What countries are getting support?,country_support,[ 7.70830084e-03 -7.47324750e-02 4.63454723e-...,"[0.00770830084, -0.074732475, 0.0463454723, -0...",7,charged_card_payment
10001,10001,Are cards available in the EU?,country_support,[ 7.93154293e-04 2.72115413e-02 -4.71094325e-...,"[0.000793154293, 0.0272115413, -0.0471094325, ...",7,charged_card_payment


In [22]:
df_clean = df_label.drop('Unnamed: 0', axis=1).drop('embedding', axis=1)
# df_clean = df_clean.rename(columns={'Text': ' text', 'embeddings_np': 'embedding'})
# df['embedding'] = df['embedding'].apply(lambda x: 
#                                        np.fromstring(
#                                        x.replace('\n','')
#                                         .replace('[','')
#                                         .replace(']','')
#                                         .replace('  ',' '), sep=' ').astype('float'))
df_clean = df_clean.rename(columns={'Text': ' text', 'embeddings_np': 'embedding'})
df_clean['embedding'] = df_clean['embedding'].apply(lambda x: list(x))
df_clean

Unnamed: 0,text,category,embedding,label,labels_val
0,I am still waiting on my card?,card_arrival,"[-0.0353524163, -0.0421367399, -0.00275710248,...",7,charged_card_payment
1,What can I do if my card still hasn't arrived ...,card_arrival,"[0.0225937311, -0.0135310916, 0.0242646467, 0....",7,charged_card_payment
2,I have been waiting over a week. Is the card s...,card_arrival,"[-0.0460113622, -0.0199005213, -0.00147550728,...",7,charged_card_payment
3,Can I track my card while it is in the process...,card_arrival,"[-0.0084618032, -0.0186706576, -0.0345075577, ...",7,charged_card_payment
4,"How do I know if I will get my card, or if it ...",card_arrival,"[-0.0235078353, -0.0160232875, 0.00856959447, ...",7,charged_card_payment
...,...,...,...,...,...
9998,You provide support in what countries?,country_support,"[-0.00446434831, 0.0189972986, 0.0330669992, -...",7,charged_card_payment
9999,What countries are you supporting?,country_support,"[0.0193479229, -0.0252135955, 0.0279338993, -0...",7,charged_card_payment
10000,What countries are getting support?,country_support,"[0.00770830084, -0.074732475, 0.0463454723, -0...",7,charged_card_payment
10001,Are cards available in the EU?,country_support,"[0.000793154293, 0.0272115413, -0.0471094325, ...",7,charged_card_payment


In [23]:
df_clean.to_csv("banking.csv")

In [24]:
df_clean

Unnamed: 0,text,category,embedding,label,labels_val
0,I am still waiting on my card?,card_arrival,"[-0.0353524163, -0.0421367399, -0.00275710248,...",7,charged_card_payment
1,What can I do if my card still hasn't arrived ...,card_arrival,"[0.0225937311, -0.0135310916, 0.0242646467, 0....",7,charged_card_payment
2,I have been waiting over a week. Is the card s...,card_arrival,"[-0.0460113622, -0.0199005213, -0.00147550728,...",7,charged_card_payment
3,Can I track my card while it is in the process...,card_arrival,"[-0.0084618032, -0.0186706576, -0.0345075577, ...",7,charged_card_payment
4,"How do I know if I will get my card, or if it ...",card_arrival,"[-0.0235078353, -0.0160232875, 0.00856959447, ...",7,charged_card_payment
...,...,...,...,...,...
9998,You provide support in what countries?,country_support,"[-0.00446434831, 0.0189972986, 0.0330669992, -...",7,charged_card_payment
9999,What countries are you supporting?,country_support,"[0.0193479229, -0.0252135955, 0.0279338993, -0...",7,charged_card_payment
10000,What countries are getting support?,country_support,"[0.00770830084, -0.074732475, 0.0463454723, -0...",7,charged_card_payment
10001,Are cards available in the EU?,country_support,"[0.000793154293, 0.0272115413, -0.0471094325, ...",7,charged_card_payment
