In [1]:
!pip install umap-learn



In [2]:
!pip install fast_hdbscan



In [3]:
!pip install hdbscan



In [4]:
!pip install spacy



In [5]:
import pandas as pd
import random
import umap
import numpy as np
import hdbscan
import spacy
import collections

In [6]:
df = pd.read_csv("embedding.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,Text,embedding
0,0,I would really like to know the criteria which...,[-5.62713146e-02 4.19251900e-03 -5.34271896e-...
1,3,Just wondered which airline would give any sor...,[ 5.21067306e-02 1.28700314e-02 -6.05350286e-...
2,4,Do they offer lunch or should we eat outside?,[ 2.18346287e-02 1.04145192e-01 7.76744559e-...
3,5,When I have received my online ticket my surna...,[-3.91793856e-03 -6.24594428e-02 -3.84225533e-...
4,6,will we be classed as transit passengers and t...,[ 7.88998529e-02 -5.62446974e-02 -2.45206114e-...
...,...,...,...
18180,31946,Our cable is not working. Is this something yo...,[ 1.91644579e-02 -6.11143559e-02 2.03925353e-...
18181,31947,I just got the service installed on 10/31/15 I...,[-5.31504191e-02 -9.80995689e-03 6.49894178e-...
18182,31949,my email does not send or receive very well. T...,[-1.13274930e-02 -3.12204901e-02 8.03747699e-...
18183,31950,where r business locations near cass city Mich...,[ 5.42873256e-02 -7.12359697e-02 -4.47087847e-...


In [8]:
def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (umap.UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state=random_state)
                            .fit_transform(message_embeddings))

    clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters

In [9]:
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost

In [10]:
def random_search(embeddings, space, num_evals):
    """
    Randomly search hyperparameter space and limited number of times 
    and return a summary of the results
    """
    
    results = []
    
    for i in range(num_evals):
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        print(f"Evaluation {i}: n_neighbors {n_neighbors} n_components {n_components} min_cluster_size {min_cluster_size}")
        
        clusters = generate_clusters(embeddings, 
                                     n_neighbors = n_neighbors, 
                                     n_components = n_components, 
                                     min_cluster_size = min_cluster_size, 
                                     random_state = space['random_state'])
    
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
                
        results.append([i, n_neighbors, n_components, min_cluster_size, 
                        label_count, cost])
    
    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components', 
                                               'min_cluster_size', 'label_count', 'cost'])
    
    return result_df.sort_values(by='cost')

In [11]:
space = {
    "n_neighbors": range(100, 150),
    "n_components": range(2, 3),
    "min_cluster_size": range(100, 299),
    "random_state": None
}
df["embeddings_np"] = df['embedding'].apply(lambda x: 
                                   np.fromstring(
                                   x.replace('\n','')
                                    .replace('[','')
                                    .replace(']','')
                                    .replace('  ',' '), sep=' ').astype('float'))
# df["embeddings_np"] = df['embeddings_np'].apply(lambda x: np.array(x, dtype='float'))
# embeddings = [np.array(v, dtype='float') for v in embeddings]

random_use = random_search([v for v in df["embeddings_np"]], space, 10)

Evaluation 0: n_neighbors 141 n_components 2 min_cluster_size 296
Evaluation 1: n_neighbors 100 n_components 2 min_cluster_size 103
Evaluation 2: n_neighbors 121 n_components 2 min_cluster_size 119
Evaluation 3: n_neighbors 127 n_components 2 min_cluster_size 199
Evaluation 4: n_neighbors 129 n_components 2 min_cluster_size 153
Evaluation 5: n_neighbors 136 n_components 2 min_cluster_size 278
Evaluation 6: n_neighbors 101 n_components 2 min_cluster_size 216
Evaluation 7: n_neighbors 110 n_components 2 min_cluster_size 225
Evaluation 8: n_neighbors 104 n_components 2 min_cluster_size 167
Evaluation 9: n_neighbors 105 n_components 2 min_cluster_size 222


In [12]:
random_use

Unnamed: 0,run_id,n_neighbors,n_components,min_cluster_size,label_count,cost
2,2,121,2,119,9,0.079076
4,4,129,2,153,9,0.085125
5,5,136,2,278,6,0.087435
6,6,101,2,216,9,0.11669
3,3,127,2,199,9,0.124993
8,8,104,2,167,11,0.181963
7,7,110,2,225,10,0.239703
1,1,100,2,103,18,0.257795
9,9,105,2,222,13,0.280671
0,0,141,2,296,12,0.317899


In [13]:
best_clustering = random_use.iloc[0]
best_clustering

run_id                2.000000
n_neighbors         121.000000
n_components          2.000000
min_cluster_size    119.000000
label_count           9.000000
cost                  0.079076
Name: 2, dtype: float64

In [14]:
best_clusters = generate_clusters([v for v in df["embeddings_np"]],
                                   int(best_clustering['n_neighbors']),
                                   int(best_clustering['n_components']), 
                                   int(best_clustering['min_cluster_size']),
                                   random_state = None)

In [15]:
df['label'] = best_clusters.labels_
df

Unnamed: 0.1,Unnamed: 0,Text,embedding,embeddings_np,label
0,0,I would really like to know the criteria which...,[-5.62713146e-02 4.19251900e-03 -5.34271896e-...,"[-0.0562713146, 0.004192519, -0.0534271896, -0...",0
1,3,Just wondered which airline would give any sor...,[ 5.21067306e-02 1.28700314e-02 -6.05350286e-...,"[0.0521067306, 0.0128700314, -0.0605350286, 0....",13
2,4,Do they offer lunch or should we eat outside?,[ 2.18346287e-02 1.04145192e-01 7.76744559e-...,"[0.0218346287, 0.104145192, 0.0776744559, 0.02...",7
3,5,When I have received my online ticket my surna...,[-3.91793856e-03 -6.24594428e-02 -3.84225533e-...,"[-0.00391793856, -0.0624594428, -0.00384225533...",10
4,6,will we be classed as transit passengers and t...,[ 7.88998529e-02 -5.62446974e-02 -2.45206114e-...,"[0.0788998529, -0.0562446974, -0.0245206114, 0...",-1
...,...,...,...,...,...
18180,31946,Our cable is not working. Is this something yo...,[ 1.91644579e-02 -6.11143559e-02 2.03925353e-...,"[0.0191644579, -0.0611143559, 0.0203925353, -0...",4
18181,31947,I just got the service installed on 10/31/15 I...,[-5.31504191e-02 -9.80995689e-03 6.49894178e-...,"[-0.0531504191, -0.00980995689, 0.0649894178, ...",0
18182,31949,my email does not send or receive very well. T...,[-1.13274930e-02 -3.12204901e-02 8.03747699e-...,"[-0.011327493, -0.0312204901, 0.0803747699, -0...",0
18183,31950,where r business locations near cass city Mich...,[ 5.42873256e-02 -7.12359697e-02 -4.47087847e-...,"[0.0542873256, -0.0712359697, -0.0447087847, 0...",14


In [16]:
def most_common(lst, n_words):
        """
        Return most common n words in list of words

        Arguments:
            lst: list of words
            n_words: int, number of top words by frequency to return

        Returns:
            counter.most_common(n_words): a list of the n most common elements
                                          and their counts from the most
                                          common to the least

        """

        counter = collections.Counter(lst)

        return counter.most_common(n_words)

In [17]:
def extract_labels(category_docs):
    """
    Extract labels from documents in the same cluster by concatenating
    most common verbs, ojects, and nouns
    """
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading language model for the spaCy dependency parser\n"
              "(only required the first time this is run)\n")
        from spacy.cli import download
        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

    verbs = []
    dobjs = []
    nouns = []
    adjs = []
    
    verb = ''
    dobj = ''
    noun1 = ''
    noun2 = ''

    # for each document, append verbs, dobs, nouns, and adjectives to 
    # running lists for whole cluster
    for i in range(len(category_docs)):
        doc = nlp(category_docs[i])
        for token in doc:
            if token.is_stop==False:
                if token.dep_ == 'ROOT':
                    verbs.append(token.text.lower())

                elif token.dep_=='dobj':
                    dobjs.append(token.lemma_.lower())

                elif token.pos_=='NOUN':
                    nouns.append(token.lemma_.lower())
                    
                elif token.pos_=='ADJ':
                    adjs.append(token.lemma_.lower())
    
    # take most common words of each form
    if len(verbs) > 0:
        verb = most_common(verbs, 1)[0][0]
    
    if len(dobjs) > 0:
        dobj = most_common(dobjs, 1)[0][0]
    
    if len(nouns) > 0:
        noun1 = most_common(nouns, 1)[0][0]
    
    if len(set(nouns)) > 1:
        noun2 = most_common(nouns, 2)[1][0]
    
    # concatenate the most common verb-dobj-noun1-noun2 (if they exist)
    label_words = [verb, dobj]
    
    for word in [noun1, noun2]:
        if word not in label_words:
            label_words.append(word)
    
    if '' in label_words:
        label_words.remove('')
    
    label = '_'.join(label_words)
    
    return label

In [18]:
category_docs_ = {}
clusters = df['label'].unique()

for i in range(len(clusters)):
    category_docs_[clusters[i]] = df[df['label']==clusters[i]]['Text'].values

labels = {}
for key, category_docs in category_docs_.items():
    label = extract_labels(category_docs)
    print(key, label)
    labels[key] = label

0 trying_email_account
13 know_flight_airline
7 wondering_meal_flight
10 booked_ticket_passport
-1 need_ticket_flight_time
12 flying_flight_time
8 booked_seat_flight
11 print_pass_boarding
15 need_ticket_flight
9 flying_bag_luggage
1 want_mile_mileage_plan
5 need_number_phone
6 need_service_internet
14 need_train_station
4 need_channel_tv
2 need_email_cname-
3 want_bill_auto


In [19]:
print(labels)
new_dict = {}
labels_ = list(labels.keys())
labels_val = list(labels.values())
labels_df = pd.DataFrame.from_dict({"label": labels_, "labels_val": labels_val})

{0: 'trying_email_account', 13: 'know_flight_airline', 7: 'wondering_meal_flight', 10: 'booked_ticket_passport', -1: 'need_ticket_flight_time', 12: 'flying_flight_time', 8: 'booked_seat_flight', 11: 'print_pass_boarding', 15: 'need_ticket_flight', 9: 'flying_bag_luggage', 1: 'want_mile_mileage_plan', 5: 'need_number_phone', 6: 'need_service_internet', 14: 'need_train_station', 4: 'need_channel_tv', 2: 'need_email_cname-', 3: 'want_bill_auto'}


In [20]:
df_label = df.join(labels_df.set_index('label'), on='label')
df_label

Unnamed: 0.1,Unnamed: 0,Text,embedding,embeddings_np,label,labels_val
0,0,I would really like to know the criteria which...,[-5.62713146e-02 4.19251900e-03 -5.34271896e-...,"[-0.0562713146, 0.004192519, -0.0534271896, -0...",0,trying_email_account
1,3,Just wondered which airline would give any sor...,[ 5.21067306e-02 1.28700314e-02 -6.05350286e-...,"[0.0521067306, 0.0128700314, -0.0605350286, 0....",13,know_flight_airline
2,4,Do they offer lunch or should we eat outside?,[ 2.18346287e-02 1.04145192e-01 7.76744559e-...,"[0.0218346287, 0.104145192, 0.0776744559, 0.02...",7,wondering_meal_flight
3,5,When I have received my online ticket my surna...,[-3.91793856e-03 -6.24594428e-02 -3.84225533e-...,"[-0.00391793856, -0.0624594428, -0.00384225533...",10,booked_ticket_passport
4,6,will we be classed as transit passengers and t...,[ 7.88998529e-02 -5.62446974e-02 -2.45206114e-...,"[0.0788998529, -0.0562446974, -0.0245206114, 0...",-1,need_ticket_flight_time
...,...,...,...,...,...,...
18180,31946,Our cable is not working. Is this something yo...,[ 1.91644579e-02 -6.11143559e-02 2.03925353e-...,"[0.0191644579, -0.0611143559, 0.0203925353, -0...",4,need_channel_tv
18181,31947,I just got the service installed on 10/31/15 I...,[-5.31504191e-02 -9.80995689e-03 6.49894178e-...,"[-0.0531504191, -0.00980995689, 0.0649894178, ...",0,trying_email_account
18182,31949,my email does not send or receive very well. T...,[-1.13274930e-02 -3.12204901e-02 8.03747699e-...,"[-0.011327493, -0.0312204901, 0.0803747699, -0...",0,trying_email_account
18183,31950,where r business locations near cass city Mich...,[ 5.42873256e-02 -7.12359697e-02 -4.47087847e-...,"[0.0542873256, -0.0712359697, -0.0447087847, 0...",14,need_train_station


In [26]:
df_clean = df_label.drop('Unnamed: 0', axis=1).drop('embedding', axis=1)
# df_clean = df_clean.rename(columns={'Text': ' text', 'embeddings_np': 'embedding'})
# df['embedding'] = df['embedding'].apply(lambda x: 
#                                        np.fromstring(
#                                        x.replace('\n','')
#                                         .replace('[','')
#                                         .replace(']','')
#                                         .replace('  ',' '), sep=' ').astype('float'))
df_clean = df_clean.rename(columns={'Text': ' text', 'embeddings_np': 'embedding'})
df_clean

Unnamed: 0,text,embedding,label,labels_val
0,I would really like to know the criteria which...,"[-0.0562713146, 0.004192519, -0.0534271896, -0...",0,trying_email_account
1,Just wondered which airline would give any sor...,"[0.0521067306, 0.0128700314, -0.0605350286, 0....",13,know_flight_airline
2,Do they offer lunch or should we eat outside?,"[0.0218346287, 0.104145192, 0.0776744559, 0.02...",7,wondering_meal_flight
3,When I have received my online ticket my surna...,"[-0.00391793856, -0.0624594428, -0.00384225533...",10,booked_ticket_passport
4,will we be classed as transit passengers and t...,"[0.0788998529, -0.0562446974, -0.0245206114, 0...",-1,need_ticket_flight_time
...,...,...,...,...
18180,Our cable is not working. Is this something yo...,"[0.0191644579, -0.0611143559, 0.0203925353, -0...",4,need_channel_tv
18181,I just got the service installed on 10/31/15 I...,"[-0.0531504191, -0.00980995689, 0.0649894178, ...",0,trying_email_account
18182,my email does not send or receive very well. T...,"[-0.011327493, -0.0312204901, 0.0803747699, -0...",0,trying_email_account
18183,where r business locations near cass city Mich...,"[0.0542873256, -0.0712359697, -0.0447087847, 0...",14,need_train_station


In [27]:
df_clean.to_csv("test2.csv")

In [23]:
df_clean

NameError: name 'df_clean' is not defined