In [1]:
!pip install umap-learn



In [2]:
!pip install fast_hdbscan



In [3]:
!pip install hdbscan



In [4]:
import pandas as pd
import random
import umap
import numpy as np
import hdbscan

In [6]:
# Load the dataset; the columns used later in this notebook are `Text` and `embedding`.
# NOTE(review): `embedding` is read as a string (a printed NumPy vector such as
# "[-5.6e-02 4.1e-03 ...]"), not as numbers -- it is parsed into arrays in a later cell.
df = pd.read_csv("embedding.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,Text,embedding
0,0,I would really like to know the criteria which...,[-5.62713146e-02 4.19251900e-03 -5.34271896e-...
1,3,Just wondered which airline would give any sor...,[ 5.21067306e-02 1.28700314e-02 -6.05350286e-...
2,4,Do they offer lunch or should we eat outside?,[ 2.18346287e-02 1.04145192e-01 7.76744559e-...
3,5,When I have received my online ticket my surna...,[-3.91793856e-03 -6.24594428e-02 -3.84225533e-...
4,6,will we be classed as transit passengers and t...,[ 7.88998529e-02 -5.62446974e-02 -2.45206114e-...
...,...,...,...
18180,31946,Our cable is not working. Is this something yo...,[ 1.91644579e-02 -6.11143559e-02 2.03925353e-...
18181,31947,I just got the service installed on 10/31/15 I...,[-5.31504191e-02 -9.80995689e-03 6.49894178e-...
18182,31949,my email does not send or receive very well. T...,[-1.13274930e-02 -3.12204901e-02 8.03747699e-...
18183,31950,where r business locations near cass city Mich...,[ 5.42873256e-02 -7.12359697e-02 -4.47087847e-...


In [8]:
def generate_clusters(message_embeddings, n_neighbors, n_components,
                      min_cluster_size, random_state=None):
    """
    Reduce the embeddings with UMAP, then cluster the reduced points with HDBSCAN.

    Arguments:
        message_embeddings: data handed unchanged to ``umap.UMAP.fit_transform``
        n_neighbors: forwarded to ``umap.UMAP``
        n_components: forwarded to ``umap.UMAP`` (size of the reduced space)
        min_cluster_size: forwarded to ``hdbscan.HDBSCAN``
        random_state: forwarded to ``umap.UMAP``; defaults to None

    Returns:
        whatever ``hdbscan.HDBSCAN(...).fit`` returns for the reduced points
    """
    # Step 1: dimensionality reduction, using cosine distance on the raw embeddings.
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric='cosine',
        random_state=random_state,
    )
    reduced_points = reducer.fit_transform(message_embeddings)

    # Step 2: density-based clustering, using euclidean distance on the reduced points.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
    )
    return clusterer.fit(reduced_points)

In [9]:
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Summarise a fitted clustering as (label_count, cost).

    label_count is the number of distinct values in ``clusters.labels_``
    (NOTE(review): HDBSCAN presumably reports noise as its own label, which
    would be counted here too -- confirm).
    cost is the fraction of points whose entry in ``clusters.probabilities_``
    is strictly below ``prob_threshold``.
    """
    labels = clusters.labels_
    n_points = len(labels)

    # Points with low membership probability are what the cost penalises.
    n_low_confidence = np.count_nonzero(clusters.probabilities_ < prob_threshold)
    n_distinct_labels = np.unique(labels).size

    return n_distinct_labels, n_low_confidence / n_points

In [27]:
def random_search(embeddings, space, num_evals):
    """
    Randomly sample the hyperparameter space a limited number of times
    and return a summary of the results.

    Arguments:
        embeddings: data passed unchanged to ``generate_clusters`` on every run
        space: dict with sequences of candidate values under the keys
               'n_neighbors', 'n_components' and 'min_cluster_size', plus an
               optional 'random_state' forwarded to ``generate_clusters``
               (treated as None when the key is absent)
        num_evals: number of random configurations to evaluate

    Returns:
        pandas DataFrame with one row per run and the columns run_id,
        n_neighbors, n_components, min_cluster_size, label_count, cost,
        sorted by ascending cost
    """

    results = []

    for i in range(num_evals):
        # Draw each hyperparameter independently from its own candidate list.
        n_neighbors = random.choice(space['n_neighbors'])
        n_components = random.choice(space['n_components'])
        min_cluster_size = random.choice(space['min_cluster_size'])
        print(f"Evaluation {i}: n_neighbors {n_neighbors} "
              f"n_components {n_components} min_cluster_size {min_cluster_size}")

        clusters = generate_clusters(embeddings,
                                     n_neighbors = n_neighbors,
                                     n_components = n_components,
                                     min_cluster_size = min_cluster_size,
                                     random_state = space.get('random_state'))

        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)

        results.append([i, n_neighbors, n_components, min_cluster_size,
                        label_count, cost])

    result_df = pd.DataFrame(results, columns=['run_id', 'n_neighbors', 'n_components',
                                               'min_cluster_size', 'label_count', 'cost'])

    return result_df.sort_values(by='cost')

In [28]:
# Candidate values sampled by random_search; random_state is forwarded to UMAP.
space = {
    "n_neighbors": range(100, 150),
    "n_components": range(3, 10),
    "min_cluster_size": range(100, 150),
    "random_state": None
}


def _parse_embedding(text):
    """
    Turn one printed vector from the `embedding` column, e.g.
    "[ 5.2e-02 1.2e-02\n -6.0e-02]", into a 1-D float array.

    Raises ValueError on any token that is not a number, instead of
    returning a silently shortened vector.
    """
    tokens = text.replace('[', ' ').replace(']', ' ').split()
    return np.array(tokens, dtype='float')


df["embeddings_np"] = df['embedding'].apply(_parse_embedding)

# Stack the per-row vectors into one 2-D array (one row per document).
# np.vstack raises if the vectors do not all have the same length, so a
# badly parsed row is reported here rather than inside the UMAP call.
embedding_matrix = np.vstack(df["embeddings_np"].tolist())

random_use = random_search(embedding_matrix, space, 10)

SystemError: CPUDispatcher(<function nn_descent at 0x0000016E8EDA2F70>) returned a result with an error set

In [None]:
random_use

In [None]:
def extract_labels(category_docs):
    """
    Extract a label from documents in the same cluster by concatenating the
    most common verb, direct object, and up to two nouns.

    Arguments:
        category_docs: iterable of document strings belonging to one cluster

    Returns:
        the selected words joined with '_' in the order verb, direct object,
        noun1, noun2; missing parts are left out, and the result is '' when
        nothing was found

    NOTE(review): relies on `nlp` and `most_common`, neither of which is
    defined in the visible part of this notebook. `nlp` is presumably a spaCy
    pipeline (tokens expose is_stop, dep_, pos_, text, lemma_) and
    `most_common(words, n)` presumably returns the n most frequent
    (word, count) pairs, most frequent first -- confirm both.
    """

    verbs = []
    dobjs = []
    nouns = []

    # For each document, collect the non-stop-word root tokens, direct
    # objects and nouns into running lists for the whole cluster.
    # Iterating directly (rather than by position) also works for containers
    # that are not indexed 0..n-1.
    for text in category_docs:
        for token in nlp(text):
            if token.is_stop:
                continue

            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())

    # Take the most common word of each form; '' marks "not found".
    verb = most_common(verbs, 1)[0][0] if verbs else ''
    dobj = most_common(dobjs, 1)[0][0] if dobjs else ''
    noun1 = most_common(nouns, 1)[0][0] if nouns else ''
    # A second noun only exists when at least two distinct nouns were seen.
    noun2 = most_common(nouns, 2)[1][0] if len(set(nouns)) > 1 else ''

    # Concatenate verb-dobj-noun1-noun2, skipping nouns already used.
    label_words = [verb, dobj]
    for word in [noun1, noun2]:
        if word not in label_words:
            label_words.append(word)

    # Drop every missing part, not just the first one, so the label never
    # starts with a stray '_' when both verb and direct object are absent.
    label_words = [word for word in label_words if word]

    return '_'.join(label_words)

In [None]:
random_use