Get example text

In [1]:
# Read the whole sample document into memory. The context manager closes the
# file handle as soon as the read finishes (the original left it open).
with open("test_doc.txt", "r") as f:
    sam1 = f.read()

sam1

"Vanity is an interesting word. If you look up vanity in the dictionary, you'll discover that it can mean both excessive pride and something that is empty, futile, or without value. It's intriguing to think that we can be proud of something that matters very little. But this does happen sometimes, especially when it comes to business metrics. In fact, those of us in business intelligence have a term for this phenomenon: vanity metrics. Vanity metrics are data points that are intended to impress others but are not indicative of actual performance and therefore cannot reveal any meaningful business insights. A well-known vanity metric is the number of people following a company on social media. Maybe there are hundreds of thousands of followers but how many of them are actually making a purchase, how many of them refer other customers to the site, and how much revenue do they actually generate for the business? Showing off a number just because it's big, rarely accomplishes much. And tha

Text preprocessing

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

# spaCy pipelines that have already been loaded, keyed by model name, so that
# repeated calls to topic_mod_text_preprocess do not reload the model from disk.
_NLP_PIPELINES = {}


def _get_nlp(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it only once."""
    if model_name not in _NLP_PIPELINES:
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def topic_mod_text_preprocess(text, lemmatization=True, return_list=True,
                              model_name='en_core_web_sm'):
    """Tokenize ``text`` with spaCy, dropping stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text to process.
    lemmatization : bool, default True
        If True, keep each token's lemma (``token.lemma_``); otherwise keep
        its surface form (``token.text``).
    return_list : bool, default True
        If True, return the tokens as a list; otherwise return them joined
        into a single space-separated string.
    model_name : str, default 'en_core_web_sm'
        Name of the spaCy pipeline to use. It is loaded on first use and
        reused on later calls.

    Returns
    -------
    list of str or str
        The kept tokens, in document order.
    """
    # Doc object carrying the per-token attributes used below
    doc = _get_nlp(model_name)(text)

    # Tokens are kept with their original casing; no lowercasing is applied.
    tokens = [
        token.lemma_ if lemmatization else token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    return tokens if return_list else " ".join(tokens)
    
# Lemmatized tokens of the sample document, with stop words and punctuation removed
tokens = topic_mod_text_preprocess(text=sam1, lemmatization=True, return_list=True)
tokens

['Vanity',
 'interesting',
 'word',
 'look',
 'vanity',
 'dictionary',
 'discover',
 'mean',
 'excessive',
 'pride',
 'futile',
 'value',
 'intriguing',
 'think',
 'proud',
 'matter',
 'little',
 'happen',
 'especially',
 'come',
 'business',
 'metric',
 'fact',
 'business',
 'intelligence',
 'term',
 'phenomenon',
 'vanity',
 'metric',
 'vanity',
 'metric',
 'datum',
 'point',
 'intend',
 'impress',
 'indicative',
 'actual',
 'performance',
 'reveal',
 'meaningful',
 'business',
 'insight',
 'know',
 'vanity',
 'metric',
 'number',
 'people',
 'follow',
 'company',
 'social',
 'medium',
 'maybe',
 'hundred',
 'thousand',
 'follower',
 'actually',
 'make',
 'purchase',
 'refer',
 'customer',
 'site',
 'revenue',
 'actually',
 'generate',
 'business',
 'show',
 'number',
 'big',
 'rarely',
 'accomplish',
 'critical',
 'ensure',
 'metric',
 'monitor',
 'productive',
 'informative',
 'effective',
 'example',
 'useful',
 'business',
 'metric',
 'include',
 'restaurant',
 'customer',
 'loya

Embedding words

In [3]:
from gensim.models import KeyedVectors
from os import path
import numpy as np

# Absolute path of the word2vec binary, resolved against the current working directory
model_path = path.abspath('models/GoogleNews-vectors-negative300.bin')
# Load the vectors from the binary word2vec format into a KeyedVectors lookup
embed = KeyedVectors.load_word2vec_format(fname=model_path, binary=True)

In [4]:
# get_vector raises KeyError for tokens missing from the embedding vocabulary,
# so drop those first. `tokens` is rebound (rather than filtered into a new
# name) so it stays row-aligned with `vecs` for later cells that pair the two.
tokens = [token for token in tokens if token in embed]
# reshape keeps the result 2-D (n_tokens, vector_size) even if no token survives
vecs = np.array([embed.get_vector(token) for token in tokens]).reshape(-1, embed.vector_size)

In [5]:
# (number of embedded tokens, embedding dimension)
np.shape(vecs)

(296, 300)

Visualization

In [27]:
import plotly.express as px
import umap

# Dimensionality reduction with UMAP down to its default 2 components.
# UMAP is stochastic; fixing random_state makes the layout reproducible across
# runs (at the cost of UMAP running single-threaded).
reducer = umap.UMAP(n_neighbors=10, min_dist=0.5, random_state=42)
red = reducer.fit_transform(vecs)

In [28]:
# 2-D scatter of the UMAP projection; hovering a point shows its token.
# No `color` column is supplied, so the previously passed
# color_continuous_scale had no effect and is omitted.
fig = px.scatter(
    x=red[:, 0],
    y=red[:, 1],
    hover_name=tokens,
)
fig.update_layout(
    title='UMAP projection of word2vec token embeddings',
    xaxis_title='UMAP 1',
    yaxis_title='UMAP 2',
    dragmode='pan',
    height=600,
    template='plotly',
)
fig.show()