In [36]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
import seaborn as sns
# Activate seaborn's default theme settings (called with no arguments).
sns.set()

In [38]:
# Load only the first 1000 rows of the CSV; its first column is used as the index.
ori_data = pd.read_csv(
    'data/data_twitter.csv',
    index_col=0,
    nrows=1000,
)

# Keep an independent copy restricted to the columns of interest, so later
# edits to `data` never write back into `ori_data`.
useful_cols = ['text', 'hashtags']
data = ori_data.loc[:, useful_cols].copy()

In [39]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline; a custom component is registered on it further down.
pipeline = spacy.load('en_core_web_sm')

import re
from spacy.language import Language

# http://emailregex.com/
# NOTE: the pattern only lists lowercase letters, so it must be applied
# case-insensitively (see the "email" rule below) to catch addresses such as
# "John.Doe@Example.COM".
email_re = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# replace = [ (pattern-to-replace, replacement),  ...]
# The rules are applied strictly in this order: later rules rely on the
# placeholders ("link", "numbr") that earlier rules insert.
replace = [
    (r"<a[^>]*>(.*?)</a>", r"\1"),  # Unwrap HTML anchor tags, keeping only the link text
    # Scoped inline flag keeps `email_re` itself untouched while matching any letter case.
    ("(?i:" + email_re + ")", "email"),  # Replace e-mail addresses with the placeholder "email"
    (r"[a-zA-Z]*http[a-zA-Z]*", "link"),  # Replace any word containing "http" with the placeholder "link"
    (r"(?<=\d),(?=\d)", ""),        # Remove commas in numbers
    (r"\d+", "numbr"),              # Map digit runs to the placeholder "numbr"

    # `\w` (instead of letters only) so tags/handles containing "_" or the
    # "numbr" placeholder are removed whole rather than leaving fragments behind.
    (r"\#\w*", ""),  # Remove hashtags
    (r"\@\w*", ""),  # Remove @mentions
    (r"[\t\n\r\*\.\@\,\-\/]", " "),   # Punctuation and other junk

    (r"[a-zA-Z]*numbr[a-zA-Z]*", ""),  # Drop the "numbr" placeholder and any word containing it
    (r"[a-zA-Z]*link[a-zA-Z]*", ""),   # Drop the "link" placeholder and any word containing it

    (r"\W", " "),                     # Any remaining non-word character (not alphanumeric/underscore) becomes a space

    (r"\b[\w]\b", ""),              # Remove single-character tokens
    (r"\s+", " ")                   # Collapse runs of whitespace into one space
]

# Start from the raw tweet texts and run every substitution rule over the
# whole collection, one rule at a time and in the order they are listed.
data_text = data['text']

for pattern, replacement in replace:
    cleaned = []
    for text in data_text:
        cleaned.append(re.sub(pattern, replacement, text))
    data_text = cleaned

@Language.component("dbpedia_14")
def dbpedia_14_preprocess(doc):
    """Reduce a parsed document to a string of lowercase lemmas.

    Stop words and punctuation tokens are skipped; each remaining token
    contributes its lemma, lowercased and stripped of surrounding whitespace.
    Lemmas that end up empty are dropped.

    Args:
        doc: iterable of tokens exposing ``is_stop``, ``is_punct`` and ``lemma_``.

    Returns:
        str: the kept lemmas joined by single spaces.

    NOTE(review): this component returns a ``str`` rather than the ``doc`` it
    received; newer spaCy releases may reject non-Doc component output --
    confirm against the installed spaCy version.
    """
    lemmas = []
    for tok in doc:
        if tok.is_stop or tok.is_punct:
            continue
        lemma = tok.lemma_.lower().strip()
        if lemma:
            lemmas.append(lemma)
    return " ".join(lemmas)

# Add the component registered under the name "dbpedia_14" to the loaded pipeline.
# The trailing semicolon keeps the notebook from echoing the call's return value.
pipeline.add_pipe("dbpedia_14");

In [40]:
# Run every cleaned text through the spaCy pipeline. The results are joined
# with " " below, so each pipeline call is expected to produce a string.
data_text = list(map(pipeline, data_text))

# Number of distinct space-separated tokens in the corpus, used as the
# feature cap for both vectorizers.
vocab_size = len(set(" ".join(data_text).split(" ")))

bow_featurizer = CountVectorizer(
    stop_words='english',
    min_df=0.005,
    max_df=0.95,
    max_features=vocab_size,
)
tfidf_featurizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    max_features=vocab_size,
)

# Fit each vectorizer on the corpus and build the document-term matrices.
X_bow = bow_featurizer.fit_transform(data_text)
X_tfidf = tfidf_featurizer.fit_transform(data_text)

# Invert the TF-IDF vocabulary (word -> column index) into column index -> word.
idx2word = dict(
    (idx, word) for word, idx in tfidf_featurizer.vocabulary_.items()
)

In [41]:
# plot function
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_top_words(model, feature_names, n_top_words):
    """Build an interactive bar chart of the highest-weighted words per topic.

    One bar trace is created for every row of ``model.components_`` (one per
    topic), however many there are. Only the first topic is visible initially;
    a button per topic switches the visible trace. Each button is labelled
    with that topic's top-weighted word.

    Args:
        model: fitted object exposing ``components_``, an iterable of 1-D
            weight arrays (one per topic) supporting ``argsort`` and
            index-array selection.
        feature_names: mapping or sequence translating a feature index into
            its word.
        n_top_words: number of highest-weighted words to show per topic.

    Returns:
        The Plotly figure holding one bar trace per topic and the toggle buttons.
    """
    # Collect the top words and their weights for each topic, highest weight first.
    topic_words = []
    topic_weights = []
    for topic in model.components_:
        top_features_ind = topic.argsort()[::-1][:n_top_words]
        topic_words.append([feature_names[i] for i in top_features_ind])
        topic_weights.append(topic[top_features_ind])

    n_topics = len(topic_words)
    fig = make_subplots(rows=1, cols=1)

    buttons = []
    for topic_idx, (words, weights) in enumerate(zip(topic_words, topic_weights)):
        # All traces share the single subplot; only the first starts visible.
        fig.add_trace(
            go.Bar(x=words,
                   y=weights,
                   name="bar",
                   visible=(topic_idx == 0)), 1, 1)
        # The button shows exactly this topic's trace and hides all others.
        visibility = [other == topic_idx for other in range(n_topics)]
        # Fall back to a generic label when no words were requested (n_top_words == 0).
        label = words[0] if words else "topic {}".format(topic_idx)
        buttons.append(dict(method='update',
                            args=[{"visible": visibility}],
                            label=label))

    fig.update_layout(width=1000, height=500,
                      updatemenus=[dict(type='buttons',
                                        buttons=buttons)])

    return fig

In [42]:
# Topic-model settings: K is the number of topics to extract (five) and
# n_top_words is how many top-weighted keywords are shown per topic (ten).
K = 5
n_top_words = 10

In [43]:
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into K components; random initialisation with a
# fixed seed so repeated runs start from the same state.
model_NMF = NMF(init='random', n_components=K, random_state=0).fit(X_tfidf)

# Last expression of the cell, so the returned figure is rendered as its output.
plot_top_words(model=model_NMF, feature_names=idx2word, n_top_words=n_top_words)