# Bookmark Clustering

## Import Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import seaborn as sns
from time import time
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
df = pd.read_pickle('Data/df_website_content.pkl')

In [None]:
df = df.drop_duplicates()

In [None]:
df = df[df['text'] != 'empty']

In [None]:
df = df.sample(frac=1)

In [None]:
df.reset_index(drop=True, inplace=True)

## K-Means

K-Means is used to establish the number of topics ‘k’ that will be passed to LDA.  The number will be determined by increasing k until the highest-weighted words in each topic reach the desired level of homogeneity.  Purity will also be assessed based on the content of the documents that are closest to the cluster centers.

In [None]:
# Build TF-IDF features from the page text: English stop words are removed and
# terms are filtered with min_df=2 / max_df=.95.
# Original note: sklearn KMeans centers the data, but normalization is done
# explicitly here (see the next cell).
tf_idf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=.95)
tf_idf = tf_idf_vectorizer.fit_transform(df['text'])

In [None]:
# Normalize each row of the TF-IDF matrix before clustering.
# NOTE(review): TfidfVectorizer's documented default is norm='l2'; if that
# holds for the installed version this step leaves the rows unchanged --
# confirm and drop it if redundant.
tf_idf = normalize(tf_idf)

In [None]:
def build_models(data, cluster_lst, n_iter, jobs=4):
    """Fit one KMeans model per requested cluster count.

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix passed to ``KMeans.fit_transform``.
    cluster_lst : iterable of int
        The values of k to fit.
    n_iter : int
        Forwarded to KMeans as ``n_init``.
    jobs : int, optional
        Forwarded to KMeans as ``n_jobs``.
        NOTE(review): newer scikit-learn releases no longer accept ``n_jobs``
        on KMeans -- confirm against the installed version.

    Returns
    -------
    dict
        ``{k: {'model': fitted KMeans, 'distances': fit_transform output}}``.
    """
    model_dict = {}
    for k in cluster_lst:
        model = KMeans(n_clusters=k, n_init=n_iter, n_jobs=jobs, random_state=123)
        # Fit on the ``data`` argument; the global ``tf_idf`` was used here
        # before, which silently ignored whatever the caller passed in.
        distances = model.fit_transform(data)
        model_dict[k] = {'model': model, 'distances': distances}
    return model_dict

In [None]:
k_lst = np.arange(5, 120, 20)
k_lst

In [None]:
start = time()
models_dict = build_models(tf_idf, k_lst, 15, 4)
end = time()
print end - start

In [None]:
h_lst = [abs(models_dict[k]['model'].score(tf_idf)) for k in k_lst]

In [None]:
def plot_k_vs_heterogeneity(k_values, heterogeneity_values):
    """Plot clustering heterogeneity against the number of clusters k.

    Parameters
    ----------
    k_values : sequence
        Cluster counts, used as x values.
    heterogeneity_values : sequence
        One heterogeneity value per entry of ``k_values``, used as y values.

    Side effects
    ------------
    Creates a new 7x4 figure and sets the global matplotlib ``font.size`` to
    16, which also applies to figures drawn afterwards.
    """
    # Set the font size before any label/title is created so it applies to
    # this figure too, independent of when matplotlib resolves text sizes.
    plt.rcParams.update({'font.size': 16})
    plt.figure(figsize=(7,4))
    plt.plot(k_values, heterogeneity_values, linewidth=4)
    plt.xlabel('K')
    plt.ylabel('Heterogeneity')
    plt.title('K vs. Heterogeneity')
    plt.tight_layout()

In [None]:
# Plot heterogeneity against k for the models fitted above.
plot_k_vs_heterogeneity(k_lst, h_lst)

In [None]:
def plt_cluster_counts(k_val, models, ticks=True, logscale=False):
    """Bar-plot how many documents were assigned to each cluster.

    Parameters
    ----------
    k_val : int
        Cluster count; selects ``models[k_val]['model']`` and fixes the number
        of bars.
    models : dict
        Mapping of k to a dict holding a fitted model under ``'model'``; the
        model's ``labels_`` attribute is read.
    ticks : bool, optional
        When False the x tick labels are hidden.
    logscale : bool, optional
        When True the y axis uses a log scale.
    """
    labels = models[k_val]['model'].labels_
    # minlength guarantees one count per cluster: without it bincount returns
    # a shorter array when the highest-numbered clusters are empty, and the
    # x/y lengths would no longer match.
    counts = np.bincount(labels, minlength=k_val)
    g = sns.barplot(x=np.arange(k_val), y=counts)
    if not ticks:
        g.set(xticklabels=[])
    if logscale:
        # Scale the axes the bars were drawn on (barplot returns them) rather
        # than whichever axes happens to be first in the figure.
        g.set_yscale('log')
    # Show the figure in both modes; previously it was skipped for logscale.
    plt.show()

Most of the bookmarks are data science, pure math, or finance.  Three major cluster groups make sense in this context.

In [None]:
# Cluster-size distribution for k = 5, 25 and 45 (tick labels hidden for 45).
plt_cluster_counts(5, models_dict)

In [None]:
plt_cluster_counts(25, models_dict)

In [None]:
plt_cluster_counts(45, models_dict, ticks=False)

### Identifying topic themes by top weighted words and doc distance to centroid

In [None]:
# Invert the vectorizer's term -> column-index vocabulary so that a column
# index can be mapped back to its term.  .items() works on both Python 2 and
# Python 3, whereas iteritems() exists only on Python 2.
indx_to_word = {v: k for k, v in tf_idf_vectorizer.vocabulary_.items()}

In [None]:
def print_cluster_words(df, models, word_map, k, n_words, n_docs=5, display_content=False):
    """Print the heaviest words of every cluster and, optionally, the opening
    words of the documents closest to each cluster centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Documents; the ``'text'`` column is read.  Rows must be in the same
        order as the matrix the model was fitted on.
    models : dict
        Mapping of k to ``{'model': fitted model, 'distances': matrix}``.
        ``cluster_centers_`` and ``labels_`` are read from the model;
        ``distances`` is indexed as [document, cluster].
    word_map : dict
        Maps a feature column index to its term.
    k : int
        Which entry of ``models`` to describe.
    n_words : int
        Number of top-weighted words printed per cluster.
    n_docs : int, optional
        Number of nearest documents previewed per cluster.
    display_content : bool, optional
        When True, preview the ``n_docs`` documents nearest each centre.

    Raises
    ------
    KeyError
        If ``models`` has no entry for ``k``.
    """
    if k not in models:
        raise KeyError('no model was built for k={0}'.format(k))

    # Use the ``models`` argument throughout; the preview branch used to read
    # the global ``models_dict`` and so ignored the argument.
    model = models[k]['model']
    distances = models[k]['distances']
    centroids = model.cluster_centers_

    for c in range(len(centroids)):
        # Column indices of this centroid, heaviest weight first.  Slicing
        # (instead of indexing up to n_words) tolerates n_words > vocabulary.
        top = centroids[c].argsort()[::-1][:n_words]
        # Unicode literals keep non-ASCII terms printable on Python 2 and 3.
        words = [u'{0}:{1:.3f}'.format(word_map[j], centroids[c, j]) for j in top]
        print(u' '.join([u'Cluster {0:d}    '.format(c)] + words))

        if display_content:
            in_cluster = model.labels_ == c
            # Rank the cluster's members by distance to their own centre.
            nearest_first = distances[in_cluster][:, c].argsort()
            cluster_text = df['text'][in_cluster].reset_index(drop=True)
            nearest_text = cluster_text.iloc[nearest_first]

            if len(cluster_text) >= n_docs:
                for doc in nearest_text.iloc[:n_docs]:
                    # First 25 whitespace-separated tokens, shown on two lines
                    # of at most 90 characters.  Slicing text rather than
                    # encoded bytes never cuts a multi-byte character in half.
                    text = u' '.join(doc.split(None, 25)[:25])
                    print(u'\n* {0}\n  {1}'.format(text[:90], text[90:180]))
            else:
                print("not enough docs in group")
        print('==========================================================')

In [None]:
# k=25: top 6 words per cluster plus a preview of the 8 documents nearest
# each centre.
print_cluster_words(df, models_dict, word_map=indx_to_word, k=25, n_words=6, n_docs=8, display_content=True)

In [None]:
# k=45: top 6 words per cluster only (no document preview).
print_cluster_words(df, models_dict, indx_to_word, 45, 6)

With a cluster count of 45, many of the clusters are pure in content.  However, some clusters share similar themes but are split into separate clusters.  A cluster count between 25 and 45 will be used for LDA.

## LDA

Not all topics have a clear theme.

Things to try:
- set max_features in the CountVectorizer (this seemed to help)
- use graphlab
- modify alpha and beta in LDA
- reduce the number of topics

In [None]:
cv1 = CountVectorizer(stop_words='english', token_pattern=r'(?u)\b[a-z\'A-Z-]{2,}\b', min_df=2, max_df=.95, max_features=1000)

In [None]:
tf1 = cv1.fit_transform(df['text'])

In [None]:
lda_1 = LatentDirichletAllocation(n_topics=25, max_iter=50, random_state=0)

In [None]:
t0 = time()
lda_1.fit(tf1)
print "done in {0}".format(time()-t0)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, a 2-D array with one row of term
        weights per topic.
    feature_names : sequence of str
        Term for each column of ``components_``.
    n_top_words : int
        Number of terms printed per topic, heaviest first.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print("Topic #%d:" % topic_idx)
        # argsort is ascending; the reversed slice takes the last
        # n_top_words indices, heaviest first.
        heaviest = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in heaviest]))

In [None]:
# The feature names must come from cv1, the vectorizer that produced the
# matrix lda_1 was fitted on (the name ``cv`` is not defined in this notebook).
print_top_words(lda_1, cv1.get_feature_names(), 10)

In [None]:
def plot_word_rank(model, n_top_words):
    """Plot, for every topic, the weights of its heaviest words by rank.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, a 2-D array with one row of term
        weights per topic.
    n_top_words : int
        How many of the heaviest words to plot per topic.

    One line is drawn per topic on the current matplotlib axes.
    """
    for topic in model.components_:
        # Weights of this topic's heaviest terms, in descending order.
        heaviest = topic.argsort()[:-n_top_words - 1:-1]
        top_weights = topic[heaviest]
        # x follows the number of weights actually available, so asking for
        # more words than the vocabulary holds does not break the plot.
        plt.plot(range(len(top_weights)), top_weights)
    plt.xlabel('Word rank')
    plt.ylabel('Word weight')
    # The title reports the requested count rather than a fixed "100".
    plt.title('Word Weight of Top {0} Words in each Topic'.format(n_top_words))

In [None]:
# Weight-by-rank curves for the 100 heaviest words of each LDA topic.
plot_word_rank(lda_1, 100)

In [None]:
def plot_top_words(model, n_top_words):
    """Bar-plot the share of each topic's total weight held by its top words.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, a 2-D array with one row of term
        weights per topic.
    n_top_words : int
        Number of heaviest words whose weights are summed per topic.

    For every topic the bar height is the sum of its ``n_top_words`` largest
    weights divided by the sum of all its weights.
    """
    topic_sums = np.sum(model.components_, axis=1)
    top_share = []
    for topic, total in zip(model.components_, topic_sums):
        heaviest = topic.argsort()[:-n_top_words - 1:-1]
        top_share.append(np.sum(topic[heaviest]) / total)

    g = sns.barplot(x=np.arange(len(topic_sums)), y=top_share)
    g.set(xticklabels=[])
    # The title reports the requested count rather than a fixed "10".
    plt.title('Total Probability of Top {0} Words in each Topic'.format(n_top_words))
    plt.show()

For many of the topics, the top 10 words carry most of the weight.

In [None]:
# Share of each topic's weight carried by its 10 heaviest words.
plot_top_words(lda_1, 10)

In [None]:
# Inspect a single document (row 1) and its text.
df.iloc[1]

In [None]:
df.loc[1, 'text']

In [None]:
# Topic indices for document 1, ordered by ascending value of the LDA
# transform output (highest-valued topic last).
np.argsort(lda_1.transform(tf1[1]))

In [None]:
# The five highest-valued topics for document 0 (ascending, best last).
np.argsort(lda_1.transform(tf1[0]))[0,-5:]

In [None]:
# Boolean mask of the documents whose highest-valued topic is topic 43.
# NOTE(review): lda_1 is created with n_topics=25 in this notebook, so argmax
# over its output ranges over 0..24 and this mask would be all False --
# presumably left over from a run with more topics; confirm the intended
# topic index.
filter_43 = np.argmax(lda_1.transform(tf1), axis=1) == 43

In [None]:
df_43 = df[filter_43]

In [None]:
# Document-topic matrix for the whole corpus.
# NOTE(review): the same transform is also computed for filter_43 above;
# the two could share one call.
lda_mat = lda_1.transform(tf1)

In [None]:
# Per-document topic ranking (ascending) for the selected documents.
np.argsort(lda_mat[filter_43], axis=1)

In [None]:
# NOTE(review): .loc looks up index label 2, which exists only if row 2 of df
# is among the selected documents -- df_43.iloc[...] may be what is wanted.
df_43.loc[2,'text']