In [1]:
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
import texthero as hero
from gensim.models import CoherenceModel

In [2]:
# Load the preprocessed publication table (the file name suggests MathSciNet
# records) from the data folder.
folder = "../data/mgp_data/"
publication = pd.read_csv(folder + "final_mathscinet_publs_preprocessed.csv")

# Derive the modelling text in one pass: texthero cleaning first, stemming
# second. The raw "title" column is left untouched.
publication["clean_title"] = hero.stem(hero.clean(publication["title"]))

In [48]:
# Dimensions (rows, columns) of the loaded frame, including the added
# "clean_title" column.
publication.shape

(3140415, 7)

In [69]:
# Draw a random subset of publications to model.
# The sample is seeded so the same rows (and therefore the same dictionary,
# clusters and coherence scores downstream) are obtained on every run.
N_SAMPLES = 1000000   # rows to draw; lower this for quick experiments
SAMPLE_SEED = 42      # fixed seed for a reproducible subset

publication1 = publication.sample(
    n=min(N_SAMPLES, len(publication)),  # never ask for more rows than exist
    random_state=SAMPLE_SEED,
)
# Plain Python list of the cleaned/stemmed titles, in sampled order.
titles = publication1["clean_title"].values.tolist()

In [70]:
def title_to_words(titles):
    '''
    Lazily tokenize an iterable of titles.

    titles: iterable of titles; each item is passed through str() first, so
        non-string values are tokenized from their string form

    Yields one list of tokens per title, in input order, as produced by
    gensim.utils.simple_preprocess with deacc=True.
    '''
    yield from (
        gensim.utils.simple_preprocess(str(raw_title), deacc=True)
        for raw_title in titles
    )

In [71]:
# Materialize the generator into a list of token lists: titles1 is iterated
# several times below (dictionary construction, bag-of-words conversion and
# every GSDMM fit), which a one-shot generator would not allow.
titles1 = list(title_to_words(titles))

In [72]:
# Build the gensim dictionary (token <-> integer id mapping) from every
# tokenized title.
dictionary = gensim.corpora.Dictionary(titles1)

# Pruning of very rare / very frequent tokens is currently disabled.
# NOTE(review): with this line commented out, vocab_length below counts every
# distinct token in the sample - confirm that is intended before re-enabling.
#dictionary.filter_extremes(no_below=5, no_above=0.5)

# Number of distinct tokens in the dictionary; passed to gsdmm.fit() as the
# vocabulary size.
vocab_length = len(dictionary)

In [73]:
# Display the vocabulary size computed from the unpruned dictionary.
vocab_length

106186

In [74]:
def get_topics_lists(model, top_clusters, n_words):
    '''
    Collect the highest-valued words of the selected clusters.

    model: gsdmm instance; only its cluster_word_distribution attribute is
        read (indexed by cluster, each entry a mapping of word -> value)
    top_clusters: iterable of cluster indices, processed in the given order
    n_words: number of leading words to keep per cluster

    Returns a list with one list of words per entry of top_clusters. Words
    are ordered from highest to lowest value; words with equal values keep
    the mapping's own iteration order. A cluster with no words yields an
    empty list.
    '''
    def _leading_words(word_values):
        # Stable descending sort on the value, then keep the word only.
        ordered = sorted(word_values.items(), key=lambda pair: pair[1], reverse=True)
        return [word for word, _value in ordered[:n_words]]

    return [
        _leading_words(model.cluster_word_distribution[cluster])
        for cluster in top_clusters
    ]

In [75]:
# num_topic = 5  (disabled: the number of clusters is swept by the loop in the next cell)

In [None]:
# Sweep the number of GSDMM clusters K and record the c_v topic coherence of
# each fit. Results are collected in the parallel lists num_topics / cv_score.
GSDMM_SEED = 42    # fixed seed so each fit can be reproduced
N_TOP_WORDS = 10   # words per topic handed to the coherence model

# The bag-of-words corpus does not depend on K, so build it once here instead
# of rebuilding it over every document on each iteration.
bow_corpus = [dictionary.doc2bow(doc) for doc in titles1]

cv_score = []
num_topics = []
for num_topic in range(2, 12, 2):
    # NOTE(review): gsdmm appears to sample through numpy's global RNG, so
    # re-seeding before every fit should make each K reproducible on its own
    # - confirm against the installed gsdmm version.
    np.random.seed(GSDMM_SEED)

    gsdmm = MovieGroupProcess(K=num_topic, alpha=0.1, beta=0.3, n_iters=10)
    # y holds the value returned by fit(); it is inspected in a later cell.
    y = gsdmm.fit(titles1, vocab_length)

    # Rank clusters by document count (largest first) and keep only the
    # populated ones: an empty cluster has no words, so it would contribute
    # an empty topic to the coherence computation.
    doc_count = np.array(gsdmm.cluster_doc_count)
    ranked = doc_count.argsort()[::-1]
    top_index = ranked[doc_count[ranked] > 0]

    topics = get_topics_lists(gsdmm, top_index, N_TOP_WORDS)
    cm_gsdmm = CoherenceModel(topics=topics,
                              dictionary=dictionary,
                              corpus=bow_corpus,
                              texts=titles1,
                              coherence='c_v')

    # get coherence value
    coherence_gsdmm = cm_gsdmm.get_coherence()

    print(coherence_gsdmm)
    num_topics.append(num_topic)
    cv_score.append(coherence_gsdmm)

In stage 0: transferred 487453 clusters with 2 clusters populated
In stage 1: transferred 449236 clusters with 2 clusters populated
In stage 2: transferred 321339 clusters with 2 clusters populated
In stage 3: transferred 141819 clusters with 2 clusters populated
In stage 4: transferred 101026 clusters with 2 clusters populated
In stage 5: transferred 94756 clusters with 2 clusters populated
In stage 6: transferred 93815 clusters with 2 clusters populated
In stage 7: transferred 93856 clusters with 2 clusters populated
In stage 8: transferred 93606 clusters with 2 clusters populated
In stage 9: transferred 93471 clusters with 2 clusters populated
0.10915769171818648
In stage 0: transferred 735606 clusters with 4 clusters populated
In stage 1: transferred 678888 clusters with 4 clusters populated
In stage 2: transferred 470570 clusters with 4 clusters populated
In stage 3: transferred 221928 clusters with 4 clusters populated
In stage 4: transferred 156920 clusters with 4 clusters popul

In [68]:
# Length of the value returned by the most recent gsdmm.fit() call.
# NOTE(review): the output shown below (100000) does not match the current
# sample size of 1000000 - it looks stale from an earlier run; re-execute.
len(y)

100000

In [65]:
# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    '''
    Print the highest-valued (word, value) pairs of each selected cluster.

    cluster_word_distribution: sequence/mapping indexed by cluster, each
        entry a mapping of word -> value
    top_cluster: iterable of cluster indices, printed in the given order
    values: number of leading (word, value) pairs to print per cluster

    Prints one "Cluster <index> : [...]" line per cluster, preceded by a
    blank line; returns None.
    '''
    def _value_of(pair):
        return pair[1]

    for cluster in top_cluster:
        ranked_pairs = list(cluster_word_distribution[cluster].items())
        # Stable descending sort: equal values keep their original order.
        ranked_pairs.sort(key=_value_of, reverse=True)
        print("\nCluster %s : %s" % (cluster, ranked_pairs[:values]))

# Show the 10 highest-valued words of every cluster listed in top_index,
# using the most recently fitted gsdmm model.
top_words(
    cluster_word_distribution=gsdmm.cluster_word_distribution,
    top_cluster=top_index,
    values=10,
)


Cluster 2 : [('graph', 2207), ('problem', 1953), ('algorithm', 1790), ('method', 1425), ('optim', 1173), ('equat', 1081), ('comput', 1027), ('linear', 1025), ('function', 1014), ('approxim', 997)]

Cluster 1 : [('system', 2881), ('base', 2680), ('model', 2191), ('data', 1699), ('use', 1341), ('comput', 1315), ('design', 1131), ('servic', 1130), ('applic', 1019), ('analysi', 987)]

Cluster 6 : [('network', 4330), ('base', 2112), ('system', 1993), ('wireless', 1608), ('channel', 1321), ('effici', 1186), ('optim', 1115), ('use', 947), ('perform', 945), ('code', 923)]

Cluster 9 : [('learn', 2587), ('base', 2285), ('network', 2061), ('use', 1702), ('model', 1629), ('data', 1151), ('detect', 1133), ('neural', 977), ('recognit', 950), ('imag', 879)]

Cluster 8 : [('control', 1875), ('system', 1841), ('optim', 1408), ('base', 1391), ('model', 1292), ('time', 1115), ('network', 1093), ('algorithm', 871), ('dynam', 772), ('use', 705)]

Cluster 3 : [('base', 2120), ('imag', 2000), ('use', 1714)

In [24]:
# Total number of documents across all clusters of the last fit.
# NOTE(review): the output shown below (1000) does not match the current
# sample size of 1000000 - it looks stale from an earlier run; re-execute.
sum(doc_count)

1000

In [25]:
# Cluster indices of the last fit, ordered from most to fewest documents.
top_index

array([0, 1, 3, 4, 2])

In [28]:
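# evaluate model using Topic Coherence score
# NOTE(review): the code that produced the value below is missing from this cell; only its output remains.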


0.2406536495851057


In [65]:
# Look up the integer id the dictionary assigned to the token "workshop".
dictionary.token2id["workshop"]

36