### STTM (Short Text Topic Modeling) with GSDMM for finding trending fashions

In [1]:
# Fetch the GSDMM source tree into ./gsdmm so that the `gsdmm.gsdmm` import
# in the imports cell can be resolved.
# NOTE(review): the clone is not pinned to a commit or tag — consider pinning for reproducibility.
!git clone https://github.com/rwalk/gsdmm.git

Cloning into 'gsdmm'...
remote: Enumerating objects: 62, done.[K
remote: Total 62 (delta 0), reused 0 (delta 0), pack-reused 62[K
Unpacking objects: 100% (62/62), 15.67 KiB | 763.00 KiB/s, done.


In [2]:
import numpy as np
import pandas as pd
import pickle
import re
import gsdmm.gsdmm
from gsdmm.gsdmm import MovieGroupProcess
from tqdm import tqdm
import gensim
from gensim.models import CoherenceModel

In [3]:
# docs = df[].to_numpy()
def dataPrep(df):
    """Tokenise the lemmatised text column and build the gensim artefacts.

    Parameters
    ----------
    df : DataFrame with a 'LemmatizeText' column. Every cell is passed through
        ``str()`` and split on whitespace.

    Returns
    -------
    tuple ``(tokens, dictionary, bow_corpus, vocab_length)`` where
        tokens       -- Series of token lists, one per row of ``df``
        dictionary   -- ``gensim.corpora.Dictionary`` built from ``tokens`` and
                        then pruned with ``filter_extremes``
        bow_corpus   -- list of ``dictionary.doc2bow(doc)`` results, one per document
        vocab_length -- ``len(dictionary)`` after pruning
    """
    tokenized = df['LemmatizeText'].apply(lambda text: str(text).split())

    id2word = gensim.corpora.Dictionary(tokenized)
    # Pruning happens in place, before the bag-of-words corpus is derived.
    id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bags = [id2word.doc2bow(document) for document in tokenized]
    return tokenized, id2word, bags, len(id2word)

In [4]:
def runModel(token,i):
    """Fit a GSDMM ``MovieGroupProcess`` on the given documents and pickle it.

    Parameters
    ----------
    token : iterable of token lists (one list of words per document).
    i : value interpolated into the output filename ``chunk{i}_STTM.sav``.

    Returns
    -------
    The fitted ``MovieGroupProcess`` instance. The same object is also
    pickled to ``chunk{i}_STTM.sav`` in the current working directory.
    """
    mgp = MovieGroupProcess(alpha=0.1, beta=0.3, n_iters=10, K=10)

    # Use the documents that were passed in. The previous version read a
    # global named `tokens`, so the result depended on notebook state
    # rather than on the argument.
    vocab = {term for doc in token for term in doc}
    n_terms = len(vocab)
    mgp.fit(token, n_terms)

    # Context manager guarantees the file handle is flushed and closed.
    with open(f"chunk{i}_STTM.sav", 'wb') as model_file:
        pickle.dump(mgp, model_file)
    return mgp

In [5]:

# helper functions
def top_words(cluster_word_distribution, top_cluster, values):
    '''Print the highest-count words of each requested cluster.

    cluster_word_distribution: indexable collection where entry ``c`` is a
        mapping of word -> count for cluster ``c``.
    top_cluster: iterable of cluster indices to print.
    values: maximum number of (word, count) pairs to print per cluster.

    Returns None; output goes to stdout.
    '''
    for cluster in top_cluster:
        # Read from the argument. The previous version ignored it and used
        # the global `mgp`, so it failed (or printed the wrong model) whenever
        # that global was absent or stale.
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s' % (cluster, sort_dicts))
        print(' — — — — — — — — —')


def cluster_importance(mgp):
    '''Return a word-topic matrix ``phi`` as a list of dicts, one per cluster.

    ``phi[z][w]`` is the importance of word ``w`` in cluster ``z``:

        (n_z_w[z][w] + beta) / (sum of all counts in cluster z + V * beta)

    where ``n_z_w`` is ``mgp.cluster_word_distribution``, ``beta`` is
    ``mgp.beta`` and ``V`` is ``mgp.vocab_size``. Clusters with no words
    yield an empty dict.
    '''
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for _ in range(K)]
    for z in range(K):
        word_counts = n_z_w[z]
        # The denominator is the same for every word in the cluster, so it is
        # computed once instead of re-summing all counts for each word.
        denominator = sum(word_counts.values()) + V * beta
        for w, count in word_counts.items():
            phi[z][w] = (count + beta) / denominator
    return phi


def topic_allocation(df, docs, mgp, topic_dict):
    '''Label every document with its best cluster and that cluster's name.

    Calls ``mgp.choose_best_label`` on each entry of ``docs`` and writes the
    results into ``df`` in place as two columns:
      - 'cluster': the label returned for the document
      - 'topic_name': the ``topic_dict`` entry for that label

    ``docs`` must contain one entry per row of ``df``. Returns None and
    prints a completion message with the number of rows.
    '''
    # choose_best_label returns a (label, score) pair; only the label is kept.
    labels = [label for label, _score in map(mgp.choose_best_label, tqdm(docs))]

    df['cluster'] = labels
    df['topic_name'] = df['cluster'].apply(lambda label: get_topic_name(label, topic_dict))

    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))


def get_topic_name(doc, topic_dict):
    '''Look up and return the entry stored under key ``doc`` in ``topic_dict``.

    Raises whatever ``topic_dict[doc]`` raises for an unknown key
    (``KeyError`` for a plain dict).
    '''
    return topic_dict[doc]

In [6]:
# import library from gensim  

# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Collect the top words of each requested cluster as a list of lists.

    model: object exposing ``cluster_word_distribution``, indexable by cluster,
        where each entry maps word -> count
    top_clusters: iterable of cluster indices, processed in the given order
    n_words: maximum number of words to keep per cluster

    Returns one list of words per cluster, ordered by descending count.
    '''
    def ranked_words(cluster):
        # Highest counts first; ties keep the mapping's own iteration order.
        by_count = sorted(
            model.cluster_word_distribution[cluster].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [word for word, _count in by_count[:n_words]]

    return [ranked_words(cluster) for cluster in top_clusters]

In [7]:

def findCoherence(mgp,tokens,id2word,bow_corpus,top_clusters=None,n_words=10):
    """Compute four gensim coherence scores for the topics of a fitted model.

    Parameters
    ----------
    mgp : fitted model exposing ``cluster_word_distribution`` and
        ``cluster_doc_count``.
    tokens : tokenised documents, passed to ``CoherenceModel`` as ``texts``.
    id2word : dictionary passed to ``CoherenceModel`` as ``dictionary``.
    bow_corpus : bag-of-words corpus passed to ``CoherenceModel`` as ``corpus``.
    top_clusters : optional iterable of cluster indices to score. When omitted,
        the (up to) ten clusters with the most documents are used, ordered
        from most to least populated.
    n_words : number of top words taken from each cluster (default 10).

    Returns
    -------
    dict with keys "c_v", "u_mass", "c_npmi" and "c_uci" mapping to the
    corresponding ``get_coherence()`` value.
    """
    if top_clusters is None:
        # Derive the cluster ranking from the model itself. The previous
        # version read a global `top_index` that is only defined in a later
        # cell, so the function depended on notebook execution order.
        top_clusters = np.argsort(np.asarray(mgp.cluster_doc_count))[-10:][::-1]

    topics = get_topics_lists(mgp, top_clusters, n_words)

    scores = {}
    for measure in ("c_v", "u_mass", "c_npmi", "c_uci"):
        coherence_model = CoherenceModel(
            topics=topics,
            texts=tokens,
            corpus=bow_corpus,
            dictionary=id2word,
            coherence=measure,
        )
        scores[measure] = coherence_model.get_coherence()
    return scores

In [8]:
# Read the cleaned tweet dataset from its Kaggle input location.
print("chunk2==============================================================================")
df = pd.read_csv("/kaggle/input/cleaneddata/2915461.csv")
# Optional second chunk, currently disabled:
# df1 = pd.read_csv(f"/kaggle/input/ukrainerussiachunks/chunk2.csv")
# df = pd.concat([df,df1])
# print(df.count())



In [9]:
# drop_duplicates() returns a new DataFrame; the result must be assigned,
# otherwise the call has no effect on `df`.
# NOTE(review): rows are compared across all columns, including the
# "Unnamed: *" columns shown in the output below — confirm whether a
# `subset=` (e.g. the tweet id) is the intended duplicate key.
df = df.drop_duplicates()
df.count()

Unnamed: 0.2      2915461
Unnamed: 0.1      2915461
Unnamed: 0        2915461
tweetid           2915461
tweetcreatedts    2915461
retweetcount      2915461
text              2915461
LemmatizeText     2915461
dtype: int64

In [10]:

# Tokenise the frame, fit the model (pickled as chunk 6) and report cluster sizes.
tokens, dictionary, bow_corpus, vocab_length = dataPrep(df)
mgp = runModel(tokens, 6)

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*' * 20)

# Cluster indices ordered from most to least populated (at most ten of them).
top_index = np.argsort(doc_count)[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*' * 20)

# Print the 15 highest-count words of every cluster, in index order.
topic_indices = np.arange(len(doc_count))
top_words(mgp.cluster_word_distribution, topic_indices, 15)


In stage 0: transferred 2582042 clusters with 10 clusters populated
In stage 1: transferred 2385982 clusters with 10 clusters populated
In stage 2: transferred 1430427 clusters with 10 clusters populated
In stage 3: transferred 885609 clusters with 10 clusters populated
In stage 4: transferred 728712 clusters with 10 clusters populated
In stage 5: transferred 677417 clusters with 10 clusters populated
In stage 6: transferred 652928 clusters with 10 clusters populated
In stage 7: transferred 636262 clusters with 10 clusters populated
In stage 8: transferred 624291 clusters with 10 clusters populated
In stage 9: transferred 615955 clusters with 10 clusters populated
Number of documents per topic : [236599 313686 277256 187625  84126 271043 183288 716611 462363 182864]
********************
Most important clusters (by number of docs inside): [7 8 1 2 5 0 3 6 9 4]
********************
Cluster 0 : [('war', 68043), ('president', 32385), ('minister', 23336), ('putin', 19878), ('said', 19086), 

In [11]:
# Print the coherence scores (c_v, u_mass, c_npmi, c_uci) for the fitted model,
# followed by a separator line.
print(findCoherence(mgp,tokens,dictionary,bow_corpus))
print("========================================================================================")

{'c_v': 0.5227149957518529, 'u_mass': -3.1472934993110955, 'c_npmi': 0.05174650235829261, 'c_uci': 0.3534568034491423}
