# NMF Topic Modeling

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [36]:
# TODO: point this at a CSV of AI-related abstracts. It must contain the column
# `final_frqwds_removed`, which is read a few cells below.
# NOTE(review): the path is absolute and user-specific, while the coherence pickle
# is read through the relative '../../data/dspg21RnD/' prefix — confirm before
# running on another machine.
# NOTE(review): read_csv returns list-like columns as plain strings, whereas
# createVars iterates each document as a sequence of tokens — confirm whether
# this column needs to be parsed back into lists after loading.
abstracts = pd.read_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/bert_ai_abstracts.csv')

# Coherence Model to find the optimal number of topics for NMF

In [38]:
def createVars(docs):
    """Build the gensim dictionary and bag-of-words corpus for a set of documents.

    Parameters
    ----------
    docs : sized iterable of token lists (e.g. df["final_frqwds_removed"]).
        It is iterated twice and ``len(docs)`` is taken, so it cannot be a
        one-shot generator.

    Returns
    -------
    (id2word, corpus) : the filtered ``gensim.corpora.Dictionary`` and a list
        holding one ``doc2bow`` result per document, in input order.
    """
    dictionary = gensim.corpora.Dictionary(docs)

    # Vocabulary cap: 100,000 terms unless there are more documents than that,
    # in which case the document count itself is used as the cap.
    n_docs = len(docs)
    if n_docs > 100000:
        vocab_cap = n_docs
        notice = "Number of documents exceed the dafalt number of 100,000. Use the keep_n = number of document."
    else:
        vocab_cap = 100000
        notice = "Use keep_n = 100,000 defalt."

    # no_below=3 drops tokens seen in fewer than 3 documents; no_above=1.0 sets
    # no upper limit on how common a token may be.
    dictionary.filter_extremes(no_below=3, no_above=1.0, keep_n=vocab_cap)
    print(notice)

    # Term-document frequencies: each document becomes (word_id, count) pairs.
    bow_corpus = list(map(dictionary.doc2bow, docs))

    return dictionary, bow_corpus

In [24]:
# Column holding the per-abstract tokens; this is what createVars expects as `docs`.
# NOTE(review): the execution counts show this cell last ran before the CSV cell
# above — re-run top to bottom to make sure `docs` reflects the loaded file.
docs = abstracts["final_frqwds_removed"]

In [39]:
# Preview the token column (pandas truncates the display of long Series).
docs

0         [multiprotein, y_secretase, proteolytically_cl...
1         [kissl, gene, encode, peptide, kisspeptin, bin...
2         [biophysical, basis, thermodynamics_kinetic, m...
3         [obesity, adverse_pregnancyoutcome, great, hea...
4         [local, potato, advisory, express, interest, m...
                                ...                        
690809    [pathophysiology, schizophrenia, advance, thed...
690810    [alzheimer, ad, amyotrophic_lateral_sclerosis_...
690811    [highest, mortality, acute, care, encounter, r...
690812    [paradigm, kidney, largely, stagnant, decade, ...
690813    [division, intramural, population, health, dip...
Name: final_frqwds_removed, Length: 690814, dtype: object

In [None]:
#TODO: run the following code to generate id2word and corpus
#id2word, corpus = createVars(docs)

#TODO: RENAME the file, run the code to save the output
#pickle.dump([corpus, id2word, docs], open('../../data/dspg21RnD/coherence_vars_XXXXXX.sav','wb'))

# Read in Coherence file

In [2]:
# TODO: Read in your coherence data (change the file name to match the one you saved).
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
# The `with` block closes the file even if unpickling raises.
with open('../../data/dspg21RnD/coherence_vars.sav', 'rb') as f:
    corpus, id2word, docs = pickle.load(f)

In [4]:
# Check that the unpickled documents look as expected.
docs

0         [multiprotein, y_secretase, proteolytically_cl...
1         [kissl, gene, encode, peptide, kisspeptin, bin...
2         [biophysical, basis, thermodynamics_kinetic, m...
3         [obesity, adverse_pregnancyoutcome, great, hea...
4         [local, potato, advisory, express, interest, m...
                                ...                        
690809    [pathophysiology, schizophrenia, advance, thed...
690810    [alzheimer, ad, amyotrophic_lateral_sclerosis_...
690811    [highest, mortality, acute, care, encounter, r...
690812    [paradigm, kidney, largely, stagnant, decade, ...
690813    [division, intramural, population, health, dip...
Name: final_frqwds_removed, Length: 690814, dtype: object

In [5]:
# scikit-learn vectorizers take one string per document, so join each
# document's tokens with single spaces.
text = [" ".join(abstract) for abstract in docs]

In [6]:
# Spot-check the first joined document.
text[0]

'multiprotein y_secretase proteolytically_cleave intramembrane region amyloid_precursorprotein_app turn plaque alzheimer ad patient catalyticcomponent y_secretase intramembrane_aspartyl_protease iap presenilin mutation inpresenilin directly link familial onset ad member iap family signalpeptide peptidase_spp proteolyze remnant peptide beencleave peptidase biochemistry individual spp onlybegin elucidate homologue kingdom life presenilin spp exhibitsignificant sequence similarity strongly share structural catalytic feature amolecular tractable spp likely drug presenilin y_secretase express solve crystal anextremophilic bacterial spp ortholog transition analog inhibitor substratemimic drug candidate screen silico intramembraneprotease insight biochemistry intramembrane_proteolysis enable ad drug screen'

In [7]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted model exposing ``components_`` (the H matrix); each row is
        treated as one topic, each column as one vocabulary term.
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        older ``get_feature_names()``. It is assumed to be the vectorizer that
        built the matrix the model was fitted on, so that column indices line up.
    top_n : int, default 10
        Number of terms to print per topic.

    Output
    ------
    For each topic, a blank line, ``Topic <idx>:``, then one ``(term, weight)``
    tuple per line in descending weight order.
    """
    # Fetch the vocabulary once: rebuilding it for every printed term is costly
    # for a large vocabulary. Prefer the current scikit-learn accessor and fall
    # back to the legacy one, which was removed in scikit-learn 1.2.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):  # idx = row index of H, topic = the row itself
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the heaviest terms first
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
            
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):
    """Return each topic's top terms as a list of lists of strings.

    This is the ``topics`` format passed to gensim's CoherenceModel in this notebook.

    Parameters
    ----------
    model : fitted model exposing ``components_`` (the H matrix); each row is
        treated as one topic, each column as one vocabulary term.
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        older ``get_feature_names()``. It is assumed to be the vectorizer that
        built the matrix the model was fitted on, so that column indices line up.
    top_n : int, default 10
        How many terms to list per topic. If -1, list every term.

    Returns
    -------
    topic_words : list of list of str
        One inner list per topic, terms ordered from highest to lowest weight.
    """
    # Fetch the vocabulary once. The per-index lookup used previously rebuilt the
    # whole vocabulary for every term, which is quadratic when top_n == -1.
    # Prefer the current scikit-learn accessor and fall back to the legacy one,
    # which was removed in scikit-learn 1.2.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    topic_words = []

    for topic in model.components_:  # one row of H per topic
        order = topic.argsort()  # ascending by weight
        if top_n == -1:
            ranked = order[::-1]
        else:
            ranked = order[:-top_n - 1:-1]
        topic_words.append([feature_names[i] for i in ranked])

    return topic_words

In [8]:
# create document-term matrix - TFIDF
# min_df=3 ignores terms found in fewer than 3 documents and max_df=1.0 applies no
# upper document-frequency cut-off, mirroring the no_below=3 / no_above=1.0
# filter used for the gensim dictionary in createVars.
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [9]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start,
                processes=12, top_n=10):
    """
    Fit one NMF model per candidate topic count and score each with c_v coherence.

    Parameters:
    ----------
    doc_term_matrix : matrix the NMF models are fitted on (this notebook passes
        the TF-IDF document-term matrix)
    n_topics : iterable of int, the topic counts to try
    vectorizer : fitted vectorizer, forwarded to list_topics to turn column
        indices into terms
    corpus, id2word, docs : forwarded to gensim's CoherenceModel as ``corpus``,
        ``dictionary`` and ``texts`` respectively
    rand_start : int, random_state of the first model; each following model uses
        the next integer, so every model in a call gets a distinct seed
    processes : int, default 12, number of worker processes given to
        CoherenceModel (for a smaller corpus, number of cores - 1)
    top_n : int, default 10, number of top terms per topic used for coherence

    Returns:
    -------
    coherence_values : list with one c_v coherence value per entry of n_topics,
        in the same order

    Progress and timings are printed for every model.
    """

    coherence_values = []

    for offset, num_topics in enumerate(n_topics):

        # create model; only the fitted components are needed, so the
        # document-topic matrix is not kept
        t1 = time.time()
        nmf_model = NMF(n_components=num_topics, random_state=rand_start + offset)
        nmf_model.fit(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # create list of topics
        topics = list_topics(nmf_model, vectorizer, top_n=top_n)

        # calculate coherence
        t1 = time.time()
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs,
                            coherence='c_v',  # model for calculating coherence score
                            processes=processes
                           )
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # output completion message
        print('Number of topics =', num_topics, "complete.")

    return coherence_values


In [None]:
# Coherence sweep over candidate topic counts, repeated num_runs times.
# Adapted with minor alterations from
# https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

n_topics = list(range(5, 101, 5))  # 5, 10, ..., 100
num_runs = 2

# one row per topic count, one column of coherence scores per run
col_names = [f"iteration {i}" for i in range(num_runs)]
nmf_c = pd.DataFrame(index=n_topics, columns=col_names)

for i in range(num_runs):

    print(f"Iteration {i}")

    # Each run starts its seeds where the previous run's stopped, so no two
    # models in the whole sweep share a random_state.
    c = nmf_metrics(
        doc_term_matrix=tf_idf,
        n_topics=n_topics,
        vectorizer=tfidf_vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=i * len(n_topics),
    )

    # store this run's scores straight away so earlier runs are kept
    # if a later one is interrupted
    nmf_c[col_names[i]] = c

Iteration 0


In [None]:
# save results (the coherence table, one column per run)
# NOTE(review): absolute, user-specific path — confirm it exists on the machine
# running this notebook.

nmf_c.to_pickle("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/nmf_bert.pkl")

# NMF

In [None]:
# Documents used for the final NMF fit. This is currently the full set: the
# assignment only aliases `docs`, no subsetting happens here.
lim_docs = docs
len(lim_docs)

In [None]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document (not a list of strings)

text = [" ".join(token_list) for token_list in lim_docs]

In [None]:
# Create a TF-IDF document-term matrix for the AI corpus

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
# As written, max_df and min_df match the tfidf_vectorizer used for the coherence
# sweep, and lowercase=True is TfidfVectorizer's default.
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

# by default TfidfVectorizer has l2 normalization for rows:
# from Scikit Learn documentation: Each output row will have unit norm, either: * 'l2': Sum of squares of vector
# elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied.

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [None]:
# (number of documents, number of vocabulary terms kept by the vectorizer)
nmf_tf_idf.shape

In [None]:
# Vocabulary terms in the column order of nmf_tf_idf.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when the installed version provides it; list() keeps
# AI_terms a plain list either way.
AI_terms = list(
    (getattr(nmf_vectorizer, "get_feature_names_out", None) or nmf_vectorizer.get_feature_names)()
)

In [None]:
# Peek at a few vocabulary terms.
# NOTE(review): the slice starts at index 1, so the very first term is skipped —
# confirm that [:10] was not intended.
AI_terms[1:10]

In [None]:
# topic modeling with NMF
# The number of topics is fixed at 30 here; random_state=1 fixes the seed so the
# factorization is repeatable.

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)  # document-topic weights, one row per document
H = nmf_model.components_  # topic-term weights, one row per topic

In [None]:
# Show the 10 highest-weighted terms for each fitted topic.
print_topics(nmf_model, nmf_vectorizer, 10)

In [None]:
# hot and cold figure 