In [1]:
import re
import time
import spacy
import gensim
import pickle
import numpy as np
import pandas as pd
import texthero as hero
from pprint import pprint
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore

from nltk.corpus import stopwords
# English stop-word list, read once at import time; remove_stopwords() filters
# tokens against this module-level name.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) - confirm for fresh environments.
stop_words = stopwords.words('english')

# # Plotting tools
# import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Root logger configuration: timestamped records, and only ERROR and above are
# emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
#from dask import dataframe as dd

In [4]:
# Directory prefix for the input pickles. publicaton_data() builds paths with
# plain string concatenation (folder + filename), so the trailing slash matters.
folder = "../data/mgp_data/"

In [5]:
# spaCy pipeline used by lemmatization(); the 'parser' and 'ner' components are
# disabled at load time, since only token.pos_ / token.lemma_ are read there.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [10]:
def title_to_words(titles):
    """Lazily tokenize each title into a list of lowercase word tokens.

    Args:
        titles: Iterable of titles; each item is coerced with ``str()`` first.

    Yields:
        The token list produced by ``gensim.utils.simple_preprocess`` for one
        title, in input order.
    """
    for raw_title in titles:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by simple_preprocess's own tokenizer.
        tokens = gensim.utils.simple_preprocess(str(raw_title), deacc=True)
        yield tokens

In [7]:
# # See trigram example
# print(trigram_mod[bigram_mod[data_words[0]]])

In [30]:
def bigram_trigram_model(data_words, th=100):
    """Fit bigram and trigram phrase detectors on tokenized documents.

    Args:
        data_words: Tokenized documents (a sequence of token lists).
        th: Phrase score threshold passed to both ``Phrases`` models; a higher
            value yields fewer detected phrases.

    Returns:
        ``(bigram_mod, trigram_mod)``: the two fitted models wrapped in
        ``Phraser`` objects, which are the faster form for transforming
        sentences.
    """
    # The bigram detector is built with min_count=1; the trigram detector is
    # trained on the bigram-transformed corpus and keeps the library default
    # for min_count.
    bigram_phrases = gensim.models.Phrases(data_words, min_count=1, threshold=th)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=th)

    frozen_bigram = gensim.models.phrases.Phraser(bigram_phrases)
    frozen_trigram = gensim.models.phrases.Phraser(trigram_phrases)
    return frozen_bigram, frozen_trigram

In [7]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level stop list.

    Args:
        text: Iterable of (hashable) word tokens for one document.

    Returns:
        A new list holding the non-stop-word tokens in their original order.
    """
    # `stop_words` is a list, so `word not in stop_words` scans the whole list
    # for every token. Snapshot it into a frozenset once per call so each
    # membership test is a constant-time hash lookup.
    blocked = frozenset(stop_words)
    return [word for word in text if word not in blocked]

def make_bigrams(text, bigram_mod):
    """Apply a fitted bigram model to one tokenized document.

    Args:
        text: Token list for a single document.
        bigram_mod: Object supporting ``bigram_mod[tokens]`` (e.g. the bigram
            ``Phraser`` returned by ``bigram_trigram_model``).

    Returns:
        Whatever ``bigram_mod[text]`` produces.
    """
    phrased_tokens = bigram_mod[text]
    return phrased_tokens

def make_trigrams(text, trigram_mod, bigram_mod=None):
    """Apply the bigram model and then the trigram model to one document.

    The original body referenced ``bigram_mod`` without receiving it, so it
    only worked if a module-level ``bigram_mod`` happened to exist in the
    kernel. The model can now be passed explicitly.

    Args:
        text: Token list for a single document.
        trigram_mod: Object supporting ``trigram_mod[tokens]``.
        bigram_mod: Object supporting ``bigram_mod[tokens]``. When omitted, a
            module-level ``bigram_mod`` is used if one is defined (the legacy
            behaviour).

    Returns:
        ``trigram_mod[bigram_mod[text]]``.

    Raises:
        NameError: If ``bigram_mod`` is not passed and no module-level
            ``bigram_mod`` exists.
    """
    if bigram_mod is None:
        # Backward compatibility: existing two-argument callers relied on a
        # global; keep that working, but fail with an actionable message.
        try:
            bigram_mod = globals()["bigram_mod"]
        except KeyError:
            raise NameError(
                "make_trigrams() needs a bigram model: pass bigram_mod=... "
                "(no module-level 'bigram_mod' is defined)"
            ) from None
    return trigram_mod[bigram_mod[text]]

def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize one tokenized document, keeping only selected parts of speech.

    The tokens are joined with single spaces and run through the module-level
    ``nlp`` pipeline.

    Args:
        text: Token list for a single document.
        allowed_postags: Container of ``token.pos_`` values to keep. The default
            list is only read, never modified.

    Returns:
        List of ``token.lemma_`` strings for the tokens whose ``pos_`` is in
        ``allowed_postags``, in document order.
    """
    parsed = nlp(" ".join(text))
    lemmas = []
    for token in parsed:
        if token.pos_ in allowed_postags:
            lemmas.append(token.lemma_)
    return lemmas

In [8]:
def prepare_text_for_lda(titles):
    """Turn raw title strings into token lists ready for dictionary building.

    Steps, in order: tokenize with ``title_to_words``; fit phrase models on the
    raw tokens; drop stop words; merge detected bigrams. Lemmatization is
    currently switched off, so the bigram-merged tokens are returned as they
    are.

    Args:
        titles: Iterable of title strings (one per document).

    Returns:
        List with one token list per input title.
    """
    tokenised = list(title_to_words(titles))

    # The phrase models see the tokens *before* stop-word removal. Only the
    # bigram model is used below; the trigram model is fitted but ignored.
    bigram_mod, trigram_mod = bigram_trigram_model(tokenised)

    without_stops = [remove_stopwords(tokens) for tokens in tokenised]
    del tokenised  # release each intermediate as soon as it is superseded

    phrased = [make_bigrams(tokens, bigram_mod) for tokens in without_stops]
    del without_stops

    return phrased

In [11]:
def create_corpus(data_lemmatized):
    """Build the gensim dictionary and bag-of-words corpus for the documents.

    Args:
        data_lemmatized: List of token lists, one per document.

    Returns:
        Tuple ``(id2word, texts, corpus)``: the filtered dictionary, the input
        token lists (same object, not a copy) and one ``doc2bow`` result per
        document.
    """
    vocabulary = corpora.Dictionary(data_lemmatized)
    # Prune the vocabulary in place before vectorising.
    vocabulary.filter_extremes(no_below=10, no_above=0.5)

    bow_corpus = []
    for tokens in data_lemmatized:
        bow_corpus.append(vocabulary.doc2bow(tokens))
    return (vocabulary, data_lemmatized, bow_corpus)

In [12]:
#pprint(lda_model.print_topics())

In [13]:
def predict(lda_model, id2word, preprocess, new_doc):
    """Infer topic distributions for new documents with a trained model.

    Args:
        lda_model: Object exposing ``get_document_topics(bow_corpus)``.
        id2word: Dictionary exposing ``doc2bow(tokens)``.
        preprocess: Callable that turns ``new_doc`` into an iterable of token
            lists (e.g. ``prepare_text_for_lda``).
        new_doc: Raw input handed to ``preprocess`` unchanged.

    Returns:
        The value of ``lda_model.get_document_topics`` for the bag-of-words
        form of the preprocessed documents.
    """
    token_lists = preprocess(new_doc)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in token_lists]
    return lda_model.get_document_topics(bow_corpus)

In [14]:
#topic_dists = predict(lda_model, id2word, prepare_text_for_lda, titles[8:16])

In [15]:
# new_doc = ['Practical Bayesian Optimization of Machine Learning Algorithms']
# new_doc = prepare_text_for_lda(new_doc)
# new_doc_bow = [id2word.doc2bow(doc) for doc in new_doc]
# print(new_doc_bow)
# print(list(lda_model.get_document_topics(new_doc_bow)))

In [16]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=5, step=5, coherence="c_v"):
    """Train one LDA model per candidate topic count and score each of them.

    Candidate counts are ``range(start, limit, step)``, so ``limit`` itself is
    excluded. Models are not kept; only their scores are returned.

    Args:
        dictionary: gensim dictionary (passed as ``id2word`` / ``dictionary``).
        corpus: Bag-of-words corpus used for training and for the perplexity
            bound.
        texts: Tokenized documents used by the coherence model.
        limit: Exclusive upper bound on the number of topics.
        start: First number of topics to try.
        step: Increment between candidate topic counts.
        coherence: Coherence measure name forwarded to ``CoherenceModel``.

    Returns:
        ``(perplexity, coherence_values, topics)``: three parallel lists holding
        ``log_perplexity(corpus)``, the coherence score and the topic count for
        every model trained.
    """
    topic_counts = []
    log_perplexities = []
    coherence_scores = []
    for n_topics in range(start, limit, step):
        lda = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            iterations=10,
            num_topics=n_topics,
            workers=8,
            passes=10,
            random_state=100,
        )
        log_perplexities.append(lda.log_perplexity(corpus))
        topic_counts.append(n_topics)
        scorer = CoherenceModel(
            model=lda,
            texts=texts,
            dictionary=dictionary,
            coherence=coherence,
        )
        coherence_scores.append(scorer.get_coherence())
    return log_perplexities, coherence_scores, topic_counts

In [17]:
def save_model(lda_model, id2word, corpus, topics=25, add_info=""):
    """Persist the trained model, its dictionary and the corpus under ./topic_models.

    File names embed ``topics`` and ``add_info`` so that several runs can live
    side by side; they match the defaults expected by ``load_topic_model`` for
    ``topics=15, add_info="with_all"``.

    Args:
        lda_model: Trained model exposing ``save(path)``.
        id2word: Dictionary exposing ``save(path)``.
        corpus: Picklable bag-of-words corpus.
        topics: Number of topics, used only in the file names.
        add_info: Free-form suffix, used only in the file names.

    Returns:
        None. Prints "Model saved" on success.
    """
    import os  # local import so this block does not depend on the import cell

    # Saving is the last step of a long training run; make sure a missing
    # output directory cannot make it fail.
    os.makedirs('./topic_models', exist_ok=True)

    lda_model.save(f'./topic_models/topic_model_mathscinet_{topics}_{add_info}.gensim')
    id2word.save(f'./topic_models/dictionary_mathscinet_{topics}_{add_info}.gensim')
    # Context manager guarantees the pickle is flushed and the handle closed
    # (the original left the file object to the garbage collector).
    with open(f'./topic_models/corpus_mathscinet_{topics}_{add_info}.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    print("Model saved")
    return

In [18]:
def plot(perplexity, coherence_values, topics, num_topics=25, add_info=""):
    """Plot coherence and log-perplexity against topic count on twin y-axes.

    Coherence is drawn in red on the left axis and log perplexity in green on
    the right axis. The figure is written to
    ``score_mathscinet_{num_topics}_{add_info}.pdf`` in the working directory
    and then shown.

    Args:
        perplexity: Log-perplexity value per candidate topic count.
        coherence_values: Coherence score per candidate topic count.
        topics: Candidate topic counts (x-axis), parallel to the two lists above.
        num_topics: Used only in the output file name.
        add_info: Free-form suffix, used only in the output file name.

    Returns:
        None.
    """
    # The original called plt.figure() before plt.subplots(), which opened an
    # extra, empty figure; subplots() already creates the figure we draw on.
    _fig, ax = plt.subplots()
    ax.plot(topics, coherence_values, color='red')
    ax.tick_params(axis='y', labelcolor='red')
    ax.set_xlabel('Number of Topics')
    ax.set_ylabel('C_V Score')

    # Second y-axis sharing the same x-axis, because the two metrics live on
    # very different scales.
    ax2 = ax.twinx()
    ax2.plot(topics, perplexity, color='green')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.set_ylabel('Log Perplexity')

    plt.savefig(f'score_mathscinet_{num_topics}_{add_info}.pdf')
    plt.show()
    return

In [19]:
def plot_coherence(coherence_values, topics, label="c_v", num_topics=25, add_info=""):
    """Plot coherence against topic count, then save and show the figure.

    The figure is written to
    ``score_{label}_mathscinet_{num_topics}_{add_info}.pdf``. The y-axis title
    is always 'C_V Score', whatever ``label`` is.

    Args:
        coherence_values: Coherence score per candidate topic count.
        topics: Candidate topic counts (x-axis).
        label: Legend entry; also embedded in the output file name.
        num_topics: Used only in the output file name.
        add_info: Free-form suffix, used only in the output file name.
    """
    plt.figure()
    plt.plot(topics, coherence_values, label=label)
    plt.xlabel('Number of Topics')
    plt.ylabel('C_V Score')
    plt.legend()
    output_pdf = f"score_{label}_mathscinet_{num_topics}_{add_info}.pdf"
    plt.savefig(output_pdf)
    plt.show()

In [20]:
def plot_perplexity(perplexity, topics, label="Perplexity",num_topics=25, add_info=""):
    """Plot log perplexity against topic count, then save and show the figure.

    The figure is written to
    ``score_{label}_mathscinet_{num_topics}_{add_info}.pdf``.

    Args:
        perplexity: Log-perplexity value per candidate topic count.
        topics: Candidate topic counts (x-axis).
        label: Legend entry; also embedded in the output file name.
        num_topics: Used only in the output file name.
        add_info: Free-form suffix, used only in the output file name.
    """
    plt.figure()
    plt.plot(topics, perplexity, label=label)
    plt.xlabel('Number of Topics')
    plt.ylabel('Log Perplexity')
    plt.legend()
    output_pdf = f"score_{label}_mathscinet_{num_topics}_{add_info}.pdf"
    plt.savefig(output_pdf)
    plt.show()

In [21]:
def publicaton_data(filename = "final_mathscinet_publs_included_remaining_publ_processed.pkl", stem_title=True):
    """Load the publication table and collect cleaned titles per author and year.

    Args:
        filename: Pickle file name, appended to the module-level ``folder``.
            (An earlier run used "final_mathscinet_publs_processed.pkl".)
        stem_title: When true, the cleaned titles are additionally passed
            through ``hero.stem``.

    Returns:
        DataFrame with columns ``author_id``, ``publication_year`` and
        ``yearwise_titles`` (the list of cleaned titles in that group).
    """
    records = pd.read_pickle(folder + filename)
    print(f"publication count: {records.shape[0]}")

    cleaned_titles = hero.clean(records["title"])
    if stem_title:
        cleaned_titles = hero.stem(cleaned_titles)
    records["clean_title"] = cleaned_titles

    titles_by_group = records.groupby(["author_id", "publication_year"])['clean_title'].apply(list)
    yearly = titles_by_group.reset_index(name='yearwise_titles').copy()
    print(f"Publication group by (author_id, year) count: {yearly.shape[0]}")
    return yearly

In [22]:
def load_topic_model(
    lda_location='./topic_models/topic_model_mathscinet_15_with_all.gensim',
    dictionary_location='./topic_models/dictionary_mathscinet_15_with_all.gensim',
    corpus_location='./topic_models/corpus_mathscinet_15_with_all.pkl',
    corpus_load=False,
):
    """Load a saved LDA model and dictionary, and optionally the pickled corpus.

    Args:
        lda_location: Path given to ``LdaMulticore.load``.
        dictionary_location: Path given to ``corpora.Dictionary.load``.
        corpus_location: Path of the pickled corpus; only read when
            ``corpus_load`` is true.
        corpus_load: Whether to load and return the corpus as well.

    Returns:
        ``(lda_model, id2word)`` by default, or ``(lda_model, id2word, corpus)``
        when ``corpus_load`` is true.
    """
    lda_model = LdaMulticore.load(lda_location)
    id2word = corpora.Dictionary.load(dictionary_location)
    if not corpus_load:
        return lda_model, id2word

    # Fix: honour `corpus_location` (the original always opened the hard-coded
    # default path, silently ignoring the argument) and close the file handle.
    # SECURITY: pickle.load can execute arbitrary code - only point this at
    # corpus files produced by save_model() or another trusted source.
    with open(corpus_location, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    return lda_model, id2word, corpus

In [23]:
#a = load_topic_model()

In [31]:
#a[1].num_docs

In [32]:
#a[0].get_topics().shape

In [1]:
if __name__ == "__main__":
    # End-to-end training run: load and group titles, preprocess, build the
    # dictionary/corpus, train a 15-topic LDA model, score it, sweep other
    # topic counts for comparison, plot the sweep and save the 15-topic model.
    # The elapsed wall-clock time since `start_time` is printed after each
    # expensive stage.
    print("Started...")
    start_time = time.time()
    # The commented-out lines below are the earlier inline version of the
    # loading/cleaning/grouping that publicaton_data() now performs.
#    publication = pd.read_pickle(folder+"final_mathscinet_publs_included_remaining_publ_processed.pkl")
#     print(f"Publication count: {publication.shape[0]}")
#     publication["clean_title"] = hero.clean(publication["title"])
#     publication["clean_title"] = hero.stem(publication["clean_title"])
#     publication1 = publication.groupby(["author_id","publication_year"])['clean_title'].apply(list).reset_index(name='yearwise_titles').copy()
#     print(f"Publication group by year count: {publication1.shape[0]}")
    #del publication
    #publication = publication.sample(1000)
    publication1 = publicaton_data("final_mathscinet_publs_included_remaining_publ_processed.pkl",stem_title=True)
    # One document per (author_id, publication_year) group: that group's
    # cleaned titles joined into a single space-separated string.
    titles = [" ".join(year_titles) for year_titles in publication1["yearwise_titles"].values.tolist()]
    del publication1
    print(f"Time taken (in minutes)= {(time.time() - start_time)/60}")
    # Despite the name, no lemmatization happens here: that step is commented
    # out inside prepare_text_for_lda().
    data_lemmatized = prepare_text_for_lda(titles)
    
    id2word, texts, corpus = create_corpus(data_lemmatized)
    print(f"Time taken (in minutes)= {(time.time() - start_time)/60}")
    print("Training...")
    # `num_topics` and `add_info` are also embedded in the plot and model file
    # names below; load_topic_model()'s default paths match these two values.
    num_topics = 15
    add_info= "with_all"
    lda_model = LdaMulticore(corpus=corpus,
                               id2word=id2word,
                               num_topics=num_topics,
                               random_state=109,
                               chunksize=1000,
                               passes=10,
                               minimum_probability=0,
                               workers = 8,)
    print(f"Time taken (in minutes)= {(time.time() - start_time)/60}")
    
    # NOTE(review): per the gensim docs, log_perplexity() returns the per-word
    # likelihood bound rather than the perplexity itself (perplexity is
    # 2**(-bound)), so a value closer to zero is better - confirm before
    # comparing runs. It is evaluated on the training corpus here.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Time taken (in minutes)= {(time.time() - start_time)/60}")
    print('\n Coherence Score: ', coherence_lda)
    print("Optimal number of topic calculation...")
    # Sweeps range(5, 35, 5), i.e. 5, 10, ..., 30 topics (the function's
    # default start and step). These models are trained independently of
    # `lda_model` and are not kept.
    perplexity, coherence_values, topics = compute_coherence_values(id2word, corpus, texts, 35)
    plot(perplexity, coherence_values, topics, num_topics, add_info)
    plot_perplexity(perplexity, topics, "perplexity", num_topics, add_info)
    plot_coherence(coherence_values, topics, "c_v", num_topics, add_info)
    # Persists the 15-topic model trained above, not any model from the sweep.
    save_model(lda_model, id2word, corpus, num_topics, add_info)
    print(f"Time taken (in minutes)= {(time.time() - start_time)/60}")

In [77]:
# str1 = "The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America consisting of 50 states, a federal district, five major unincorporated territories, nine Minor Outlying Islands,[i] and 326 Indian reservations. It is the world's third-largest country by both land and total area.[c] It shares land borders with Canada to its north and with Mexico to its south and has maritime borders with the Bahamas, Cuba, Russia, and other nations.[j] With a population of over 333 million,[k] it is the most populous country in the Americas and the third-most populous in the world. The national capital of the United States is Washington, D.C., and its most populous city and principal financial center is New York City."

In [None]:
#lda_model, id2word = load_topic_model()

In [None]:
#lda_model.show_topics(num_topics=25)

In [None]:
#coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')

In [None]:
#coherence_lda = coherence_model_lda.get_coherence()

In [None]:
#coherence_lda

In [None]:
#publication = pd.read_pickle(folder+"final_mathscinet_publs_processed.pkl")

In [None]:
#publication1 = publication.groupby(["author_id","publication_year"])['title'].apply(list).reset_index(name='yearwise_titles').copy()

In [None]:
#publication1.sample(5)

In [None]:
#publication1[["author_id","yearwise_titles"]].head().values

In [None]:
#titles[0:5]

In [None]:
#save_model(lda_model, id2word, corpus)

In [None]:
#perplexity+len(perplexity)*[10]
#coherence_values

In [28]:
#plot(perplexity, coherence_values, topics, num_topics, add_info)

In [29]:
#coherence_values

In [30]:
#perplexity

In [100]:
# Scratch input: a one-element Series holding a general-English paragraph, used
# in the next cell to inspect what texthero's clean() does to raw text.
aa= pd.Series(["The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America consisting of 50 states, a federal district, five major unincorporated territories, nine Minor Outlying Islands,[i] and 326 Indian reservations. It is the world's third-largest country by both land and total area.[c] It shares land borders with Canada to its north and with Mexico to its south and has maritime borders with the Bahamas, Cuba, Russia, and other nations.[j] With a population of over 333 million,[k] it is the most populous country in the Americas and the third-most populous in the world. The national capital of the United States is Washington, D.C., and its most populous city and principal financial center is New York City."])

In [102]:
# Display the cleaned text. In the recorded output below it is lowercased, with
# digits, punctuation and common stop words removed.
hero.preprocessing.clean(aa).values

array(['united states america u usa commonly known united states u us america country primarily located north america consisting states federal district five major unincorporated territories nine minor outlying islands indian reservations world third largest country land total area c shares land borders canada north mexico south maritime borders bahamas cuba russia nations j population million k populous country americas third populous world national capital united states washington c populous city principal financial center new york city'],
      dtype=object)