In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wikiapi import WikiApi
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import gensim
from gensim import utils
import itertools
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import re

In [None]:
def review_to_wordlist( review, remove_stopwords=True ):
    """Clean one raw article and return it as a single space-separated string.

    The text is stripped of HTML markup, every non-letter character is
    replaced by a space, the result is lower-cased and split on whitespace,
    and English stop words are optionally removed.

    Args:
        review: Raw article text, possibly containing HTML. Any value that is
            not ``str``/``bytes`` (for example the float NaN that pandas
            produces for an empty CSV cell) is treated as an empty document
            instead of crashing the HTML parser.
        remove_stopwords: When True (the default) NLTK's English stop words
            are dropped.

    Returns:
        str: the surviving words joined by single spaces, or the placeholder
        ``'NULL'`` when nothing survives cleaning.
    """
    # Missing values arrive as non-strings; handle them as empty documents.
    if not isinstance(review, (str, bytes)):
        review = ""
    # 1. Remove HTML. Naming the parser keeps the output independent of which
    #    optional parsers (lxml, html5lib) happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (enabled by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Never return an empty string: downstream code expects at least one token.
    # NOTE(review): pandas.read_csv parses the literal NULL as a missing value
    # by default, so this placeholder does not survive a CSV round trip as-is.
    if len(words) == 0:
        words = ['NULL']
    return " ".join(words)
import numpy as np

In [None]:
# Load the raw article dump. The cleaning cell reads its "article_text" column
# and adds a "cleaned_text" column to this same frame.
location = "gnm_articles.csv"
data = pd.read_csv(location)

In [None]:
# Clean every article, attach the cleaned text as a new column of `data`
# and persist the augmented frame for the following cells.
num_articles = data["article_text"].size
documents_text = [
    review_to_wordlist(data["article_text"][row])
    for row in range(0, num_articles)
]

se = pd.Series(documents_text)
data['cleaned_text'] = se
data.to_csv('cleaned_articles.csv')

In [None]:
# Reload the cleaned articles from disk and pull out the columns used below.
cleaned_data = pd.read_csv('cleaned_articles.csv')
documents_id = cleaned_data.article_id
documents_url = cleaned_data.article_url
# read_csv treats tokens such as "NULL", "null", "nan" and "NA" as missing
# values by default, so the 'NULL' placeholder written for empty articles (and
# any article whose entire cleaned text is one of those tokens) comes back as
# float NaN, which the vectorizers reject. Restore the placeholder so every
# document is a string again.
documents_text = cleaned_data.cleaned_text.fillna('NULL').astype(str)

In [None]:
# Corpus plus the settings shared by the NMF and LDA cells:
#   no_features  - vocabulary cap passed to the vectorizers as max_features
#   no_topics    - number of components each model extracts
#   no_top_words - how many top-weighted words are listed per topic
documents = documents_text
no_features, no_topics, no_top_words = 10000, 20, 30

In [None]:
"""
Summary:
    Displays all the relevant topics related to a given topic name
    
Approaches Used:
    - Non-negative Matrix Factorization (NMF)
    - Latent Dirichlet Allocation(LDA)
"""
def display_relevant_topics(model, feature_names, no_top_words):
    """Build a table of the highest-weighted words of every topic in a model.

    Args:
        model: Fitted estimator exposing ``components_`` with one row of term
            weights per topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        no_top_words: Number of terms to keep per topic, ordered from the
            largest weight downwards.

    Returns:
        pandas.DataFrame with columns ``Topic_ID`` (row position of the topic
        in ``components_``) and ``Topics`` (the kept terms joined by spaces).
    """
    top_terms = [
        # argsort is ascending, so reverse it and keep the leading entries.
        " ".join(feature_names[col] for col in weights.argsort()[::-1][:no_top_words])
        for weights in model.components_
    ]
    topic_ids = list(range(len(top_terms)))
    return pd.DataFrame({'Topic_ID': topic_ids, 'Topics': top_terms})

    
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = [str(term) for term in tfidf_vectorizer.get_feature_names_out()]
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Run NMF
# NOTE(review): the `alpha` argument was likewise deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of alpha_W / alpha_H, whose scaling differs;
# it is left untouched here so results on older releases do not change.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
# Fit once: fit_transform both trains the model and returns the document-topic
# matrix. The model used to be trained twice on the same matrix (fit, then
# fit_transform), with the second run simply redoing the first.
W = nmf.fit_transform(tfidf)
H = nmf.components_
new_nmf = nmf  # fit() returns the estimator itself; name kept for other cells
nmf_topic_df = display_relevant_topics(new_nmf, tfidf_feature_names, no_top_words)
nmf_topic_df.to_csv("NMF_Results.csv")

In [None]:
# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, max_features=no_features, stop_words='english', min_df=2)
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = [str(term) for term in tf_vectorizer.get_feature_names_out()]
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=1)
# Fit once: fit_transform both trains the model and returns the document-topic
# matrix. The model used to be trained twice on the same counts (fit, then
# fit_transform), with the second run simply redoing the first.
W1 = lda.fit_transform(tf)
H1 = lda.components_
lda_new = lda  # fit() returns the estimator itself; name kept for other cells
lda_topic_df = display_relevant_topics(lda_new, tf_feature_names, no_top_words)
lda_topic_df.to_csv("LDA_Results.csv")

In [None]:
"""
    Searches Wikipedia for each keyword in a list and returns keywords extracted
    from the headings of the matching articles.

    args:
    *  keywords: A list of keywords
    *  search_depth: how many Wikipedia search results are checked per keyword; assumed to be between 1 and 10
    *  keyword_summary: number of words gensim's keyword extraction should return (its `words` argument)
"""
def get_relevant_articles(keywords, search_depth=5, keyword_summary=5):
    """Suggest names for a set of keywords using Wikipedia article headings.

    Each keyword is searched on Wikipedia. A result is kept when its summary
    contains at least one of the *other* keywords; the headings of the kept
    articles are then condensed with gensim's keyword extraction.

    Args:
        keywords: List of keyword strings (compared case-insensitively).
        search_depth: Maximum number of search results inspected per keyword;
            ``None`` inspects every result returned.
        keyword_summary: Value passed as ``words`` to gensim's keyword
            extraction.

    Returns:
        list: extracted keywords, or the raw list of kept article headings
        when keyword extraction fails. An empty list for empty input.
    """
    if len(keywords) == 0:
        return []
    wiki = WikiApi()

    keywords = [x.lower() for x in keywords]
    info = []
    for keyword in keywords:
        other_words = [x for x in keywords if x != keyword]

        results = wiki.find(keyword)
        if search_depth is not None:
            results = results[:search_depth]

        for result in results:
            article = wiki.get_article(result)
            # A set gives constant-time membership checks for each keyword.
            summary_words = set(article.summary.lower().split(' '))
            if any(word in summary_words for word in other_words):
                info.append(article.heading)

    try:
        info_keyword = gensim.summarization.keywords(' '.join(info),
                    words=keyword_summary).split('\n')
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` did.
        print("Keyword extraction failed, defaulting to article heading output")
        info_keyword = info[:]
    return info_keyword

"""lemmatize a list of strings"""
def lemmatize_all(docs):
    """Return the distinct lemmas found across all strings in ``docs``.

    Every string is passed to ``utils.lemmatize``; the last three characters
    of each returned token are cut off before de-duplication.
    # NOTE(review): presumably those three characters are a "/TAG" suffix -- confirm.
    """
    unique_lemmas = set()
    for doc in docs:
        for token in utils.lemmatize(doc):
            unique_lemmas.add(token[:-3])
    return list(unique_lemmas)


In [None]:
def get_relevant_topic_names(df):
    """Print Wikipedia-derived name suggestions for every topic row of ``df``.

    Args:
        df: DataFrame with a ``Topics`` column whose entries are
            whitespace-separated topic words.

    Prints one lemmatized list of suggestions per row; returns nothing.
    """
    for topic_words in df.Topics.tolist():
        suggestions = get_relevant_articles(str(topic_words).split())
        print(lemmatize_all(suggestions))

In [None]:
"""
    Get relevant topic names for the top words of each topic identified by the LDA approach
"""
# Prints one lemmatized list of suggested names per LDA topic row; each row
# triggers Wikipedia lookups through WikiApi inside get_relevant_articles.
get_relevant_topic_names(lda_topic_df)

In [None]:
"""
    Get relevant topic names for the top words of each topic identified by the NMF approach
"""
# Prints one lemmatized list of suggested names per NMF topic row; each row
# triggers Wikipedia lookups through WikiApi inside get_relevant_articles.
get_relevant_topic_names(nmf_topic_df)

In [None]:
def topic_parse(vec, H, n_top_words = 20):
    '''
    Connects actual terms and n-grams to the features of each topic
        for visualization.

    INPUT:  vectorizer object - vec (must expose get_feature_names_out() or
                the older get_feature_names()),
            2d numpy array - H (one row of term weights per topic),
            int - n_top_words (number of terms kept per topic)
    OUTPUT: list of dicts - one dict per topic mapping each of its
            n_top_words highest-weighted terms to its share, in percent,
            of the summed weight of the kept terms. A topic whose kept
            weights sum to zero maps its terms to 0.0; a topic with no
            kept terms yields an empty dict.
    '''
    # Resolve the vocabulary once instead of once per topic. Newer
    # scikit-learn releases removed get_feature_names(); plain str keys keep
    # the dict readable when it is written out.
    if hasattr(vec, "get_feature_names_out"):
        feature_names = [str(name) for name in vec.get_feature_names_out()]
    else:
        feature_names = vec.get_feature_names()

    topics_dicts = []
    n_topics = H.shape[0]

    for i in range(n_topics):
        # Ascending sort, then take the last n_top_words entries in reverse.
        # The previous slice stopped one element early, returning only
        # n_top_words - 1 terms (and all but one term for n_top_words == 0).
        ranked = sorted(zip(feature_names, H[i]),
                        key=lambda x: x[1])[:-n_top_words - 1:-1]
        if not ranked:
            topics_dicts.append({})
            continue
        k, v = zip(*ranked)
        val_arr = np.array(v)
        total = np.sum(val_arr)
        # Guard against an all-zero topic row, which would otherwise yield NaN.
        norms = val_arr / total if total != 0 else np.zeros_like(val_arr)
        topics_dicts.append(dict(zip(k, norms * 100)))
    return topics_dicts

# Per-topic term weights for both models: H holds the NMF components and
# H1 the LDA components, each paired with the vectorizer it was fitted on.
topic_dicts = topic_parse(tfidf_vectorizer, H, no_top_words)
lda_topic_dicts = topic_parse(tf_vectorizer, H1, no_top_words)

In [None]:
"""NMF approach"""
# Score every article against every NMF topic in one batch instead of calling
# the vectorizer once per document: rows are articles, columns are topics.
topic_scores = np.asarray(tfidf_vectorizer.transform(documents_text) @ H.T)
# Full score vector per article, as plain Python floats so the CSV stays readable.
article_alltopics = topic_scores.tolist()
# Index of the best-scoring topic per article.
article_topic = topic_scores.argmax(axis=1).tolist()

In [None]:
# Column layout of the per-article NMF result table.
d = dict(
    article_id=list(documents_id),
    article_url=list(documents_url),
    article_text=list(documents_text),
    Pro_topics=article_alltopics,
    topic_index=article_topic,
)

In [None]:
# Per-article NMF results: one column per key of `d`.
finaldf = pd.DataFrame(data = d)

In [None]:
# Persist the NMF assignments (the default to_csv call also writes the index).
finaldf.to_csv('cleaned_article_topic_nmf.csv')

In [None]:
# The bare name below is not the cell's last expression, so it displays nothing.
topic_dicts
# One-column frame: each row holds the term -> weight dict of one NMF topic.
topic_d = {'hot topics':topic_dicts}
topic_df = pd.DataFrame(data=topic_d)
topic_df.to_csv('cleaned_article_topic_dict_nmf.csv')

In [None]:
"""LDA approach"""

# Score every article against every LDA topic in one batch instead of calling
# the vectorizer once per document: rows are articles, columns are topics.
topic_scores = np.asarray(tf_vectorizer.transform(documents_text) @ H1.T)
# Full score vector per article, as plain Python floats so the CSV stays readable.
article_alltopics = topic_scores.tolist()
# Index of the best-scoring topic per article.
article_topic = topic_scores.argmax(axis=1).tolist()

In [None]:
# Column layout of the per-article LDA result table.
d = dict(
    article_id=list(documents_id),
    article_url=list(documents_url),
    article_text=list(documents_text),
    Pro_topics=article_alltopics,
    topic_index=article_topic,
)

In [None]:
# Per-article LDA results: one column per key of `d`.
finaldf_lda = pd.DataFrame(data = d)

In [None]:
# Persist the LDA assignments (the default to_csv call also writes the index).
finaldf_lda.to_csv('cleaned_article_topic_lda.csv')

In [None]:
# The bare name below is not the cell's last expression, so it displays nothing.
lda_topic_dicts
# One-column frame: each row holds the term -> weight dict of one LDA topic.
topic_lda = {'hot topics':lda_topic_dicts}
topic_df_lda = pd.DataFrame(data=topic_lda)
topic_df_lda.to_csv('cleaned_article_topic_dict_lda.csv')