In [2]:
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from textblob import TextBlob
import scipy.stats as stats

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import os

  from imp import reload


In [3]:
n_topics = 0
# Define helper functions
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    '''
    returns a tuple of the top n words in a sample and their 
    accompanying counts, given a CountVectorizer object and text sample
    '''
    vectorized_headlines = count_vectorizer.fit_transform(text_data.values)
    vectorized_total = np.sum(vectorized_headlines, axis=0)
    word_indices = np.flip(np.argsort(vectorized_total)[0,:], 1)
    word_values = np.flip(np.sort(vectorized_total)[0,:],1)
    
    word_vectors = np.zeros((n_top_words, vectorized_headlines.shape[1]))
    for i in range(n_top_words):
        word_vectors[i,word_indices[0,i]] = 1

    words = [word[0].encode('ascii').decode('utf-8') for 
             word in count_vectorizer.inverse_transform(word_vectors)]

    return (words, word_values[0,:n_top_words].tolist()[0])

# Define helper functions
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic 
    categories for a given topic matrix
    '''
    keys = topic_matrix.argmax(axis=1).tolist()
    return keys

def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys
    '''
    count_pairs = Counter(keys).items()
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)
# Define helper functions
def get_top_n_words_topic_model(n, keys, document_term_matrix, count_vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most common 
    words in a predicted category, in order
    '''
    top_word_indices = []
    for topic in range(n_topics):
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        # print('hello {}'.format(temp_vector_sum))
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
        top_word_indices.append(top_n_word_indices)   
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
            temp_word_vector[:,index] = 1
            the_word = count_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))         
    return top_words

In [4]:
filepath = 'data/NZ/NZ_abstract/'
filelist = os.listdir(filepath)
print(filelist)

NZ_data = pd.DataFrame()
for filename in filelist:
    raw_data = pd.read_csv(filepath + filename)
    NZ_data = pd.concat([NZ_data,raw_data],ignore_index=True)
    pass
NZ_data.info()

['maritime electrification_NZ-selected.csv', 'smart home_NZ-selected.csv', 'zero waste_NZ-selected.csv', 'green building_NZ-selected.csv', 'additive manufacturing_NZ-selected.csv', 'green energy_NZ-selected.csv', 'clean-energy_NZ-selected.csv', 'renewable energy_NZ-selected.csv', 'carbon emissions_NZ-selected.csv', 'heat recovery_NZ-selected.csv', 'wastewater treatment_NZ-selected.csv', 'energy storage_NZ-selected.csv', 'geothermal_NZ-selected.csv', 'waste reduction_NZ-selected.csv', 'energy efficient_NZ-selected.csv', 'clean technology_NZ-selected.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557 entries, 0 to 1556
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                1557 non-null   object
 1   autor                1557 non-null   object
 2   doi                  1557 non-null   object
 3   citedby_count        1557 non-null   int64 
 4   affilname            1557 non-null

In [5]:
### Remove common words.
cw = open('commonwords.txt','r')
commonwords = cw.readlines()
cw.close()
for i in range(len(commonwords)):
     commonwords[i] = commonwords[i].replace('\n','')
print(commonwords)

for index in range(len(NZ_data)):
    abstract = NZ_data.at[index,'abstract']
    abstract = abstract.replace("   "," ").replace("\n",'')
    abstract = abstract.lower()
    for word in commonwords:
        abstract = abstract.replace(word,'')
    NZ_data.at[index,'abstract'] = abstract


In [6]:
topic_num = 5
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
docs_raw = NZ_data.abstract
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=topic_num, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=topic_num, random_state=0)
lda_tfidf.fit(dtm_tfidf)



In [7]:
panel = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(panel)
pyLDAvis.save_html(panel,"NZ_data_lda_result.html")

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
