In [43]:
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import nltk.corpus


import gensim
from gensim import corpora
from gensim.models import CoherenceModel

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

import pandas as pd
import numpy as np

import string
import csv
import datetime

## This is a software that uses LDA model from nltk library and creates a metric 


- How to run on Linux:
    - sudo pip install -U numpy
    - sudo pip install -U gensim
    - sudo pip install -U openpyxl
    - sudo pip install -U nltk
    - cd /usr/local/lib/python3.5/dist-packages
    - sudo python -m nltk.downloader stopwords
    - sudo python -m nltk.downloader wordnet
    
- Windows:
    - pip install -U nltk
    - pip install -U numpy
    - pip install -U gensim
    - pip install -U openpyxl
    - python
    <br>
    <font color=red>
    import nltk
    <br>
    nltk.download('stopwords')
    <br>
    nltk.download('wordnet')
    </font>

In [37]:
class Arion:
    @staticmethod
    def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    @staticmethod
    def weighted_mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs(((y_true*y_pred) - (y_pred*y_pred)) / (y_true* y_pred))) * 100

In [32]:
class Research:
    @staticmethod
    def parse_xls(xls_file=None):
        if not xls_file:
            return False
        df = pd.read_excel(xls_file)
        return list(df['Abstract Note'])

    @staticmethod
    def parse_scholar(query_word='security', size=6, initial_time=None, end_time=None):
        import scholarly
        q = '/scholar?lr=lang_us&q='+query_word+'&hl=en-US&as_vis=1&as_sdt=1,5'
        if(initial_time != None):
            q = q + '&as_ylo=' + initial_time
        if(end_time != None):
            q = q +'&as_yhi=' + end_time
        searchFilter = scholarly.search_pubs_custom_url(q)
        return [next(searchFilter).bib['abstract'] for i in range(size)]
    @staticmethod
    def words_stop(stopPath):
        stop_words = set(stopwords.words('english'))
        with open(stopPath, "rb") as msw:
            my_stops = msw.read().decode('utf-8').split("\r\n")            
            stop_words.extend(my_stops)
        return stop_words
    @staticmethod    
    def clean(doc, stopPath):
        exclude = set(string.punctuation)
        lemma = WordNetLemmatizer()          
        stop = set(stopwords.words('english'))      
        my_stops = Research.words_stop(stopPath)
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        return " ".join([i for i in normalized.split() if i not in my_stops])
    @staticmethod
    def save(path, doc_list):
        with open(path, "wb") as file:
            doc_str = str(doc_list).encode("utf-8")
            file.write(doc_str)

In [1]:
class Cleaner:
    def __init__(self):
        self.result = []
    @staticmethod
    def clean_xls(xls_file_in=None, xls_file_out=None):
        if not xls_file_in:
            return False
        if not xls_file_out:
            xls_file_out = xls_file_in

        data = pd.read_excel(xls_file_in, index_col=0)
        data = clean_panda(data)
        data.to_excel(xls_file_out)
        
    @staticmethod
    def clean_csv(csv_file_in=None, csv_file_out=None):
        if not csv_file_in:
            return False
        if not csv_file_out:
            csv_file_out = csv_file_in

        data = pd.read_csv(csv_file_in, index_col=0)
        data = clean_panda(data)
        data.to_csv(csv_file_out)

    @staticmethod
    def clean_panda(data):
        data["abstract"] = data["abstract"].apply(lambda x: re.sub("([©]*)\.","",x))#remove copyright's
        data["abstract"] = data["abstract"].apply(lambda x: re.sub('\S*@\S*\s?', '', x))#remove emails
        data["abstract"] = data["abstract"].apply(lambda x: re.sub('\s+', ' ', x))#
        data["abstract"] = data["abstract"].apply(lambda x: re.sub("\'", ' ', x))# remove '
        return data

In [29]:
#Set variables
source = 'Papers.xlsx'
stopWords = 'BlockWords.txt'
clean_path = 'CleanArchive.txt'
log_path = 'BeforeArchive.txt'
topics = 10
words = 8

In [28]:
#prepare docs
#log = research.revoke(source)
doc_complete = Research.parse_xls(source)
doc_clean = [Research.clean(doc, stopWords).split() for doc in doc_complete]


In [33]:
#save this version and alterations
Research.save(log_path, []) #log)
Research.save(clean_path, doc_clean)

In [34]:
#dictionary and matrix
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [35]:
#lda model
LDA = gensim.models.ldamodel.LdaModel

In [36]:
#LDA result
result = LDA(doc_term_matrix, num_topics=topics, eval_every=10, chunksize=words, id2word=dictionary, passes=2)

## Metrics works

Corpus ({iterável da lista de (int, float), SciPy. SPARSE. CSC}, opcional) – fluxo de vetores de documentos ou matriz esparsa de forma (num_terms, num_documents). Se não for fornecido, o modelo é deixado não treinado (presumivelmente porque você deseja chamar Update () manualmente).

gensim.models.ldamodel.LdaModel
(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, per_word_topics=False, callbacks=None, dtype=<type 'numpy.float32'>)


In [38]:
table = result.show_topics(formatted=False)

In [39]:
real = [0.01 for x in range(100)]

In [40]:
predict = []
for x in range(10):
    for y in range(10):
        predict.append(table[x][1][y][1])
predict[:6]

[0.008497698, 0.007894334, 0.007667139, 0.0076317564, 0.006856583, 0.006652674]

In [1]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


# Compute Perplexity
print('\nPerplexity: ', result.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=result, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

NameError: name 'lemmatization' is not defined

In [42]:
print(Arion.mean_absolute_percentage_error(real,predict))
print(Arion.weighted_mean_absolute_percentage_error(real,predict))

36.07335047610104
36.073350239443144
