In [1]:
import os.path
from gensim import corpora
from gensim.models import LdaMulticore
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import wordnet
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [38]:
import fitz
import os
def text_from_pdf(folder):
    docs=[]
    files=os.listdir(folder)
    for i in range(len(files)):
        filename= os.path.join(folder, files[i])
        with fitz.open(filename) as doc:
            for page in doc:
                text = page.get_text() 
                text = text.lower().replace("\n"," ").replace("  ", " ")
                docs.append(text)
    return docs

docs=text_from_pdf("/Users/SB6282engie.com/Documents/Policy documents/Bristol")


In [47]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = list(set(stopwords.words('english')))+["bristol", "city","policy","strategy","strategies"]
    # Create p_stemmer of class PorterStemmer
    lemmatizer = wordnet.WordNetLemmatizer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        tokens = tokenizer.tokenize(i)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop and not i.isdigit() and len(i)>2]
        # stem tokens
        stemmed_tokens = [lemmatizer.lemmatize(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts
clean_docs= preprocess_data(docs)

In [48]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix
dic,matrix = prepare_corpus(clean_docs)

In [49]:
import time
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    print("Start compute")
    model_list = []
    s1=time.time()
    for num_topics in range(start, stop, step):
        s=time.time()
        print(int((num_topics-start)/step), " / ", len(range(start, stop, step)),"last step took ", s-s1, "s")
        s1=s
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dic, matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    print('highest coherence for : ', np.argmax(coherence_values))
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()
start,stop,step=1,30,1
plot_graph(clean_docs,start,stop,step)

Start compute
0  /  29 last step took  1.6689300537109375e-06 s


NameError: name 'LsiModel' is not defined

In [None]:
def create_gensim_lda_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LdaMulticore(doc_term_matrix, num_topics=number_of_topics, id2word = words)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=7))
    return lsamodel
number_of_topics=16
lsamodel = create_gensim_lda_model(clean_docs, number_of_topics, dic)

In [68]:
SDG=["Poverty", "hunger", "health","education","gender equality","water","energy","economic","equality", "sustainable","production","climate","life", "peace"]





In [69]:
from sense2vec import Sense2Vec 
import numpy as np
s2v = Sense2Vec().from_disk("/Users/SB6282engie.com/Downloads/s2v_old")
def extract_words_topics(lsamodel, number_of_topics):
    L=[]
    for i in range(number_of_topics):
        if len(L) == 0 or len(L[-1])>0:
            L.append([])
        topic=lsamodel.show_topic(i)
        for j in range(len(topic)):
            if topic[j][1]>0:
                L[-1].append(s2v.get_best_sense(topic[j][0]))
    return L
word_topics=extract_words_topics(lsamodel, number_of_topics)

In [70]:

similarity_matrix=np.zeros((len(SDG), len(word_topics)))
for i in range(len(SDG)):
    SDGlist = SDG[i].split(" ")
    for j in range(len(SDGlist)):
        SDGlist[j] = s2v.get_best_sense(SDGlist[j])
    for j in range(len(word_topics)):
        similarity_matrix[i,j] = s2v.similarity(SDGlist, word_topics[j])

In [72]:
topics_names = [SDG[i] for i in np.argmax(similarity_matrix, axis=0)]
for i in range(len(topics_names)):
    print(i, topics_names[i], word_topics[i])

0 sustainable ['strategy|NOUN', 'space|NOUN', 'people|NOUN', 'local|ADJ', 'plan|VERB', 'council|NOUN', 'community|NOUN', 'development|NOUN', 'green|ADJ', 'transport|NOUN']
1 sustainable ['development|NOUN', 'plan|VERB', 'local|ADJ', 'new|ADJ', 'strategy|NOUN', 'policy|NOUN', 'transport|NOUN', 'business|NOUN', 'climate|NOUN', 'waste|VERB']
2 sustainable ['development|NOUN', 'people|NOUN', 'area|NOUN', 'transport|NOUN', 'local|ADJ', 'space|NOUN', 'community|NOUN', 'plan|VERB', 'new|ADJ', 'need|VERB']
3 sustainable ['people|NOUN', 'need|VERB', 'strategy|NOUN', 'area|NOUN', 'local|ADJ', 'community|NOUN', 'service|NOUN', 'new|ADJ', 'space|NOUN', 'development|NOUN']
4 energy ['waste|VERB', 'people|NOUN', 'strategy|NOUN', 'need|VERB', 'local|ADJ', 'service|NOUN', 'provision|NOUN', 'space|NOUN', 'centre|NOUN', 'green|ADJ']
5 climate ['strategy|NOUN', 'development|NOUN', 'local|ADJ', 'plan|VERB', 'change|VERB', 'climate|NOUN', 'centre|NOUN', 'need|VERB', 'area|NOUN', 'housing|NOUN']
6 sustainab

In [75]:
np.argmax(similarity_matrix[:,:], axis=0)

array([ 9,  9,  9,  9,  6, 11,  9,  3,  9,  9, 12,  9, 12,  9, 12,  7])

In [66]:
np.shape(SDG),np.shape(similarity_matrix)

((15,), (15, 16))

In [None]:
for i in s2v:
    print(i)