In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import networkx as nx
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize

%matplotlib inline

In [3]:
# Download the NLTK data packages this notebook relies on: the WordNet corpus
# (for WordNetLemmatizer) and the Punkt models (for sent_tokenize).
nltk.download('wordnet')
nltk.download('punkt')
# Shared English Snowball stemmer; read as a global by lemmatize_stemming().
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anjaneyatripathi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anjaneyatripathi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def load_text(file_name, encoding='utf-8'):
    """Read a text file and split its contents into sentences.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list of str
        The sentences produced by ``sent_tokenize`` on the file's text.
    """
    # An explicit encoding keeps decoding identical across operating systems.
    with open(file_name, 'r', encoding=encoding) as file:
        text = file.read()

    return sent_tokenize(text)

In [5]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    Uses a fresh ``WordNetLemmatizer`` and the module-level ``stemmer``.
    Returns the stemmed string.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have 3 or fewer characters are dropped, and each
    remaining token is passed through ``lemmatize_stemming``.

    Returns a list of strings in the order the tokens appeared.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stopwords
    ]

In [6]:
def create_bow_dictionary(sentences):
    """Build a gensim dictionary and bag-of-words corpus from sentences.

    Each sentence is run through ``preprocess``; the resulting token lists
    are used to build a ``gensim.corpora.Dictionary`` and are then converted
    to bag-of-words form with that dictionary's ``doc2bow``.

    Returns a ``(dictionary, bow_corpus)`` tuple, where ``bow_corpus`` holds
    one bag-of-words entry per input sentence, in input order.
    """
    # One token list per sentence.
    token_lists = [preprocess(sentence) for sentence in sentences]

    # Vocabulary over all token lists.
    vocab = gensim.corpora.Dictionary(token_lists)

    # Bag-of-words representation of every sentence.
    bags = list(map(vocab.doc2bow, token_lists))

    return vocab, bags

In [7]:
# creating the LDA model
def create_model(topics, bow_corpus, dictionary, passes=10, workers=2, random_state=None):
    """Train and return a gensim ``LdaMulticore`` topic model.

    Parameters
    ----------
    topics : int
        Number of topics to fit (passed as ``num_topics``).
    bow_corpus : iterable
        Bag-of-words corpus, e.g. as returned by ``create_bow_dictionary``.
    dictionary : gensim.corpora.Dictionary
        Id-to-word mapping (passed as ``id2word``).
    passes : int, optional
        Number of training passes over the corpus. Defaults to 10.
    workers : int, optional
        Number of worker processes. Defaults to 2.
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to gensim. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an int to make runs more repeatable.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    return gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

In [8]:
# parse the quoted keywords out of one topic string returned by print_topics
def extract_keywords(s):
    """Return the double-quoted keywords found in a topic string.

    The expected input looks like ``'0.033*"test" + 0.020*"arrayit"'``; every
    substring enclosed in double quotes is returned, in order of appearance.

    Parameters
    ----------
    s : str
        Topic description string.

    Returns
    -------
    list of str
        The quoted keywords. An empty list when ``s`` contains no quoted text
        (including the empty string).
    """
    # Splitting on the quote character puts quoted content at the odd indices,
    # so '+' characters inside a keyword no longer break it apart and inputs
    # without any quoted term no longer yield a spurious '' entry.
    return s.split('"')[1::2]

In [9]:
def classify_topics(topics, lda_model, sentences):
    """Assign each sentence to the topic(s) whose keywords it matches most.

    Parameters
    ----------
    topics : int
        Number of topics to consider (topic ids ``0 .. topics - 1``).
    lda_model : object
        Trained model exposing ``print_topics(-1)``, which is indexed as
        ``[topic_id][1]`` to obtain each topic's description string.
    sentences : iterable of str
        Raw sentences to classify.

    Returns
    -------
    tuple
        ``(keyword_dict, TOPICS)`` where ``keyword_dict`` maps topic id to the
        list of keywords from ``extract_keywords`` and ``TOPICS`` maps topic
        id to the list of sentences assigned to it.

    Notes
    -----
    A sentence is appended to every topic that ties for the highest keyword
    count. Consequently a sentence matching no keyword at all (all counts 0)
    is appended to every topic.
    """
    # print_topics does not depend on the loop variable: call it once
    # instead of once per topic.
    topic_strings = lda_model.print_topics(-1)

    keyword_dict = {i: extract_keywords(topic_strings[i][1]) for i in range(topics)}
    # Sets for fast membership tests; the returned keyword_dict keeps lists.
    keyword_sets = {i: set(keyword_dict[i]) for i in range(topics)}

    TOPICS = {i: [] for i in range(topics)}

    for sent in sentences:
        processed = preprocess(sent)

        # Per topic, count how many tokens of the sentence are topic keywords
        # (repeated tokens count each time they occur).
        occurrences = [
            sum(1 for word in processed if word in keyword_sets[i])
            for i in range(topics)
        ]

        top_hit = max(occurrences)
        for i in range(topics):
            if occurrences[i] == top_hit:
                TOPICS[i].append(sent)

    return keyword_dict, TOPICS

In [10]:
def get_topics(file_name, topics):
    """Run the full pipeline on a text file and print the sentences per topic.

    Loads sentences from ``file_name``, builds the dictionary and
    bag-of-words corpus, trains the LDA model with ``topics`` topics, and
    groups the sentences by topic via ``classify_topics``. For every topic id
    the id and its sentence list are printed.

    Returns the dict mapping topic id to its list of sentences.
    """
    # Sentences of the input document.
    sentences = load_text(file_name)

    # Vocabulary and bag-of-words corpus for those sentences.
    vocab, corpus = create_bow_dictionary(sentences)

    # Trained LDA model.
    model = create_model(topics, corpus, vocab)

    # Only the sentence grouping is needed here; the keyword dict is unused.
    _, grouped = classify_topics(topics, model, sentences)

    for topic_id in range(topics):
        print('Topic: ', topic_id)
        print(grouped[topic_id])

    return grouped

In [11]:
# Run the pipeline on the sample document, grouping its sentences into 2 topics.
get_topics('docs/text2.txt', 2)

Topic:  0
['The Securities and Exchange Commission today charged Arrayit Corporation, a Sunnyvale, California company, with making false and misleading statements about the development of a COVID-19 test, and with failing to file required financial reports.', "The SEC also charged Arrayit's co-founder and Chief Executive Officer, Rene Schena, with making misrepresentations to investors about the financial reports, and for her role as a control person of the company.", "The SEC's complaint alleges that, in March and April 2020, Arrayit falsely stated to investors that it had developed a COVID-19 blood test.", 'In fact, as alleged, at the time, Arrayit had not yet purchased materials needed to make a test.', 'The complaint further alleges that Arrayit falsely asserted to investors that it had submitted the test for emergency approval and that there was a high demand for the test.', 'Additionally, the complaint alleges that, between October 2018 and March 2019, Arrayit issued a series of 

{0: ['The Securities and Exchange Commission today charged Arrayit Corporation, a Sunnyvale, California company, with making false and misleading statements about the development of a COVID-19 test, and with failing to file required financial reports.',
  "The SEC also charged Arrayit's co-founder and Chief Executive Officer, Rene Schena, with making misrepresentations to investors about the financial reports, and for her role as a control person of the company.",
  "The SEC's complaint alleges that, in March and April 2020, Arrayit falsely stated to investors that it had developed a COVID-19 blood test.",
  'In fact, as alleged, at the time, Arrayit had not yet purchased materials needed to make a test.',
  'The complaint further alleges that Arrayit falsely asserted to investors that it had submitted the test for emergency approval and that there was a high demand for the test.',
  'Additionally, the complaint alleges that, between October 2018 and March 2019, Arrayit issued a series