In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import networkx as nx
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize

%matplotlib inline

In [2]:
# Make sure the NLTK resources are available locally; the lemmatizer and the
# sentence tokenizer used in later cells are expected to rely on them.
for _resource in ('wordnet', 'punkt'):
    nltk.download(_resource)

# Shared English Snowball stemmer used by lemmatize_stemming().
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anjaneyatripathi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anjaneyatripathi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_text(file_name, encoding='utf-8'):
    """Read a text file and split its contents into sentences.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Encoding used to decode the file. Defaults to UTF-8 so that the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list
        The sentences returned by ``sent_tokenize`` for the whole file text.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content cannot be decoded with ``encoding``.
    """
    # The context manager closes the file before tokenisation starts.
    with open(file_name, 'r', encoding=encoding) as file:
        text = file.read()

    return sent_tokenize(text)

In [4]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    Uses a fresh ``WordNetLemmatizer`` for the lemma and the module-level
    ``stemmer`` for the stem. Returns whatever ``stemmer.stem`` returns.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that
    are gensim stopwords or have three characters or fewer are dropped, and
    every remaining token is passed through ``lemmatize_stemming``.
    """
    tokens = gensim.utils.simple_preprocess(text)
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    return [
        lemmatize_stemming(token)
        for token in tokens
        if len(token) > 3 and token not in stopwords
    ]

In [5]:
def create_bow_dictionary(sentences):
    """Build a gensim dictionary and bag-of-words corpus from sentences.

    Each sentence is run through ``preprocess``; the resulting token lists
    feed ``gensim.corpora.Dictionary`` and are then converted one by one
    with the dictionary's ``doc2bow``.

    Returns a ``(dictionary, bow_corpus)`` tuple, where ``bow_corpus`` is a
    list holding one ``doc2bow`` result per input sentence.
    """
    # normalise every sentence into its list of processed tokens
    tokenized = [preprocess(sentence) for sentence in sentences]

    # build the vocabulary over all processed sentences
    vocab = gensim.corpora.Dictionary(tokenized)

    # encode each processed sentence against that vocabulary
    encoded = list(map(vocab.doc2bow, tokenized))

    return vocab, encoded

In [6]:
# creating the LDA model
def create_model(topics, bow_corpus, dictionary, passes=10, workers=2, random_state=None):
    """Train a gensim ``LdaMulticore`` topic model on a bag-of-words corpus.

    Parameters
    ----------
    topics : int
        Number of topics to fit (forwarded as ``num_topics``).
    bow_corpus : list
        Bag-of-words corpus, e.g. as returned by ``create_bow_dictionary``.
    dictionary : gensim.corpora.Dictionary
        Vocabulary forwarded as ``id2word``.
    passes : int, optional
        Number of training passes over the corpus (default 10).
    workers : int, optional
        Number of worker processes (default 2).
    random_state : optional
        Seed forwarded to gensim. The default ``None`` keeps gensim's own
        unseeded behaviour; pass an int when repeatable topics are wanted.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    return gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

In [7]:
# parsing the result of the topics generated
def extract_keywords(s):
    """Return the double-quoted terms found in a topic description string.

    The input is expected to look like ``0.015*"court" + 0.012*"judg"``,
    which is the kind of string ``classify_topics`` passes in from
    ``lda_model.print_topics``.

    Parameters
    ----------
    s : str
        Topic description containing terms wrapped in double quotes.

    Returns
    -------
    list of str
        The quoted terms in order of appearance. A string without any
        quoted term yields an empty list.
    """
    # Splitting on the quote character alternates outside/inside pieces, so
    # the quoted terms sit at the odd positions. Characters such as '+'
    # inside a quoted term are kept as part of that term.
    return s.split('"')[1::2]

In [8]:
def classify_topics(topics, lda_model, sentences):
    """Assign each sentence to the topic(s) whose keywords it matches most.

    Parameters
    ----------
    topics : int
        Number of topics; topic ids ``0 .. topics - 1`` are used as keys.
    lda_model
        Trained model whose ``print_topics(-1)`` result is indexed by topic
        position and parsed with ``extract_keywords``.
    sentences : iterable of str
        Raw sentences to classify.

    Returns
    -------
    tuple
        ``(keyword_dict, TOPICS)`` where ``keyword_dict`` maps each topic id
        to its list of keywords and ``TOPICS`` maps each topic id to the
        list of sentences assigned to it.

    Notes
    -----
    A sentence is added to every topic that reaches the highest keyword
    count. Ties therefore place a sentence in several topics, and a sentence
    that matches no keyword at all is added to all topics.
    """
    # Render the topic descriptions once: the call formats every topic, so
    # repeating it per topic would redo the same work.
    topic_descriptions = lda_model.print_topics(-1)

    keyword_dict = {
        i: extract_keywords(topic_descriptions[i][1]) for i in range(topics)
    }
    # Sets give constant-time membership tests; the lists are what is returned.
    keyword_sets = [set(keyword_dict[i]) for i in range(topics)]

    TOPICS = {i: [] for i in range(topics)}

    for sent in sentences:
        processed = preprocess(sent)
        # Repeated words count once per occurrence.
        occurrences = [
            sum(1 for word in processed if word in keywords)
            for keywords in keyword_sets
        ]
        # default=0 keeps the zero-topic case from raising on an empty list.
        top_hit = max(occurrences, default=0)
        for i, hits in enumerate(occurrences):
            if hits == top_hit:
                TOPICS[i].append(sent)

    return keyword_dict, TOPICS

In [16]:
def get_topics(file_name, topics):
    """Run the whole pipeline on a text file and print sentences per topic.

    Loads the sentences from ``file_name``, builds the dictionary and
    bag-of-words corpus, trains the LDA model with ``topics`` topics and
    classifies every sentence. Each topic id and its sentence list are
    printed, and the topic-to-sentences dict is returned.
    """
    # sentences of the input file
    sentences = load_text(file_name)

    # vocabulary + bag-of-words encoding, then the model trained on them
    dictionary, bow_corpus = create_bow_dictionary(sentences)
    lda_model = create_model(topics, bow_corpus, dictionary)

    # only the sentence grouping is needed here; the keywords are discarded
    _, sentences_by_topic = classify_topics(topics, lda_model, sentences)

    for topic_id in range(topics):
        print('Topic: ', topic_id)
        print(sentences_by_topic[topic_id])

    return sentences_by_topic

In [23]:
# Run the full pipeline on text3.txt with 2 topics. get_topics prints each
# topic's sentences and, because the call is the cell's last expression, the
# returned topic -> sentences dict is echoed below the printed output as well.
get_topics('text3.txt', 2)

Topic:  0
['On February 1, 2021, Judge Denise L. Cote of the U.S. District Court for the Southern District of New York entered a partial judgment against David Hu, the co-founder and chief investment officer of International Investment Group (IIG), a formerly registered investment adviser, enjoining Hu from violating the antifraud provisions of the federal securities laws.', "The Commission revoked IIG's registration in November 2019.", 'The complaint, filed on July 17, 2020, charges Hu with violating Section 17(a) of the Securities Act of 1933, Section 10(b) of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder, and Sections 206(1) and 206(2) of the Investment Advisers Act of 1940.', 'Hu consented to a bifurcated settlement, agreeing to be permanently enjoined from violations of the charged provisions with monetary relief in an amount to be determined by the court at a later date upon motion of the Commission.', 'On March 30, 2020, the SEC obtained a final judgment on conse

{0: ['On February 1, 2021, Judge Denise L. Cote of the U.S. District Court for the Southern District of New York entered a partial judgment against David Hu, the co-founder and chief investment officer of International Investment Group (IIG), a formerly registered investment adviser, enjoining Hu from violating the antifraud provisions of the federal securities laws.',
  "The Commission revoked IIG's registration in November 2019.",
  'The complaint, filed on July 17, 2020, charges Hu with violating Section 17(a) of the Securities Act of 1933, Section 10(b) of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder, and Sections 206(1) and 206(2) of the Investment Advisers Act of 1940.',
  'Hu consented to a bifurcated settlement, agreeing to be permanently enjoined from violations of the charged provisions with monetary relief in an amount to be determined by the court at a later date upon motion of the Commission.',
  'On March 30, 2020, the SEC obtained a final judgment on con