In [1]:
# General Imports
from dataclasses import dataclass
from essential_generators import DocumentGenerator
import pandas as pd
import random

# NLTK Imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Gemsin Imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary



In [2]:
# Downloads - uncomment to download
#nltk.download(['wordnet', 'stopwords', 'punkt', 'omw-1.4'])

In [3]:
# Defining the dataclass
@dataclass
class Meeting:
    meeting_notes : str

In [14]:
def Create_table(Amount):
    """Creates a mock table of data that will be passed through
    the topic analysis function.
    
    Args:
        Ammount (int): The amount of mock entries you would like
        to generate
        
        Input Example:
        Create_table(100)
    
    Returns:
        Table consisting of:
        Meeting_notes: (String)
    """
    
    # 
    gen = DocumentGenerator()

    meeting = []
    for x in range(Amount):
        
        entry = Meeting(str(gen.sentence()))
        meeting += [entry]
    return meeting

In [11]:
def topic_function(meeting_notes, num_topic):
    """Takes the supplied meeting_notes, and
    returns the number of most probable topics
    (defined by num_topic)."""
    
    temp = []
    
    # Custom stopwords to be removed
    stops = ['ha', 'got']
    
    # Lemmatize, and remove stop words
    for t in word_tokenize(meeting_notes):
        if t.isalpha():
            t = WordNetLemmatizer().lemmatize(t.lower())
            if t not in set(stopwords.words('english') + stops):
                temp.append(t)
    
    # Takes temp and puts it inside another list
    temp2 = []
    temp2.append(temp)
    
    # Create corpus
    corpus_dict = Dictionary(temp2)
    corpus = [corpus_dict.doc2bow(x) for x in temp2]
    
    # Create model
    lda = LdaModel(corpus, num_topics=num_topic,
                   random_state=69,
                   id2word=corpus_dict)
    
    # Match lda output to original word
    _, topic = zip(*lda.id2word.items())
    
    # Return predicted topic
    if num_topic == 1:
        return (topic[0])
    else:
        return (topic[0:num_topic])

In [17]:
# Number of topics to append
num_topic = 3

# Generate mock table of meeting notes
meeting_list = pd.DataFrame(Create_table(100))

# Call the topic function. Appends predicted topic
meeting_list['topic'] = meeting_list.apply(lambda row: topic_function(row.meeting_notes, num_topic), axis=1)

meeting_list.head(10)

Unnamed: 0,meeting_notes,topic
0,"His fables, rotation, axial tilt, and most div...","(axial, diversified, fable)"
1,"Into, a independence (1810–1818.","(independence,)"
2,The foundation is through hashtags. With the,"(foundation, hashtags)"
3,"And stress Carolina parakeet, lived","(carolina, lived, parakeet)"
4,"The other, both increase the population lives ...","(egyptian, increase, life)"
5,Circumstance. See almost 40,"(almost, circumstance, see)"
6,"Climate action."" their profiles, so that the e...","(action, climate, experimenter)"
7,"Be mathematically 1977, the ERP was completely...","(completely, defeated, erp)"
8,And timely reaching zero net per capita income...,"(caput, income, net)"
9,"Neighbours. Domestically, Championship. Bullfi...","(bullfighting, championship, charles)"
