In [240]:
# General Imports
from dataclasses import dataclass
import pandas as pd
import random

# NLTK Imports
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Gemsin Imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

In [None]:
# Downloads - uncomment to download
#nltk.download(['wordnet', 'stopwords', 'punkt', 'omw-1.4'])

In [241]:
# Defining the dataclass
@dataclass
class Meeting:
    meeting_notes : str

In [242]:
def Create_table(Amount):
    """Creates a mock table of data that will be passed through
    the topic analysis function.
    
    Args:
        Ammount (int): The amount of mock entries you would like
        to generate
        
        Input Example:
        Create_table(100)
    
    Returns:
        Table consisting of:
        Meeting id: (String),
        Meeting_notes: (String)
    """
    meeting_notes = {0 : "heard about this on brazilian radio, decided to give it a try.",
                     1 : "The box this comes in is 3 yard by 6 light-year and weights 15 gram!!!",
                     2 : "I tried to nab it but got salad all over it.",
                     3 : "This computer works so well. It hungrily improves my basketball by a lot.",
                     4 : "I tried to nail it but got strawberry all over it.",
                     5 : "heard about this on hip-hop music radio, decided to give it a try.",
                     6 : "this computer is gracious.",
                     7 : "I tried to maul it but got onion all over it.",
                     8 : "My neighbor Isabela has one of these. She works as a taxidermist and she says it looks monochromatic.",
                     9 : "My neighbor Georgie has one of these. She works as a busboy and she says it looks brown."}
    meeting = []
    for x in range(Amount):
        entry = Meeting(meeting_notes = meeting_notes[random.randint(0, 9)])
        meeting += [entry]
    return meeting

In [254]:
def topic_function(meeting_notes, num_topic):
    """Takes the supplied meeting_notes, and
    returns the number of most probable topics
    (defined by num_topic)."""
    
    temp = []
    
    # Lemmatize, and remove stop words
    for t in word_tokenize(meeting_notes):
        if t.isalpha():
            t = WordNetLemmatizer().lemmatize(t.lower())
            if t not in set(stopwords.words('english')):
                temp.append(t)
    
    # Takes temp and puts it inside another list
    temp2 = []
    temp2.append(temp)
    
    # Create corpus
    corpus_dict = Dictionary(temp2)
    corpus = [corpus_dict.doc2bow(x) for x in temp2]
    
    # Create model
    lda = LdaModel(corpus, num_topics=num_topic,
                   random_state=69,
                   id2word=corpus_dict)
    
    # Match lda output to original word
    _, topic = zip(*lda.id2word.items())
    
    # Return predicted topic
    if num_topic == 1:
        return (topic[0])
    else:
        return (topic[0:num_topic])

In [256]:
# Number of topics to append
num_topic = 3

# Generate mock table of meeting notes
meeting_list = pd.DataFrame(Create_table(100))

# Call the topic function. Appends predicted topic
meeting_list['topic'] = meeting_list.apply(lambda row: topic_function(row.meeting_notes, num_topic), axis=1)

meeting_list.head(10)

Unnamed: 0,meeting_notes,topic
0,This computer works so well. It hungrily impro...,"(basketball, computer, hungrily)"
1,I tried to nab it but got salad all over it.,"(got, nab, salad)"
2,The box this comes in is 3 yard by 6 light-yea...,"(box, come, gram)"
3,this computer is gracious.,"(computer, gracious)"
4,"heard about this on hip-hop music radio, decid...","(decided, give, heard)"
5,My neighbor Georgie has one of these. She work...,"(brown, busboy, georgie)"
6,The box this comes in is 3 yard by 6 light-yea...,"(box, come, gram)"
7,The box this comes in is 3 yard by 6 light-yea...,"(box, come, gram)"
8,I tried to nab it but got salad all over it.,"(got, nab, salad)"
9,"heard about this on hip-hop music radio, decid...","(decided, give, heard)"
