In [7]:
dialog_sample = ["""Hi, I am having trouble with my computer. It is not working."""]
dialog_sample.append("""Hmm.. What's the behavior of your computer?""")
dialog_sample.append("""When I was playing a game, it suddenly shut down. Then I tried to turn it on again, but it didn't work.""")
dialog_sample.append("""Oh, I see. What's the model of your computer?""")
dialog_sample.append("""It's a Dell XPS 15 9570.""")
dialog_sample.append("""Hmm.. I see. What's the operating system of your computer?""")
dialog_sample.append("""It's Windows 10.""")
dialog_sample.append("""Gotya. Please wait a moment while I check the issue.""")

# dialog_sample = ' '.join(dialog_sample)

## LDA Topic Extraction

LDA is not able to output the exact topic of a document in text format. It can only distinguish different documents for different topics. For example, it can say, in the three documents, document 1 and document 2 have the same topic, but it can't output their topic 'sports'.

In details, see https://zhuanlan.zhihu.com/p/29932017.

In [50]:
import nltk
import gensim
from nltk.stem import WordNetLemmatizer

def lemmatize_stemming(text):
    stemmer = nltk.stem.PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

processed_docs = [preprocess(sentence) for sentence in dialog_sample]
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 3, id2word = dictionary, passes = 10, workers = 2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.108*"issu" + 0.108*"check" + 0.108*"wait" + 0.108*"moment" + 0.108*"gotya" + 0.108*"window" + 0.027*"behavior" + 0.027*"model" + 0.027*"oper" + 0.027*"dell"
Topic: 1 
Words: 0.100*"troubl" + 0.100*"have" + 0.100*"dell" + 0.100*"oper" + 0.100*"behavior" + 0.100*"model" + 0.099*"work" + 0.025*"window" + 0.025*"gotya" + 0.025*"check"
Topic: 2 
Words: 0.101*"work" + 0.100*"tri" + 0.100*"suddenli" + 0.100*"game" + 0.100*"shut" + 0.100*"play" + 0.100*"turn" + 0.025*"model" + 0.025*"oper" + 0.025*"dell"


## spaCy Majority Vote on Each Sentence

A traditional implementation without using spaCy is shown below. If you want to use the spaCy one, try: https://betterprogramming.pub/extract-keywords-using-spacy-in-python-4a8415478fbf

In [48]:
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# we want to extract the entity of each sentence in the dialog, then do a majority vote to get the final entity
votes_for_dialog = []
for sentence in dialog_sample:
    token_list = word_tokenize(sentence)
    # remove punctuations
    token_list = [re.sub(r'[^\w\s]','',token) for token in token_list]
    token_list = [token for token in token_list if token != '']
    token_list = [lemmatizer.lemmatize(token) for token in token_list]
    votes_for_sentence = []
    for token in token_list:
        # remove stopwords, ignore case
        if token.lower() in stopwords.words('english'):
            continue   
        votes_for_sentence.append(token)
    votes_for_dialog.append(votes_for_sentence)

print(votes_for_dialog)

# do a majority vote
from collections import Counter
votes = Counter([item for sublist in votes_for_dialog for item in sublist])
votes.most_common(1)

[['Hi', 'trouble', 'computer', 'working'], ['Hmm', 'behavior', 'computer'], ['wa', 'playing', 'game', 'suddenly', 'shut', 'tried', 'turn', 'nt', 'work'], ['Oh', 'see', 'model', 'computer'], ['Dell', 'XPS', '15', '9570'], ['Hmm', 'see', 'operating', 'system', 'computer'], ['Windows', '10'], ['Gotya', 'Please', 'wait', 'moment', 'check', 'issue']]


[('computer', 4)]