# Natural Language Processing 

In [4]:
import pandas as pd
import numpy as np
import re
import pickle

from nltk.corpus import stopwords
import string
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from gensim.models import LsiModel

# system
import sys
import os
from os.path import isdir, isfile, join

unable to import 'smart_open.gcs', disabling that module


# Retrieving Meetings Transcripts 

In [5]:
import xml.etree.ElementTree as ET
import os
words_list = []
path = 'data/words'

for root, dirs, files in os.walk(path, topdown=False):
    for name in files:
        tree = ET.parse(os.path.join(root, name))
        root_doc = tree.getroot()
        for child in root_doc:
            child.attrib['word'] = child.text
            words_list.append(child.attrib)

In [6]:
# split the text and extract id, id_session, id_num
for i in words_list:
    id_st = re.split(r"(\w+)", i['{http://nite.sourceforge.net/}id'], re.I)
    id_num = re.split(r"(\w+.\w.[a-zA-Z]+)", id_st[4], re.I)

    i['id']=id_st[1]
    i['id_session']= id_st[3]
    i['id_num']=id_num[2]

In [7]:
words = pd.DataFrame(words_list).drop('{http://nite.sourceforge.net/}id', axis=1)
words = words[['id', 'id_session', 'id_num', 'word', 'starttime', 'endtime', 'punc', 'trunc', 'type', 'mispronounced', 'pron', 'errortype', 'w']]

In [8]:
words.head()

Unnamed: 0,id,id_session,id_num,word,starttime,endtime,punc,trunc,type,mispronounced,pron,errortype,w
0,EN2001a,A,0,Okay,5.57,5.94,,,,,,,
1,EN2001a,A,1,.,5.94,5.94,True,,,,,,
2,EN2001a,A,2,Does,11.09,11.25,,,,,,,
3,EN2001a,A,3,anyone,11.25,11.5,,,,,,,
4,EN2001a,A,4,want,11.5,11.65,,,,,,,


creating a list of NLTK stopwords 

In [9]:
sw =stopwords.words('english')
punk=list(string.punctuation)

In [10]:
# create a list of meeting id 
meeting_list = words['id'].unique()

In [11]:
# removing all stopwords and punctuation form all meetings
meetings=[]
for meeting in meeting_list:
    meeting_temp=words[words['id']==meeting]['word']
    meeting_temp=[str(w).lower() for w in list(meeting_temp) if not w in sw] 
    meetings.append([str(w).lower() for w in meeting_temp if not w in punk])

In [12]:
# create a dictionary and bag of words 
dictionary = corpora.Dictionary(meetings) 
corpus = [dictionary.doc2bow(text) for text in meetings]

In [13]:
# calculate the tfidf weights for the corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [14]:
# create a topic model
lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=6)
# discover the topics associated with the 8th document of the corpus
vec_lda = lda[corpus[8]]
vec_lda

[(1, 0.012762212), (3, 0.014098733), (4, 0.5823612), (5, 0.39068413)]

In [15]:
lda.print_topics()


[(0,
  '0.000*"scroll" + 0.000*"controller" + 0.000*"criteria" + 0.000*"g_d_f_" + 0.000*"remote" + 0.000*"mushroom" + 0.000*"wheel" + 0.000*"buttons" + 0.000*"rubber" + 0.000*"voice"'),
 (1,
  '0.001*"rubber" + 0.001*"l_c_d_" + 0.000*"remote" + 0.000*"titanium" + 0.000*"banana" + 0.000*"spongy" + 0.000*"buttons" + 0.000*"sort" + 0.000*"recognition" + 0.000*"fruit"'),
 (2,
  '0.001*"remote" + 0.000*"buttons" + 0.000*"animal" + 0.000*"teletext" + 0.000*"speech" + 0.000*"menu" + 0.000*"cat" + 0.000*"l_c_d_" + 0.000*"lounge" + 0.000*"functions"'),
 (3,
  '0.001*"remote" + 0.000*"scroll" + 0.000*"rubber" + 0.000*"controller" + 0.000*"joystick" + 0.000*"l_c_d_" + 0.000*"buttons" + 0.000*"office" + 0.000*"control" + 0.000*"voice"'),
 (4,
  '0.001*"remote" + 0.001*"animal" + 0.000*"l_c_d_" + 0.000*"dog" + 0.000*"cat" + 0.000*"draw" + 0.000*"buttons" + 0.000*"favourite" + 0.000*"sort" + 0.000*"turtle"'),
 (5,
  '0.000*"l_c_d_" + 0.000*"banana" + 0.000*"remote" + 0.000*"seven" + 0.000*"wheel" + 

In [18]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [19]:
# Visualize the topics
vis = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
pyLDAvis.save_html(vis, 'lda.html')