In [1]:
import nltk 
# nltk.download('all')   # no need to download all packages, uncomment it for 1st time running code
from nltk.stem import WordNetLemmatizer
import re    # regular expression
import numpy as np
import pandas as pd
from pprint import pprint  # prettier print
import gensim       # Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

# disable warnings
import warnings
warnings.filterwarnings(action='ignore')

  from collections import Mapping


In [2]:
# load the save the english comments corpus into a pickle file as a list
pickle_in = open("english_comments.pickle","rb")
english_comments = pickle.load(pickle_in)    # loads the list of comments
pprint(english_comments[:1])

['Our stay with Marcus in Bristol was fantastic in every way! He was a great '
 'host - picking us up at the bus stop, recommending places to try, leaving '
 'plenty of pastries and other breakfast items to enjoy in the morning. The '
 'flat itself was modern, bright, clean and spacious - and best of all, right '
 "on Bristol's lovely harbourside. We will definitely stay again next time "
 "we're in Bristol - thanks again Marcus!"]


In [3]:
# put the list into dataframe to view the nature of texts 
df = pd.DataFrame(english_comments,columns=['comments'])
pprint(df.head(5))
print('\n\n')
pprint(df.info())

                                            comments
0  Our stay with Marcus in Bristol was fantastic ...
1  Marcus is a brilliant, warm and friendly host....
2  My mum Angela and I have stayed at Marcus' ama...
3  Marcus was an exceptional host. I only stayed ...
4  Marcus was welcoming, easy going and very help...



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96419 entries, 0 to 96418
Data columns (total 1 columns):
comments    96419 non-null object
dtypes: object(1)
memory usage: 753.4+ KB
None


In [4]:
# tokenize sentences and remove panctuations and emojis
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


In [5]:
tokenized_texts = list(sent_to_words(english_comments))
print(tokenized_texts[:2])


[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [6]:
# see the most common and most rare words
from collections import  Counter    # dictionary collection
def get_words_frequency(texts):
    counter = Counter()
    for  doc in texts:
        for word in doc:
            counter[word]+=1
    return counter

word_freq = get_words_frequency(tokenized_texts)   
#pprint(word_freq) # uncomment this line to see that @ the end of the printed texts, still chinise words in there.
first10pairs = {k for k in sorted(word_freq.items(),key=lambda x: x[1],reverse=True)[:20]}
first10pairs

{('and', 186280),
 ('bristol', 26587),
 ('clean', 25628),
 ('for', 47042),
 ('great', 49954),
 ('in', 64651),
 ('is', 63608),
 ('it', 32402),
 ('location', 30508),
 ('lovely', 30187),
 ('of', 41167),
 ('place', 32685),
 ('stay', 43553),
 ('the', 154077),
 ('to', 93356),
 ('very', 59606),
 ('was', 73952),
 ('we', 42815),
 ('with', 37414),
 ('you', 26730)}

In [7]:
# to filter chinese words or get only english words
def get_english_words(documents):
    ''' filters out any non-english words'''
    english_words = [[ w for w in doc if re.match(r'[A-Z, a-z]', w)] for doc in documents]
    return english_words


fitered_english_words = get_english_words(tokenized_texts)
print(fitered_english_words[:2])

[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [8]:
# see words and their frequency
words_freq=  get_words_frequency(fitered_english_words)
first10pairs = {k: words_freq[k] for k in sorted(words_freq.keys())[:20]}
first10pairs

{'aa': 4,
 'aaalso': 1,
 'aagain': 1,
 'aardman': 1,
 'aardvark': 1,
 'aaron': 3,
 'aasy': 1,
 'ab': 4,
 'aback': 2,
 'abandoned': 5,
 'abase': 1,
 'abb': 3,
 'abbey': 11,
 'abbeywood': 1,
 'abbi': 12,
 'abbiamo': 1,
 'abbie': 3,
 'abbots': 1,
 'abby': 2,
 'abc': 2}

In [9]:
# Check the most common or frequent words
most_common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
most_common_words[:20]   # first 20 words

[('and', 186280),
 ('the', 154077),
 ('to', 93356),
 ('was', 73952),
 ('in', 64651),
 ('is', 63608),
 ('very', 59606),
 ('great', 49954),
 ('for', 47042),
 ('stay', 43553),
 ('we', 42815),
 ('of', 41167),
 ('with', 37414),
 ('place', 32685),
 ('it', 32402),
 ('location', 30508),
 ('lovely', 30187),
 ('you', 26730),
 ('bristol', 26587),
 ('clean', 25628)]

In [10]:
# see the rare words 
rare_words = sorted(word_freq.items(), key=lambda x: x[1])   # NB: word_freq is a dictionary colloction
pprint(rare_words[:10])  #  top 10 rare words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('boatload', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1)]


In [11]:
# Build the bigram & trigram models  [the model creates 2 or 3 wordy, most common occuring  phares]
bigram = gensim.models.Phrases(fitered_english_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[fitered_english_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[fitered_english_words[0]]])

['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus']


In [12]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
print('number of stop words:',len(stop_words))

# define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    data_out =[]
    for doc in texts:
            data_out.append([lemmatizer.lemmatize(word) for word in doc])
    return data_out 

number of stop words: 179


In [13]:
# remove the stopwords
data_words_nostops = remove_stopwords(fitered_english_words)
print("after removing stop words:",data_words_nostops[0])

# create bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
print("\nbigram data words:",data_words_bigrams[0])

#  lemmatize nouns
lemmatized_data = lemmatize_texts(data_words_bigrams)
print('\n\nAfter lemmatization:',lemmatized_data[:1])

after removing stop words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']

bigram data words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']


After lemmatization: [['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plen

In [14]:
# after removing stop words or pre-processing : check the most common and very the rare words
wf = get_words_frequency(lemmatized_data)
most_common_words = sorted(wf.items(), key=lambda x: x[1], reverse=True)
most_common_words[:20]   # top 20 most common words

[('great', 49961),
 ('stay', 43840),
 ('place', 35320),
 ('location', 30625),
 ('lovely', 30188),
 ('host', 28772),
 ('bristol', 26618),
 ('clean', 25632),
 ('room', 23979),
 ('would', 21626),
 ('really', 19666),
 ('house', 19255),
 ('nice', 18826),
 ('comfortable', 18213),
 ('recommend', 18204),
 ('good', 16698),
 ('flat', 16037),
 ('u', 13653),
 ('home', 13089),
 ('everything', 12855)]

In [15]:
# after removing stop words or pre-processing : check the rare words
wf = get_words_frequency(lemmatized_data)
rare_words = sorted(wf.items(), key=lambda x: x[1])
pprint(rare_words[:20])   # first 20 words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1),
 ('macus', 1),
 ('beet', 1),
 ('fraternity', 1),
 ('spotlesly', 1),
 ('guast', 1),
 ('vegfest', 1),
 ('deny', 1),
 ('wecolming', 1),
 ('strives', 1),
 ('morethe', 1),
 ('lovliness', 1)]


In [16]:
def remove_rare_words(texts,words_freq):
    '''removes the rare words'''
    words_data = [[w for w in doc if words_freq[w]>7] for doc in texts]
    return words_data


In [17]:
# remove the rare words from texts
words_data = remove_rare_words(lemmatized_data,wf)
print(words_data[:5])

[['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plenty', 'pastry', 'breakfast', 'item', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus'], ['marcus', 'brilliant', 'warm', 'friendly', 'host', 'picked', 'u', 'railway_station', 'took', 'anne', 'doctor', 'drove', 'u', 'around', 'wherever', 'needed', 'go', 'bristol', 'dropped', 'u', 'back', 'railway_station', 'leaving', 'flat', 'modern', 'comfortable', 'clean', 'well', 'heated', 'marcus', 'provided', 'u', 'everything', 'could', 'wish', 'wish', 'could', 'stayed', 'longer'], ['mum', 'angela', 'stayed', 'marcus', 'amazing', 'apartment', 'two', 'week', 'august', 'relocating', 'bristol', 'lovely', 'experience', 'host', 'apartment', 'extremely', 'confortable', 'located', 'nice', 'area', 'bristol', 'close',

In [18]:
# create dictionary
id2word = corpora.Dictionary(words_data)
print("number of keys:",len(id2word.keys()))
# view the first word of the dictionary
print(id2word[0])   # see word with key =0

# create corpus
corpus_texts= words_data
print(corpus_texts[:3])

# bag of words or vector spaces
corpus = [id2word.doc2bow(text) for text in corpus_texts]
print(corpus[:2])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

number of keys: 5785
best
[['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plenty', 'pastry', 'breakfast', 'item', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus'], ['marcus', 'brilliant', 'warm', 'friendly', 'host', 'picked', 'u', 'railway_station', 'took', 'anne', 'doctor', 'drove', 'u', 'around', 'wherever', 'needed', 'go', 'bristol', 'dropped', 'u', 'back', 'railway_station', 'leaving', 'flat', 'modern', 'comfortable', 'clean', 'well', 'heated', 'marcus', 'provided', 'u', 'everything', 'could', 'wish', 'wish', 'could', 'stayed', 'longer'], ['mum', 'angela', 'stayed', 'marcus', 'amazing', 'apartment', 'two', 'week', 'august', 'relocating', 'bristol', 'lovely', 'experience', 'host', 'apartment', 'extremely', 'confortable', 'located', 'nice', '

[[('best', 1),
  ('breakfast', 1),
  ('bright', 1),
  ('bristol', 3),
  ('bus', 1),
  ('clean', 1),
  ('definitely', 1),
  ('enjoy', 1),
  ('every', 1),
  ('fantastic', 1),
  ('flat', 1),
  ('great', 1),
  ('harbourside', 1),
  ('host', 1),
  ('item', 1),
  ('leaving', 1),
  ('lovely', 1),
  ('marcus', 2),
  ('modern', 1),
  ('morning', 1),
  ('next', 1),
  ('pastry', 1),
  ('picking', 1),
  ('place', 1),
  ('plenty', 1),
  ('recommending', 1),
  ('right', 1),
  ('spacious', 1),
  ('stay', 2),
  ('stop', 1),
  ('thanks', 1),
  ('time', 1),
  ('try', 1),
  ('u', 1),
  ('way', 1)]]

How does LDA work or converge?
`
For each document d, compute P( topic t | document d ) := proportion of words in document d that are assigned to topic t
For each topic t, P( word w | topic t ) := proportion of assignments to topic t that come from word w (across all documents)
For each word w, reassign topic t’, where we choose topic t’ with probability P( topic t’ | word w ) = P( topic t’ | document d ) * P( word w | topic t’ )`

In [19]:
import time
start_time = time.time()
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

done_time = time.time()

pprint(" %.3f secs" % (done_time - start_time))

' 512.442 secs'


In [20]:
# Print the 10 keyword in topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.249*"nice" + 0.219*"good" + 0.062*"value" + 0.043*"shower" + '
  '0.040*"people" + 0.036*"money" + 0.012*"window" + 0.010*"service" + '
  '0.010*"drink" + 0.010*"load"'),
 (1,
  '0.077*"bit" + 0.044*"milk" + 0.040*"booking" + 0.030*"bread" + '
  '0.030*"heating" + 0.025*"another" + 0.023*"fridge" + 0.022*"outstanding" + '
  '0.021*"inside" + 0.020*"others"'),
 (2,
  '0.104*"clean" + 0.094*"lovely" + 0.072*"really" + 0.060*"room" + '
  '0.058*"house" + 0.056*"comfortable" + 0.037*"bed" + 0.034*"home" + '
  '0.030*"friendly" + 0.026*"communication"'),
 (3,
  '0.089*"coffee" + 0.071*"couple" + 0.063*"tea" + 0.039*"especially" + '
  '0.033*"group" + 0.033*"sofa" + 0.024*"decor" + 0.024*"wanted" + '
  '0.023*"basic" + 0.023*"slept"'),
 (4,
  '0.049*"well" + 0.045*"easy" + 0.041*"city" + 0.040*"space" + 0.038*"centre" '
  '+ 0.032*"area" + 0.030*"close" + 0.030*"walk" + 0.028*"quiet" + '
  '0.027*"check"'),
 (5,
  '0.054*"u" + 0.035*"also" + 0.032*"kitchen" + 0.024*"could" + 0.024*

` NB: next steps (a)further preprocesing texts,like removing the firsrt few most common words,(b)interpretation of the topics,(c)visualizing the topics clusters, and (d) documents ranking...`
--------------------
`references`: gensim documentation 