In [1]:
import nltk 
# nltk.download('all')   # no need to download all packages, uncomment it for 1st time running code
from nltk.stem import WordNetLemmatizer
import re    # regular expression
import numpy as np
import pandas as pd
from pprint import pprint  # prettier print
import gensim       # Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

# disable warnings
import warnings
warnings.filterwarnings(action='ignore')

  from collections import Mapping


In [2]:
# load the save the english comments corpus into a pickle file as a list
pickle_in = open("english_comments.pickle","rb")
english_comments = pickle.load(pickle_in)    # loads the list of comments
pprint(english_comments[:1])

['Our stay with Marcus in Bristol was fantastic in every way! He was a great '
 'host - picking us up at the bus stop, recommending places to try, leaving '
 'plenty of pastries and other breakfast items to enjoy in the morning. The '
 'flat itself was modern, bright, clean and spacious - and best of all, right '
 "on Bristol's lovely harbourside. We will definitely stay again next time "
 "we're in Bristol - thanks again Marcus!"]


In [3]:
# put the list into dataframe to view the nature of texts 
df = pd.DataFrame(english_comments,columns=['comments'])
pprint(df.head(5))
print('\n\n')
pprint(df.info())

                                            comments
0  Our stay with Marcus in Bristol was fantastic ...
1  Marcus is a brilliant, warm and friendly host....
2  My mum Angela and I have stayed at Marcus' ama...
3  Marcus was an exceptional host. I only stayed ...
4  Marcus was welcoming, easy going and very help...



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96419 entries, 0 to 96418
Data columns (total 1 columns):
comments    96419 non-null object
dtypes: object(1)
memory usage: 753.4+ KB
None


In [4]:
# tokenize sentences and remove panctuations and emojis
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


In [5]:
tokenized_texts = list(sent_to_words(english_comments))
print(tokenized_texts[:2])


[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [21]:
# see the most common and most rare words
from collections import  Counter    # dictionary collection
def get_words_frequency(texts):
    counter = Counter()
    for  doc in texts:
        for word in doc:
            counter[word]+=1
    return counter

word_freq = get_words_frequency(tokenized_texts)
pprint(word_freq) # @ the end of the printed texts, still chinise  words in there.
first10pairs = {k: word_freq[k] for k in sorted(word_freq.items(),key=lambda x: x[1],reverse=True)[:20]}
first10pairs

Counter({'and': 186280,
         'the': 154077,
         'to': 93356,
         'was': 73952,
         'in': 64651,
         'is': 63608,
         'very': 59606,
         'great': 49954,
         'for': 47042,
         'stay': 43553,
         'we': 42815,
         'of': 41167,
         'with': 37414,
         'place': 32685,
         'it': 32402,
         'location': 30508,
         'lovely': 30187,
         'you': 26730,
         'bristol': 26587,
         'clean': 25628,
         'room': 22558,
         'would': 21626,
         'host': 20727,
         'really': 19666,
         'house': 19121,
         'nice': 18825,
         'had': 18530,
         'comfortable': 18213,
         'recommend': 18204,
         'at': 17680,
         'as': 17458,
         'good': 16679,
         'this': 16439,
         'were': 16193,
         'flat': 15887,
         'again': 14698,
         'our': 14624,
         'from': 14350,
         'on': 13921,
         'but': 13856,
         'us': 13653,
         'so'

         'communicating': 265,
         'overlooking': 264,
         'interior': 264,
         'zowie': 264,
         'wife': 263,
         'throw': 263,
         'communicated': 263,
         'promptly': 262,
         'summer': 262,
         'toys': 262,
         'butter': 260,
         'olivia': 260,
         'walks': 259,
         'jill': 259,
         'lucky': 258,
         'furniture': 258,
         'montpelier': 258,
         'eco': 258,
         'ben': 258,
         'jim': 258,
         'vibe': 257,
         'range': 257,
         'nuccia': 255,
         'alberto': 255,
         'cara': 255,
         'general': 254,
         'compact': 254,
         'necessary': 253,
         'tesco': 252,
         'understanding': 251,
         'amazingly': 251,
         'case': 251,
         'doesn': 251,
         'conversation': 250,
         'offering': 250,
         'owners': 250,
         'peter': 249,
         'ashley': 249,
         'exceptional': 248,
         'box': 247,
         'matt

         'hoped': 123,
         'relatively': 123,
         'russ': 123,
         'jason': 123,
         'rain': 122,
         'anyway': 122,
         'missing': 122,
         'electric': 122,
         'budget': 122,
         'cycle': 122,
         'unit': 121,
         'fancy': 121,
         'southville': 121,
         'cheers': 121,
         'showers': 121,
         'vegan': 121,
         'provisions': 121,
         'cons': 121,
         'except': 121,
         'closed': 121,
         'hannah': 121,
         'repeat': 120,
         'memorable': 120,
         'nd': 120,
         'fixed': 120,
         'wharf': 120,
         'turn': 120,
         'departure': 119,
         'worry': 119,
         'towel': 119,
         'frequent': 119,
         'annex': 119,
         'joy': 118,
         'weekends': 118,
         'creative': 118,
         'unusual': 118,
         'replying': 118,
         'theatre': 118,
         'soap': 118,
         'sleeper': 118,
         'positioned': 117,
        

         'elsewhere': 62,
         'boutique': 62,
         'similar': 62,
         'rush': 62,
         'aren': 62,
         'styled': 62,
         'require': 62,
         'salt': 62,
         'chosen': 62,
         'answers': 62,
         'paying': 62,
         'val': 62,
         'clive': 62,
         'pru': 62,
         'eli': 62,
         'marina': 61,
         'airb': 61,
         'accommodations': 61,
         'ticket': 61,
         'conditioner': 61,
         'wee': 61,
         'mobile': 61,
         'initially': 61,
         'busses': 61,
         'josie': 61,
         'colleagues': 61,
         'cups': 61,
         'feature': 61,
         'piano': 61,
         'instant': 61,
         'spec': 61,
         'courteous': 61,
         'bowl': 61,
         'hob': 61,
         'becki': 61,
         'naila': 61,
         'juliette': 61,
         'notes': 60,
         'concerned': 60,
         'rough': 60,
         'approx': 60,
         'respect': 60,
         'shows': 60,
         

         'susan': 37,
         'final': 37,
         'landing': 37,
         'climbing': 37,
         'oliver': 37,
         'club': 37,
         'lauren': 37,
         'nina': 37,
         'zero': 37,
         'fishponds': 37,
         'kirsten': 37,
         'gildas': 37,
         'housemate': 36,
         'safely': 36,
         'mid': 36,
         'display': 36,
         'fabulously': 36,
         'memory': 36,
         'heaven': 36,
         'red': 36,
         'icing': 36,
         'changes': 36,
         'sunset': 36,
         'varied': 36,
         'excited': 36,
         'continue': 36,
         'tenants': 36,
         'pleasing': 36,
         'trust': 36,
         'specially': 36,
         'player': 36,
         'watched': 36,
         'wealth': 36,
         'wanna': 36,
         'zone': 36,
         'yr': 36,
         'smelled': 36,
         'obvious': 36,
         'noting': 36,
         'balloons': 36,
         'drawback': 36,
         'lamp': 36,
         'occupied': 36,
  

         'active': 24,
         'soooo': 24,
         'accidentally': 24,
         'housemates': 24,
         'library': 24,
         'artefacts': 24,
         'rating': 24,
         'poco': 24,
         'edgy': 24,
         'clothing': 24,
         'possibility': 24,
         'march': 24,
         'badly': 24,
         'snow': 24,
         'pies': 24,
         'cafetiere': 24,
         'summary': 24,
         'chain': 24,
         'attraction': 24,
         'traveled': 24,
         'smelling': 24,
         'awaits': 24,
         'nobody': 24,
         'sick': 24,
         'cluttered': 24,
         'removed': 24,
         'suburban': 24,
         'asleep': 24,
         'deli': 24,
         'hide': 24,
         'hire': 24,
         'page': 24,
         'singing': 24,
         'alistair': 24,
         'respects': 24,
         'hut': 24,
         'heatwave': 24,
         'crescent': 24,
         'lone': 24,
         'types': 24,
         'iconic': 24,
         'paths': 24,
         'freel

         'sea': 17,
         'ect': 17,
         'mere': 17,
         'accomadating': 17,
         'lodger': 17,
         'lovelier': 17,
         'mentioning': 17,
         'francisca': 17,
         'vast': 17,
         'data': 17,
         'reccomended': 17,
         'ambiance': 17,
         'confidence': 17,
         'stored': 17,
         'trinity': 17,
         'height': 17,
         'renovations': 17,
         'lily': 17,
         'soo': 17,
         'washroom': 17,
         'depends': 17,
         'coded': 17,
         'guidebook': 17,
         'extreme': 17,
         'sammy': 17,
         'hiccups': 17,
         'metro': 17,
         'indicated': 17,
         'offerings': 17,
         'minus': 17,
         'ammenities': 17,
         'sarahs': 17,
         'october': 17,
         'quirks': 17,
         'strolling': 17,
         'counter': 17,
         'thermostat': 17,
         'plumbing': 17,
         'beams': 17,
         'purchase': 17,
         'cecelia': 17,
         'confy

         'knife': 13,
         'micro': 13,
         'rebooked': 13,
         'homeliness': 13,
         'crafty': 13,
         'partying': 13,
         'bryan': 13,
         'glynn': 13,
         'mews': 13,
         'punctual': 13,
         'movement': 13,
         'taller': 13,
         'incorrect': 13,
         'switches': 13,
         'connectivity': 13,
         'photography': 13,
         'pram': 13,
         'cases': 13,
         'toms': 13,
         'elisabeth': 13,
         'sizes': 13,
         'stick': 13,
         'alexander': 13,
         'michaela': 13,
         'chui': 13,
         'chrissi': 13,
         'salvador': 13,
         'muffin': 13,
         'eddy': 13,
         'billy': 13,
         'alistar': 13,
         'gregg': 13,
         'leanne': 13,
         'marcela': 13,
         'downfall': 12,
         'packing': 12,
         'lifts': 12,
         'thinks': 12,
         'presentation': 12,
         'nook': 12,
         'invasive': 12,
         'plain': 12,
     

         'caution': 10,
         'clients': 10,
         'deliver': 10,
         'hangout': 10,
         'growing': 10,
         'hoover': 10,
         'facilitate': 10,
         'arcade': 10,
         'rundown': 10,
         'nearer': 10,
         'reccommended': 10,
         'pie': 10,
         'bedminister': 10,
         'wich': 10,
         'africa': 10,
         'springs': 10,
         'arches': 10,
         'offices': 10,
         'latch': 10,
         'generic': 10,
         'cards': 10,
         'astonishing': 10,
         'stairwell': 10,
         'taps': 10,
         'duplex': 10,
         'serviceable': 10,
         'balconies': 10,
         'cobbled': 10,
         'stood': 10,
         'uneven': 10,
         'sofabed': 10,
         'neatly': 10,
         'scheduled': 10,
         'noon': 10,
         'fyi': 10,
         'bean': 10,
         'hygiene': 10,
         'cubicle': 10,
         'grubby': 10,
         'hate': 10,
         'ppl': 10,
         'dampness': 10,
       

         'groovy': 8,
         'powder': 8,
         'ornaments': 8,
         'headboard': 8,
         'mystery': 8,
         'katina': 8,
         'humans': 8,
         'meetings': 8,
         'nonsense': 8,
         'shifts': 8,
         'volumes': 8,
         'toothbrushes': 8,
         'amounts': 8,
         'nutella': 8,
         'staythanks': 8,
         'designs': 8,
         'crumbs': 8,
         'flaw': 8,
         'interviews': 8,
         'necessarily': 8,
         'gigs': 8,
         'crib': 8,
         'prettiest': 8,
         'zowies': 8,
         'unlocked': 8,
         'marred': 8,
         'superstore': 8,
         'struggling': 8,
         'improv': 8,
         'clogged': 8,
         'dim': 8,
         'iceland': 8,
         'purely': 8,
         'daunting': 8,
         'penguin': 8,
         'garbage': 8,
         'spitting': 8,
         'inadequate': 8,
         'records': 8,
         'possessions': 8,
         'deposit': 8,
         'quid': 8,
         'trapped': 8

         'pollution': 6,
         'seagull': 6,
         'fraction': 6,
         'convey': 6,
         'silver': 6,
         'lactose': 6,
         'batteries': 6,
         'overal': 6,
         'granted': 6,
         'demonstrated': 6,
         'coz': 6,
         'interrupt': 6,
         'hospitals': 6,
         'iphone': 6,
         'challenged': 6,
         'poached': 6,
         'potter': 6,
         'americans': 6,
         'cast': 6,
         'harborside': 6,
         'alicja': 6,
         'attempting': 6,
         'philosophy': 6,
         'niece': 6,
         'rambling': 6,
         'overpowering': 6,
         'flushing': 6,
         'barber': 6,
         'swans': 6,
         'laurence': 6,
         'composting': 6,
         'valentine': 6,
         'factors': 6,
         'travelodge': 6,
         'injury': 6,
         'imagination': 6,
         'muddy': 6,
         'cyclists': 6,
         'cosiness': 6,
         'creepy': 6,
         'occurred': 6,
         'handbook': 6,
    

         'score': 5,
         'householder': 5,
         'brightly': 5,
         'assuming': 5,
         'beeping': 5,
         'dos': 5,
         'ethos': 5,
         'bristolians': 5,
         'sleepover': 5,
         'economy': 5,
         'cousins': 5,
         'denmark': 5,
         'subject': 5,
         'unfinished': 5,
         'seaside': 5,
         'lanes': 5,
         'pukka': 5,
         'tradewind': 5,
         'rider': 5,
         'doorway': 5,
         'proofed': 5,
         'roaming': 5,
         'consequence': 5,
         'positioning': 5,
         'attempted': 5,
         'awoken': 5,
         'dislike': 5,
         'hopping': 5,
         'admin': 5,
         'anxiety': 5,
         'questioning': 5,
         'cecillia': 5,
         'invest': 5,
         'flare': 5,
         'recommmend': 5,
         'arguably': 5,
         'remodeled': 5,
         'christine': 5,
         'pullout': 5,
         'annie': 5,
         'solely': 5,
         'painting': 5,
         'strand

         'urgent': 4,
         'pops': 4,
         'bathe': 4,
         'invites': 4,
         'sampled': 4,
         'android': 4,
         'speedily': 4,
         'excellence': 4,
         'souvenir': 4,
         'lagged': 4,
         'joking': 4,
         'prize': 4,
         'hostels': 4,
         'cord': 4,
         'relied': 4,
         'election': 4,
         'results': 4,
         'peppermint': 4,
         'spares': 4,
         'perched': 4,
         'nose': 4,
         'glowing': 4,
         'dessert': 4,
         'skeptical': 4,
         'alert': 4,
         'greek': 4,
         'muino': 4,
         'actions': 4,
         'sticker': 4,
         'liaise': 4,
         'erratic': 4,
         'ordering': 4,
         'freshest': 4,
         'fig': 4,
         'woukd': 4,
         'pairs': 4,
         'probs': 4,
         'passive': 4,
         'appetite': 4,
         'topping': 4,
         'hell': 4,
         'wellbeing': 4,
         'exceeding': 4,
         'kerb': 4,
         'l

         'conection': 3,
         'semester': 3,
         'jean': 3,
         'hedge': 3,
         'fixings': 3,
         'intellectual': 3,
         'encroaching': 3,
         'southbank': 3,
         'baguette': 3,
         'protect': 3,
         'jus': 3,
         'goal': 3,
         'input': 3,
         'highstreet': 3,
         'giggle': 3,
         'pingpong': 3,
         'shortcomings': 3,
         'occassion': 3,
         'ties': 3,
         'pen': 3,
         'corkscrew': 3,
         'panels': 3,
         'starters': 3,
         'default': 3,
         'choccy': 3,
         'investigating': 3,
         'refined': 3,
         'likable': 3,
         'goto': 3,
         'hangouts': 3,
         'brim': 3,
         'deceptive': 3,
         'pestered': 3,
         'stool': 3,
         'refilled': 3,
         'ham': 3,
         'regent': 3,
         'deenagh': 3,
         'skipped': 3,
         'partake': 3,
         'patricias': 3,
         'certainty': 3,
         'selfmade': 3,
   

         'nathalie': 3,
         'newsagents': 3,
         'eu': 3,
         'messing': 3,
         'accomodated': 3,
         'interference': 3,
         'shelia': 3,
         'cloth': 3,
         'shiela': 3,
         'accomodative': 3,
         'christening': 3,
         'gripes': 3,
         'accustomed': 3,
         'rode': 3,
         'entrances': 3,
         'hearth': 3,
         'franco': 3,
         'goood': 3,
         'viva': 3,
         'everythig': 3,
         'giorgia': 3,
         'tripped': 3,
         'youngsters': 3,
         'pokey': 3,
         'couchsurfing': 3,
         'extortionate': 3,
         'catchup': 3,
         'trolley': 3,
         'kensington': 3,
         'aired': 3,
         'thurs': 3,
         'drunks': 3,
         'unmatched': 3,
         'fights': 3,
         'unwashed': 3,
         'easly': 3,
         'enquiring': 3,
         'minuts': 3,
         'sympathy': 3,
         'passer': 3,
         'pleace': 3,
         'passport': 3,
         'enjoi

         'coped': 2,
         'fascilities': 2,
         'replay': 2,
         'removable': 2,
         'believes': 2,
         'festivities': 2,
         'dissapointing': 2,
         'neatest': 2,
         'shaquille': 2,
         'shuttered': 2,
         'pulse': 2,
         'raging': 2,
         'lecturer': 2,
         'reside': 2,
         'odor': 2,
         'undesirable': 2,
         'consumption': 2,
         'nighter': 2,
         'modify': 2,
         'allocation': 2,
         'nightstand': 2,
         'affairs': 2,
         'busing': 2,
         'replica': 2,
         'withdrawn': 2,
         'hobby': 2,
         'paella': 2,
         'bathroon': 2,
         'obedient': 2,
         'spaciously': 2,
         'communities': 2,
         'fest': 2,
         'lately': 2,
         'periodically': 2,
         'becuase': 2,
         'wholefood': 2,
         'owning': 2,
         'swan': 2,
         'congress': 2,
         'lugging': 2,
         'choir': 2,
         'somethig': 2,
   

         'confortables': 2,
         'wouldstay': 2,
         'supervised': 2,
         'inevitability': 2,
         'kiddies': 2,
         'rik': 2,
         'loaning': 2,
         'elephant': 2,
         'frenchie': 2,
         'reall': 2,
         'comfertable': 2,
         'privat': 2,
         'pilates': 2,
         'ckean': 2,
         'restauarants': 2,
         'nighbourhood': 2,
         'airing': 2,
         'complicate': 2,
         'gerardo': 2,
         'bikeway': 2,
         'mosque': 2,
         'centuries': 2,
         'bristollian': 2,
         'coaches': 2,
         'ambassador': 2,
         'foudn': 2,
         'hippie': 2,
         'pip': 2,
         'utilising': 2,
         'brushes': 2,
         'hobgoblin': 2,
         'trudge': 2,
         'definitey': 2,
         'kitkat': 2,
         'rub': 2,
         'inadvertently': 2,
         'fries': 2,
         'bight': 2,
         'cente': 2,
         'illegal': 2,
         'minimally': 2,
         'bio': 2,
         '

         'wolfs': 2,
         'nathaniel': 2,
         'leon': 2,
         'motel': 2,
         'squeaking': 2,
         'thrills': 2,
         'aptm': 2,
         'thumps': 2,
         'accordance': 2,
         'unoccupied': 2,
         'congregating': 2,
         'restless': 2,
         'safes': 2,
         'hollywood': 2,
         'prams': 2,
         'tomas': 2,
         'gramophone': 2,
         'frequency': 2,
         'stupid': 2,
         'locationperfect': 2,
         'flap': 2,
         'racket': 2,
         'comftable': 2,
         'undeniably': 2,
         'muslims': 2,
         'providers': 2,
         'niks': 2,
         'definitivly': 2,
         'staggering': 2,
         'cody': 2,
         'deceiving': 2,
         'streetside': 2,
         'alsways': 2,
         'continuous': 2,
         'unnerved': 2,
         'insurance': 2,
         'buyer': 2,
         'clump': 2,
         'irons': 2,
         'destroyed': 2,
         'lateness': 2,
         'congealed': 2,
       

         'hispitable': 1,
         'coburg': 1,
         'yuppy': 1,
         'tings': 1,
         'lightness': 1,
         'hopw': 1,
         'chanced': 1,
         'thal': 1,
         'lottvely': 1,
         'crodt': 1,
         'antidote': 1,
         'explorative': 1,
         'cannont': 1,
         'bene': 1,
         'plass': 1,
         'thorughout': 1,
         'dealers': 1,
         'mildewy': 1,
         'progressive': 1,
         'livng': 1,
         'advocate': 1,
         'cheerfulness': 1,
         'maddie': 1,
         'diesel': 1,
         'pefectly': 1,
         'glassof': 1,
         'invades': 1,
         'omelet': 1,
         'hippiest': 1,
         'themost': 1,
         'restraurant': 1,
         'bengi': 1,
         'espically': 1,
         'trainer': 1,
         'benj': 1,
         'ammentities': 1,
         'lovelys': 1,
         'benjy': 1,
         'cello': 1,
         'tris': 1,
         'politeness': 1,
         'suger': 1,
         'thouroghly': 1,
      

         'holland': 1,
         'yippee': 1,
         'breen': 1,
         'sabina': 1,
         'recommendgreat': 1,
         'hostnot': 1,
         'monk': 1,
         'soso': 1,
         'stokecroft': 1,
         'tug': 1,
         'cms': 1,
         'xperia': 1,
         'ssid': 1,
         'browser': 1,
         'detected': 1,
         'shook': 1,
         'boxers': 1,
         'equipated': 1,
         'forwarded': 1,
         'anxieties': 1,
         'twinkling': 1,
         'server': 1,
         'reobert': 1,
         'remonu': 1,
         'fist': 1,
         'programs': 1,
         'playmate': 1,
         'cecile': 1,
         'stepahnie': 1,
         'niovi': 1,
         'definedly': 1,
         'invade': 1,
         'goodlocation': 1,
         'homly': 1,
         'clinically': 1,
         'orgnanised': 1,
         'accents': 1,
         'tabitha': 1,
         'startv': 1,
         'stephanies': 1,
         'recommned': 1,
         'fii': 1,
         'guinness': 1,
         '

         'upstaged': 1,
         'stupendously': 1,
         'charmer': 1,
         'kidnapped': 1,
         'raffles': 1,
         'placemark': 1,
         'neralie': 1,
         'rorys': 1,
         'longish': 1,
         'bffl': 1,
         'ponder': 1,
         'nuptials': 1,
         'speech': 1,
         'congrats': 1,
         'uper': 1,
         'friendlylovely': 1,
         'peckle': 1,
         'greenmore': 1,
         'fluffball': 1,
         'hugl': 1,
         'enthusiasms': 1,
         'provisors': 1,
         'oldie': 1,
         'nazdrowie': 1,
         'sharyn': 1,
         'raffa': 1,
         'helluva': 1,
         'aisha': 1,
         'technophobe': 1,
         'historically': 1,
         'homogeneous': 1,
         'flake': 1,
         'slopey': 1,
         'blockout': 1,
         'takeways': 1,
         'unappealing': 1,
         'wang': 1,
         'consistant': 1,
         'tippy': 1,
         'launchpad': 1,
         'graphicswala': 1,
         'hospitatlity': 1

         'bathrom': 1,
         'accomodaton': 1,
         'employs': 1,
         'visting': 1,
         'kr': 1,
         'skytv': 1,
         'dangerously': 1,
         'concur': 1,
         'identified': 1,
         'digest': 1,
         'basking': 1,
         'murky': 1,
         'specialises': 1,
         'emporium': 1,
         'perform': 1,
         'biiiiiig': 1,
         'ooodles': 1,
         'surpass': 1,
         'unabashedly': 1,
         'raring': 1,
         'praising': 1,
         'mariya': 1,
         'fanni': 1,
         'mastress': 1,
         'aurore': 1,
         'interviez': 1,
         'youll': 1,
         'retiree': 1,
         'dresm': 1,
         'crop': 1,
         'lavabo': 1,
         'veiw': 1,
         'rightfully': 1,
         'chickes': 1,
         'chioce': 1,
         'yolk': 1,
         'aimable': 1,
         'metting': 1,
         'grrt': 1,
         'dialect': 1,
         'intricate': 1,
         'fantatstic': 1,
         'bae': 1,
         'flavia

         'truffles': 1,
         'complient': 1,
         'disposed': 1,
         'sanne': 1,
         'malou': 1,
         'bulldogs': 1,
         'cancled': 1,
         'intesting': 1,
         'hobbity': 1,
         'hobbies': 1,
         'blowdryer': 1,
         'killed': 1,
         'harmed': 1,
         'alrpo': 1,
         'servis': 1,
         'diserves': 1,
         'howard': 1,
         'leif': 1,
         'dope': 1,
         'highland': 1,
         'mortal': 1,
         'leila': 1,
         'unswept': 1,
         'unmopped': 1,
         'hum': 1,
         'mazy': 1,
         'urself': 1,
         'adventurer': 1,
         'checkers': 1,
         'blended': 1,
         'industries': 1,
         'likeon': 1,
         'nerd': 1,
         'icredible': 1,
         'raft': 1,
         'virginity': 1,
         'parr': 1,
         'convoy': 1,
         'exeperince': 1,
         'californian': 1,
         'newspaper': 1,
         'abhishek': 1,
         'obey': 1,
         'hihgly': 

         'surburban': 1,
         'etcit': 1,
         'juraj': 1,
         'unforeseeable': 1,
         'sherlock': 1,
         'holmes': 1,
         'norbert': 1,
         'remiss': 1,
         'aria': 1,
         'syresham': 1,
         'briljant': 1,
         'monaida': 1,
         'locatoin': 1,
         'psrking': 1,
         'quant': 1,
         'tottendown': 1,
         'ads': 1,
         'yuanfeng': 1,
         'comfortablwe': 1,
         'toilette': 1,
         'caffs': 1,
         'friensly': 1,
         'perusing': 1,
         '很小巧可愛的民宿': 1,
         '所有的設備都很整齊乾淨': 1,
         '所以可以很自在的享受留宿時光': 1,
         'overs': 1,
         'clammy': 1,
         'ceareals': 1,
         'relabel': 1,
         'sucked': 1,
         'easilly': 1,
         'conspired': 1,
         'sself': 1,
         'shighly': 1,
         'hace': 1,
         'males': 1,
         'felafal': 1,
         'edition': 1,
         'multicoloured': 1,
         'respekted': 1,
         'lennard': 1,
         'matth

         'quids': 1,
         'crouds': 1,
         'crapy': 1,
         'nooooo': 1,
         'interrogation': 1,
         'turistic': 1,
         'pmso': 1,
         'cleansome': 1,
         'handicapped': 1,
         'integration': 1,
         'summed': 1,
         'director': 1,
         'yhey': 1,
         'appease': 1,
         'template': 1,
         'wount': 1,
         'conditons': 1,
         'persist': 1,
         'pissed': 1,
         'bathroomgreat': 1,
         'coffeeexcellent': 1,
         'bedwe': 1,
         'outspoken': 1,
         'funishing': 1,
         'collingwood': 1,
         'chocolatier': 1,
         'chums': 1,
         'uncleanliness': 1,
         'costcutter': 1,
         'salacious': 1,
         'fashionablw': 1,
         'ron': 1,
         'broadly': 1,
         'fin': 1,
         'precarious': 1,
         'wroking': 1,
         'locatipn': 1,
         'promixity': 1,
         'cascade': 1,
         'impart': 1,
         'surfaced': 1,
         'capucci

         'lazyboy': 1,
         'immigrated': 1,
         'ardently': 1,
         'occupiers': 1,
         'traval': 1,
         'sourdoughnuts': 1,
         'twiglet': 1,
         'reliant': 1,
         'unwide': 1,
         'acedemy': 1,
         'uphills': 1,
         'cafeter': 1,
         'accesories': 1,
         'familiarise': 1,
         'showerhead': 1,
         'alumni': 1,
         'buidling': 1,
         'horizontal': 1,
         'caramel': 1,
         'fiamma': 1,
         'shuffle': 1,
         'marthyna': 1,
         'illustration': 1,
         'bussy': 1,
         'zitto': 1,
         'bevi': 1,
         'emmeline': 1,
         'carribean': 1,
         'uncompliated': 1,
         'noosy': 1,
         'cheersjhula': 1,
         'discloses': 1,
         'exclaimed': 1,
         'ocean': 1,
         'hobgoglin': 1,
         'dockyard': 1,
         'neon': 1,
         'tigers': 1,
         'nevermind': 1,
         'cuban': 1,
         'gor': 1,
         'grimey': 1,
       

         'kindfull': 1,
         'rianne': 1,
         'stemming': 1,
         'grimm': 1,
         'snake': 1,
         'heartbroken': 1,
         'coolshe': 1,
         'humouralso': 1,
         'bristolall': 1,
         'plav': 1,
         'tijme': 1,
         'prayer': 1,
         'festively': 1,
         'cherishes': 1,
         'workable': 1,
         'cheking': 1,
         'excepcional': 1,
         'benefitted': 1,
         'themes': 1,
         'receptions': 1,
         'invitingwe': 1,
         'everytthing': 1,
         'appended': 1,
         'ide': 1,
         'deffiitely': 1,
         'ameneities': 1,
         'haring': 1,
         'touchable': 1,
         'banko': 1,
         'eaisly': 1,
         'weeked': 1,
         'lindseys': 1,
         'comfortableit': 1,
         'superstars': 1,
         'athough': 1,
         'batroom': 1,
         'reaaaaally': 1,
         'frustating': 1,
         'avis': 1,
         'eighteenth': 1,
         'unconcerned': 1,
         'attte

         'queriesthe': 1,
         'intertior': 1,
         'cask': 1,
         'copper': 1,
         'areal': 1,
         'ulla': 1,
         'crusts': 1,
         'discard': 1,
         'walt': 1,
         'polishing': 1,
         'accentuates': 1,
         'railing': 1,
         'listenening': 1,
         'slumping': 1,
         'unsporty': 1,
         'programming': 1,
         'exeptionally': 1,
         'wrist': 1,
         'aligning': 1,
         'amending': 1,
         'cemetary': 1,
         'fayre': 1,
         'rescure': 1,
         'anker': 1,
         'olli': 1,
         'gorgious': 1,
         'bleating': 1,
         'intruded': 1,
         'arte': 1,
         'kyra': 1,
         'trik': 1,
         'ilumination': 1,
         'fom': 1,
         'resister': 1,
         'nichols': 1,
         'optima': 1,
         'greant': 1,
         'temples': 1,
         'definely': 1,
         'irresponsive': 1,
         'recomnend': 1,
         'adelas': 1,
         'quirkyness': 1,
 

         'energized': 1,
         'tanx': 1,
         'elizabet': 1,
         'bends': 1,
         'familiarity': 1,
         'grandkids': 1,
         'watercress': 1,
         'ellis': 1,
         'patronising': 1,
         'lectured': 1,
         'marginally': 1,
         'againellis': 1,
         'thanksellis': 1,
         'watts': 1,
         'pressurized': 1,
         'subway': 1,
         'loughs': 1,
         'grest': 1,
         'ciyy': 1,
         'connexions': 1,
         'gulping': 1,
         'slated': 1,
         'warmwe': 1,
         'teathe': 1,
         'beand': 1,
         'locationsuited': 1,
         'situationwith': 1,
         'conservationist': 1,
         'bekkie': 1,
         'insomniac': 1,
         'chillax': 1,
         'bucks': 1,
         'tintos': 1,
         'hassling': 1,
         'solarium': 1,
         'retrieved': 1,
         'cleanjust': 1,
         'fromnorwich': 1,
         'slighter': 1,
         'kelle': 1,
         'beiing': 1,
         'courtyy

         'hassel': 1,
         'cheapish': 1,
         'noicy': 1,
         'blank': 1,
         'empting': 1,
         'recyling': 1,
         'anticpated': 1,
         'goop': 1,
         'uneducated': 1,
         'origins': 1,
         'accuse': 1,
         'isis': 1,
         'uglygood': 1,
         'centralable': 1,
         'officebadvery': 1,
         'uglyroad': 1,
         'blaring': 1,
         'thickness': 1,
         'disagreed': 1,
         'unproblematic': 1,
         'inset': 1,
         'highs': 1,
         'unpleasantly': 1,
         'eyemask': 1,
         'streetlight': 1,
         'litteraly': 1,
         'recreational': 1,
         'stingy': 1,
         'helicopter': 1,
         'whiniest': 1,
         'broadcast': 1,
         'intolerable': 1,
         'wink': 1,
         'sh': 1,
         'explanatory': 1,
         'thanksmarek': 1,
         'darkened': 1,
         'slicker': 1,
         'laundrymat': 1,
         'touchpads': 1,
         'superquicly': 1,
        

         'overstayed': 1,
         'desighned': 1,
         'kharkiv': 1,
         'ukraine': 1,
         'owh': 1,
         'doughy': 1,
         'translation': 1,
         'cleanclose': 1,
         'culdasac': 1,
         'belinda': 1,
         'bunglalow': 1,
         'mahen': 1,
         'garth': 1,
         'sentiments': 1,
         'letterbox': 1,
         'equippedlovely': 1,
         'providedin': 1,
         'downn': 1,
         'lance': 1,
         'centrehighly': 1,
         'mexicana': 1,
         'researched': 1,
         'splayed': 1,
         'lest': 1,
         'shabbier': 1,
         'disrepair': 1,
         'ysb': 1,
         'marsh': 1,
         'swell': 1,
         'squared': 1,
         'roxys': 1,
         'exceded': 1,
         'wss': 1,
         'highlighy': 1,
         'beholder': 1,
         'shanks': 1,
         'pony': 1,
         'versha': 1,
         'sarahis': 1,
         'centresarah': 1,
         'tablebthough': 1,
         'remotes': 1,
         'quiet

         'hazards': 1,
         'carseats': 1,
         'bristolcity': 1,
         'joyous': 1,
         'helpes': 1,
         'lucious': 1,
         'patrolled': 1,
         'hosthighly': 1,
         'lorraines': 1,
         'homlier': 1,
         'cleanedwe': 1,
         'planningi': 1,
         'tiresome': 1,
         'samirbis': 1,
         'replayed': 1,
         'iroom': 1,
         'pus': 1,
         'gao': 1,
         'obtainable': 1,
         'checkins': 1,
         'pollys': 1,
         'chocked': 1,
         'juno': 1,
         'graphic': 1,
         'letterpress': 1,
         '非常漂亮且大且设施齐全': 1,
         '这个价钱值得': 1,
         '唯一不好的地方就是位置在山上': 1,
         '要爬超大的坡': 1,
         'pica': 1,
         'passionately': 1,
         'tests': 1,
         'mims': 1,
         'brookes': 1,
         'spatulas': 1,
         'deckchairs': 1,
         'drys': 1,
         'chuck': 1,
         'stayin': 1,
         'waived': 1,
         'oscars': 1,
         'moonlight': 1,
         'aftered':

         'defienietly': 1,
         'overused': 1,
         'cosyclean': 1,
         'wouldhighly': 1,
         'depths': 1,
         'aheadi': 1,
         'deffinantly': 1,
         'ransacked': 1,
         'declining': 1,
         'farcical': 1,
         'ensued': 1,
         'perfunctory': 1,
         'uttered': 1,
         'entailed': 1,
         'stipulation': 1,
         'uncanny': 1,
         'baffled': 1,
         'friendlynature': 1,
         'nicegreat': 1,
         'commination': 1,
         'plying': 1,
         'angled': 1,
         'government': 1,
         'popti': 1,
         'mina': 1,
         'superdog': 1,
         'piping': 1,
         'nescafe': 1,
         'dolce': 1,
         'fastidious': 1,
         'noses': 1,
         'sniff': 1,
         'sturminster': 1,
         'iut': 1,
         'hangerscar': 1,
         'experence': 1,
         'coarse': 1,
         'meed': 1,
         'deff': 1,
         'helpers': 1,
         'lincolnshire': 1,
         'cyclical': 1

{('and', 186280): 0,
 ('the', 154077): 0,
 ('to', 93356): 0,
 ('was', 73952): 0,
 ('in', 64651): 0,
 ('is', 63608): 0,
 ('very', 59606): 0,
 ('great', 49954): 0,
 ('for', 47042): 0,
 ('stay', 43553): 0,
 ('we', 42815): 0,
 ('of', 41167): 0,
 ('with', 37414): 0,
 ('place', 32685): 0,
 ('it', 32402): 0,
 ('location', 30508): 0,
 ('lovely', 30187): 0,
 ('you', 26730): 0,
 ('bristol', 26587): 0,
 ('clean', 25628): 0}

In [7]:
# to filter chinese words or get only english words
def get_english_words(documents):
    ''' filters out any non-english words'''
    english_words = [[ w for w in doc if re.match(r'[A-Z, a-z]', w)] for doc in documents]
    return english_words


fitered_english_words = get_english_words(tokenized_texts)
print(fitered_english_words[:2])

[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [8]:
# see words and their frequency
words_freq=  get_words_frequency(fitered_english_words)
first10pairs = {k: words_freq[k] for k in sorted(words_freq.keys())[:20]}
first10pairs

{'aa': 4,
 'aaalso': 1,
 'aagain': 1,
 'aardman': 1,
 'aardvark': 1,
 'aaron': 3,
 'aasy': 1,
 'ab': 4,
 'aback': 2,
 'abandoned': 5,
 'abase': 1,
 'abb': 3,
 'abbey': 11,
 'abbeywood': 1,
 'abbi': 12,
 'abbiamo': 1,
 'abbie': 3,
 'abbots': 1,
 'abby': 2,
 'abc': 2}

In [9]:
# Check the most common or frequent words
most_common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
most_common_words[:20]   # first 20 words

[('and', 186280),
 ('the', 154077),
 ('to', 93356),
 ('was', 73952),
 ('in', 64651),
 ('is', 63608),
 ('very', 59606),
 ('great', 49954),
 ('for', 47042),
 ('stay', 43553),
 ('we', 42815),
 ('of', 41167),
 ('with', 37414),
 ('place', 32685),
 ('it', 32402),
 ('location', 30508),
 ('lovely', 30187),
 ('you', 26730),
 ('bristol', 26587),
 ('clean', 25628)]

In [10]:
# see the rare words 
rare_words = sorted(word_freq.items(), key=lambda x: x[1])   # NB: word_freq is a dictionary colloction
pprint(rare_words[:10])  #  top 10 rare words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('boatload', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1)]


In [22]:
# Build the bigram & trigram models  [the model creates 2 or 3 wordy, most common occuring  phares]
bigram = gensim.models.Phrases(fitered_english_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[fitered_english_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[fitered_english_words[0]]])

['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus']


In [12]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
print('number of stop words:',len(stop_words))

# define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    data_out =[]
    for doc in texts:
            data_out.append([lemmatizer.lemmatize(word) for word in doc])
    return data_out 

number of stop words: 179


In [13]:
# remove the stopwords
data_words_nostops = remove_stopwords(fitered_english_words)
print("after removing stop words:",data_words_nostops[0])

# create bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
print("\nbigram data words:",data_words_bigrams[0])

#  lemmatize nouns
lemmatized_data = lemmatize_texts(data_words_bigrams)
print('\n\nAfter lemmatization:',lemmatized_data[:1])

after removing stop words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']

bigram data words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']


After lemmatization: [['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plen

In [14]:
# after removing stop words or pre-processing : check the most common and very the rare words
wf = get_words_frequency(lemmatized_data)
most_common_words = sorted(wf.items(), key=lambda x: x[1], reverse=True)
most_common_words[:20]   # top 20 most common words

[('great', 49961),
 ('stay', 43840),
 ('place', 35320),
 ('location', 30625),
 ('lovely', 30188),
 ('host', 28772),
 ('bristol', 26618),
 ('clean', 25632),
 ('room', 23979),
 ('would', 21626),
 ('really', 19666),
 ('house', 19255),
 ('nice', 18826),
 ('comfortable', 18213),
 ('recommend', 18204),
 ('good', 16698),
 ('flat', 16037),
 ('u', 13653),
 ('home', 13089),
 ('everything', 12855)]

In [15]:
# after removing stop words or pre-processing : check the rare words
wf = get_words_frequency(lemmatized_data)
rare_words = sorted(wf.items(), key=lambda x: x[1])
pprint(rare_words[:20])   # first 20 words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1),
 ('macus', 1),
 ('beet', 1),
 ('fraternity', 1),
 ('spotlesly', 1),
 ('guast', 1),
 ('vegfest', 1),
 ('deny', 1),
 ('wecolming', 1),
 ('strives', 1),
 ('morethe', 1),
 ('lovliness', 1)]


In [16]:
def remove_rare_words(texts,words_freq):
    '''removes the rare words'''
    words_data = [[w for w in doc if words_freq[w]>7] for doc in texts]
    return words_data


In [17]:
# remove the rare words from texts
words_data = remove_rare_words(lemmatized_data,wf)
pprint(words_data[:10])

[['stay',
  'marcus',
  'bristol',
  'fantastic',
  'every',
  'way',
  'great',
  'host',
  'picking',
  'u',
  'bus',
  'stop',
  'recommending',
  'place',
  'try',
  'leaving',
  'plenty',
  'pastry',
  'breakfast',
  'item',
  'enjoy',
  'morning',
  'flat',
  'modern',
  'bright',
  'clean',
  'spacious',
  'best',
  'right',
  'bristol',
  'lovely',
  'harbourside',
  'definitely',
  'stay',
  'next',
  'time',
  'bristol',
  'thanks',
  'marcus'],
 ['marcus',
  'brilliant',
  'warm',
  'friendly',
  'host',
  'picked',
  'u',
  'railway_station',
  'took',
  'anne',
  'doctor',
  'drove',
  'u',
  'around',
  'wherever',
  'needed',
  'go',
  'bristol',
  'dropped',
  'u',
  'back',
  'railway_station',
  'leaving',
  'flat',
  'modern',
  'comfortable',
  'clean',
  'well',
  'heated',
  'marcus',
  'provided',
  'u',
  'everything',
  'could',
  'wish',
  'wish',
  'could',
  'stayed',
  'longer'],
 ['mum',
  'angela',
  'stayed',
  'marcus',
  'amazing',
  'apartment',
  'tw

In [18]:
# create dictionary
id2word = corpora.Dictionary(words_data)
print("number of keys:",len(id2word.keys()))
# view the first word of the dictionary
print(id2word[0])   # see word with key =0

# create corpus
corpus_texts= words_data
print(corpus_texts[:3])

# bag of words or vector spaces
corpus = [id2word.doc2bow(text) for text in corpus_texts]
print(corpus[:2])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

number of keys: 5785
best
[['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plenty', 'pastry', 'breakfast', 'item', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus'], ['marcus', 'brilliant', 'warm', 'friendly', 'host', 'picked', 'u', 'railway_station', 'took', 'anne', 'doctor', 'drove', 'u', 'around', 'wherever', 'needed', 'go', 'bristol', 'dropped', 'u', 'back', 'railway_station', 'leaving', 'flat', 'modern', 'comfortable', 'clean', 'well', 'heated', 'marcus', 'provided', 'u', 'everything', 'could', 'wish', 'wish', 'could', 'stayed', 'longer'], ['mum', 'angela', 'stayed', 'marcus', 'amazing', 'apartment', 'two', 'week', 'august', 'relocating', 'bristol', 'lovely', 'experience', 'host', 'apartment', 'extremely', 'confortable', 'located', 'nice', '

[[('best', 1),
  ('breakfast', 1),
  ('bright', 1),
  ('bristol', 3),
  ('bus', 1),
  ('clean', 1),
  ('definitely', 1),
  ('enjoy', 1),
  ('every', 1),
  ('fantastic', 1),
  ('flat', 1),
  ('great', 1),
  ('harbourside', 1),
  ('host', 1),
  ('item', 1),
  ('leaving', 1),
  ('lovely', 1),
  ('marcus', 2),
  ('modern', 1),
  ('morning', 1),
  ('next', 1),
  ('pastry', 1),
  ('picking', 1),
  ('place', 1),
  ('plenty', 1),
  ('recommending', 1),
  ('right', 1),
  ('spacious', 1),
  ('stay', 2),
  ('stop', 1),
  ('thanks', 1),
  ('time', 1),
  ('try', 1),
  ('u', 1),
  ('way', 1)]]

How does LDA work or converge?
`
For each document d, compute P( topic t | document d ) := proportion of words in document d that are assigned to topic t
For each topic t, P( word w | topic t ) := proportion of assignments to topic t that come from word w (across all documents)
For each word w, reassign topic t’, where we choose topic t’ with probability P( topic t’ | word w ) = P( topic t’ | document d ) * P( word w | topic t’ )`

In [19]:
import time
start_time = time.time()
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

done_time = time.time()

pprint(" %.3f secs" % (done_time - start_time))

' 299.855 secs'


In [20]:
# Print the 10 keyword in topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.249*"nice" + 0.219*"good" + 0.062*"value" + 0.043*"shower" + '
  '0.040*"people" + 0.036*"money" + 0.012*"window" + 0.010*"service" + '
  '0.010*"drink" + 0.010*"load"'),
 (1,
  '0.077*"bit" + 0.044*"milk" + 0.040*"booking" + 0.030*"bread" + '
  '0.030*"heating" + 0.025*"another" + 0.023*"fridge" + 0.022*"outstanding" + '
  '0.021*"inside" + 0.020*"others"'),
 (2,
  '0.104*"clean" + 0.094*"lovely" + 0.072*"really" + 0.060*"room" + '
  '0.058*"house" + 0.056*"comfortable" + 0.037*"bed" + 0.034*"home" + '
  '0.030*"friendly" + 0.026*"communication"'),
 (3,
  '0.089*"coffee" + 0.071*"couple" + 0.063*"tea" + 0.039*"especially" + '
  '0.033*"group" + 0.033*"sofa" + 0.024*"decor" + 0.024*"wanted" + '
  '0.023*"basic" + 0.023*"slept"'),
 (4,
  '0.049*"well" + 0.045*"easy" + 0.041*"city" + 0.040*"space" + 0.038*"centre" '
  '+ 0.032*"area" + 0.030*"close" + 0.030*"walk" + 0.028*"quiet" + '
  '0.027*"check"'),
 (5,
  '0.054*"u" + 0.035*"also" + 0.032*"kitchen" + 0.024*"could" + 0.024*

` NB: next steps (a)further preprocesing texts,like removing the firsrt few most common words,(b)interpretation of the topics,(c)visualizing the topics clusters, and (d) documents ranking...`
--------------------
`references`: gensim documentation 