In [1]:
import nltk 
# nltk.download('all')   # no need to download all packages, uncomment it for 1st time running code
from nltk.stem import WordNetLemmatizer
import re    # regular expression
import numpy as np
import pandas as pd
from pprint import pprint  # prettier print
import gensim       # Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

# disable warnings
import warnings
warnings.filterwarnings(action='ignore')

  from collections import Mapping


In [2]:
# load the save the english comments corpus into a pickle file as a list
pickle_in = open("english_comments.pickle","rb")
english_comments = pickle.load(pickle_in)    # loads the list of comments
pprint(english_comments[:1])

['Our stay with Marcus in Bristol was fantastic in every way! He was a great '
 'host - picking us up at the bus stop, recommending places to try, leaving '
 'plenty of pastries and other breakfast items to enjoy in the morning. The '
 'flat itself was modern, bright, clean and spacious - and best of all, right '
 "on Bristol's lovely harbourside. We will definitely stay again next time "
 "we're in Bristol - thanks again Marcus!"]


In [3]:
# put the list into dataframe to view the nature of texts 
df = pd.DataFrame(english_comments,columns=['comments'])
pprint(df.head(5))
print('\n\n')
pprint(df.info())

                                            comments
0  Our stay with Marcus in Bristol was fantastic ...
1  Marcus is a brilliant, warm and friendly host....
2  My mum Angela and I have stayed at Marcus' ama...
3  Marcus was an exceptional host. I only stayed ...
4  Marcus was welcoming, easy going and very help...



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96419 entries, 0 to 96418
Data columns (total 1 columns):
comments    96419 non-null object
dtypes: object(1)
memory usage: 753.4+ KB
None


In [4]:
# tokenize sentences and remove panctuations and emojis
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


In [5]:
tokenized_texts = list(sent_to_words(english_comments))
print(tokenized_texts[:2])


[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [6]:
# see the most common and most rare words
from collections import  Counter    # dictionary collection
def get_words_frequency(texts):
    counter = Counter()
    for  doc in texts:
        for word in doc:
            counter[word]+=1
    return counter

word_freq = get_words_frequency(tokenized_texts)
pprint(word_freq) # @ the end of the printed texts, still chinise words in there.

Counter({'and': 186280,
         'the': 154077,
         'to': 93356,
         'was': 73952,
         'in': 64651,
         'is': 63608,
         'very': 59606,
         'great': 49954,
         'for': 47042,
         'stay': 43553,
         'we': 42815,
         'of': 41167,
         'with': 37414,
         'place': 32685,
         'it': 32402,
         'location': 30508,
         'lovely': 30187,
         'you': 26730,
         'bristol': 26587,
         'clean': 25628,
         'room': 22558,
         'would': 21626,
         'host': 20727,
         'really': 19666,
         'house': 19121,
         'nice': 18825,
         'had': 18530,
         'comfortable': 18213,
         'recommend': 18204,
         'at': 17680,
         'as': 17458,
         'good': 16679,
         'this': 16439,
         'were': 16193,
         'flat': 15887,
         'again': 14698,
         'our': 14624,
         'from': 14350,
         'on': 13921,
         'but': 13856,
         'us': 13653,
         'so'

         'suresh': 192,
         'important': 191,
         'groups': 191,
         'getaway': 191,
         'inviting': 190,
         'below': 190,
         'regarding': 190,
         'larger': 190,
         'job': 189,
         'luxurious': 189,
         'connected': 189,
         'world': 189,
         'birthday': 189,
         'arrangements': 189,
         'hassle': 188,
         'opportunity': 188,
         'bedminster': 188,
         'tour': 188,
         'called': 188,
         'walls': 188,
         'airbristol': 188,
         'amy': 188,
         'wants': 187,
         'claire': 187,
         'comes': 186,
         'grateful': 186,
         'instantly': 186,
         'decoration': 186,
         'level': 186,
         'zoo': 186,
         'seeing': 185,
         'appreciate': 185,
         'easton': 185,
         'coach': 184,
         'tasty': 184,
         'richard': 184,
         'haven': 184,
         'parents': 184,
         'supplies': 184,
         'daniel': 184,
       

         'cookies': 60,
         'hop': 60,
         'packed': 60,
         'birds': 60,
         'rosemary': 60,
         'camping': 60,
         'deck': 60,
         'fittings': 60,
         'riverside': 59,
         'definetly': 59,
         'assistance': 59,
         'unwind': 59,
         'tray': 59,
         'deep': 59,
         'sainsbury': 59,
         'engaging': 59,
         'bang': 59,
         'dont': 59,
         'beware': 59,
         'quieter': 59,
         'includes': 59,
         'thumbs': 59,
         'messaged': 59,
         'state': 59,
         'elsie': 59,
         'matthew': 59,
         'shut': 59,
         'natalie': 59,
         'haidi': 59,
         'adrian': 59,
         'tamar': 59,
         'dressing': 58,
         'ran': 58,
         'promised': 58,
         'seriously': 58,
         'lovey': 58,
         'ways': 58,
         'pot': 58,
         'odd': 58,
         'passing': 58,
         'helps': 58,
         'hideaway': 58,
         'guides': 58,
      

         'november': 33,
         'build': 33,
         'conrad': 33,
         'janet': 33,
         'rhi': 33,
         'skylight': 32,
         'row': 32,
         'speed': 32,
         'delightfully': 32,
         'immensely': 32,
         'tranquility': 32,
         'entertained': 32,
         'entering': 32,
         'account': 32,
         'spreads': 32,
         'corridor': 32,
         'dairy': 32,
         'women': 32,
         'homes': 32,
         'suggesting': 32,
         'abundance': 32,
         'reality': 32,
         'furnishing': 32,
         'scene': 32,
         'cuddles': 32,
         'fans': 32,
         'sooo': 32,
         'wonder': 32,
         'talks': 32,
         'ace': 32,
         'story': 32,
         'raised': 32,
         'wander': 32,
         'nightmare': 32,
         'hence': 32,
         'positives': 32,
         'ice': 32,
         'minimalist': 32,
         'outlook': 32,
         'cornwall': 32,
         'shutters': 32,
         'sign': 32,
     

         'count': 22,
         'dietary': 22,
         'laugh': 22,
         'benji': 22,
         'sweetest': 22,
         'animal': 22,
         'posted': 22,
         'musical': 22,
         'remarkably': 22,
         'massively': 22,
         'silence': 22,
         'pure': 22,
         'band': 22,
         'singles': 22,
         'stair': 22,
         'someday': 22,
         'impersonal': 22,
         'ie': 22,
         'windy': 22,
         'exam': 22,
         'shampoos': 22,
         'bedsheets': 22,
         'pace': 22,
         'handled': 22,
         'inexpensive': 22,
         'firstly': 22,
         'royal': 22,
         'edges': 22,
         'finishes': 22,
         'chip': 22,
         'whose': 22,
         'starts': 22,
         'trail': 22,
         'fellow': 22,
         'adding': 22,
         'cushions': 22,
         'loverly': 22,
         'calling': 22,
         'giant': 22,
         'project': 22,
         'equipments': 22,
         'workshop': 22,
         'covid

         'toothbrush': 13,
         'annual': 13,
         'nibbles': 13,
         'remove': 13,
         'characters': 13,
         'logs': 13,
         'genial': 13,
         'ridiculous': 13,
         'filling': 13,
         'load': 13,
         'tended': 13,
         'span': 13,
         'hearts': 13,
         'nat': 13,
         'boots': 13,
         'deserve': 13,
         'locating': 13,
         'beats': 13,
         'mature': 13,
         'alarms': 13,
         'nye': 13,
         'today': 13,
         'lend': 13,
         'guarantee': 13,
         'amd': 13,
         'meets': 13,
         'rated': 13,
         'sundays': 13,
         'tardis': 13,
         'outstandingly': 13,
         'tries': 13,
         'mails': 13,
         'forced': 13,
         'cornflakes': 13,
         'evident': 13,
         'curfew': 13,
         'backs': 13,
         'scrambled': 13,
         'vey': 13,
         'et': 13,
         'respite': 13,
         'electrical': 13,
         'btw': 13,
     

         'proofing': 9,
         'ants': 9,
         'jonny': 9,
         'stain': 9,
         'error': 9,
         'archie': 9,
         'neutral': 9,
         'rufus': 9,
         'premier': 9,
         'kylie': 9,
         'unresponsive': 9,
         'erica': 9,
         'litter': 9,
         'louises': 9,
         'layer': 9,
         'hottest': 9,
         'pod': 9,
         'konrad': 9,
         'milo': 9,
         'francesca': 9,
         'juniper': 9,
         'melany': 9,
         'kamila': 9,
         'kamar': 9,
         'everton': 9,
         'relocation': 8,
         'steam': 8,
         'ers': 8,
         'wheelchair': 8,
         'vince': 8,
         'banter': 8,
         'unaware': 8,
         'roofs': 8,
         'architecturally': 8,
         'enchanting': 8,
         'satisfy': 8,
         'rescue': 8,
         'dressed': 8,
         'wardrobes': 8,
         'shocked': 8,
         'peak': 8,
         'teaching': 8,
         'anonymous': 8,
         'socialize': 8,
  

         'cosiness': 6,
         'creepy': 6,
         'occurred': 6,
         'handbook': 6,
         'boarding': 6,
         'hanna': 6,
         'licence': 6,
         'springy': 6,
         'jog': 6,
         'nail': 6,
         'flushed': 6,
         'reliably': 6,
         'dip': 6,
         'camera': 6,
         'palace': 6,
         'hygge': 6,
         'beast': 6,
         'bosco': 6,
         'raising': 6,
         'abd': 6,
         'competition': 6,
         'slot': 6,
         'agile': 6,
         'virus': 6,
         'bloody': 6,
         'insane': 6,
         'calum': 6,
         'snoring': 6,
         'phrase': 6,
         'puss': 6,
         'vfm': 6,
         'gastro': 6,
         'protection': 6,
         'activated': 6,
         'electrics': 6,
         'nut': 6,
         'straightaway': 6,
         'soukitchen': 6,
         'competitive': 6,
         'interrupted': 6,
         'stole': 6,
         'beutiful': 6,
         'matts': 6,
         'sceptical': 6,
       

         'grease': 5,
         'grime': 5,
         'fumbling': 5,
         'flimsy': 5,
         'moderate': 5,
         'carmel': 5,
         'keypads': 5,
         'ami': 5,
         'narzanin': 5,
         'ahmed': 5,
         'personalized': 5,
         'kates': 5,
         'horses': 5,
         'denise': 5,
         'mervyn': 5,
         'rowan': 5,
         'afriqa': 5,
         'rihanna': 5,
         'bobby': 5,
         'andreea': 5,
         'livingstone': 5,
         'ion': 5,
         'biggles': 5,
         'maryam': 5,
         'paris': 4,
         'fails': 4,
         'facts': 4,
         'emphasise': 4,
         'strolled': 4,
         'unknown': 4,
         'upscale': 4,
         'brushed': 4,
         'figgy': 4,
         'falls': 4,
         'bien': 4,
         'distinct': 4,
         'ya': 4,
         'transportations': 4,
         'perfume': 4,
         'burn': 4,
         'statement': 4,
         'approached': 4,
         'sweetly': 4,
         'brains': 4,
       

         'tu': 4,
         'pinterest': 4,
         'stash': 4,
         'teachers': 4,
         'commotion': 4,
         'core': 4,
         'circomedia': 4,
         'canine': 4,
         'feminine': 4,
         'darcy': 4,
         'huuuuge': 4,
         'hippy': 4,
         'completly': 4,
         'marty': 4,
         'wearing': 4,
         'chromecast': 4,
         'berkley': 4,
         'fluff': 4,
         'petes': 4,
         'useless': 4,
         'coincided': 4,
         'trickle': 4,
         'atop': 4,
         'heights': 4,
         'minds': 4,
         'intending': 4,
         'ah': 4,
         'theory': 4,
         'claires': 4,
         'spotting': 4,
         'artifacts': 4,
         'focused': 4,
         'snazzy': 4,
         'lavish': 4,
         'civilised': 4,
         'builder': 4,
         'vanity': 4,
         'pipal': 4,
         'thee': 4,
         'witnessed': 4,
         'yobike': 4,
         'environmental': 4,
         'backpack': 4,
         'unwinding'

         'teapots': 3,
         'souls': 3,
         'hospitability': 3,
         'riverfront': 3,
         'renata': 3,
         'blueberry': 3,
         'greated': 3,
         'duties': 3,
         'lifetime': 3,
         'ferries': 3,
         'anticipate': 3,
         'bistol': 3,
         'enticing': 3,
         'ganoush': 3,
         'fondly': 3,
         'egyptian': 3,
         'completes': 3,
         'birthdays': 3,
         'sodden': 3,
         'interiour': 3,
         'fringes': 3,
         'tulips': 3,
         'hyde': 3,
         'cube': 3,
         'implies': 3,
         'goodwill': 3,
         'haircut': 3,
         'candies': 3,
         'naughty': 3,
         'imaginative': 3,
         'househunting': 3,
         'adore': 3,
         'raphael': 3,
         'thereafter': 3,
         'expertise': 3,
         'derek': 3,
         'fiancee': 3,
         'malone': 3,
         'thundering': 3,
         'lands': 3,
         'maze': 3,
         'heel': 3,
         'buddy': 3,

         'parlour': 3,
         'reveal': 3,
         'dearest': 3,
         'yea': 3,
         'salubrious': 3,
         'kfc': 3,
         'nt': 3,
         'prim': 3,
         'ammenties': 3,
         'niceties': 3,
         'ugly': 3,
         'infrequent': 3,
         'amused': 3,
         'electrician': 3,
         'documented': 3,
         'staybeyond': 3,
         'swept': 3,
         'deluxe': 3,
         'amandas': 3,
         'unsatisfactory': 3,
         'downfalls': 3,
         'exmoor': 3,
         'salva': 3,
         'gotta': 3,
         'refusing': 3,
         'ensuites': 3,
         'tattoo': 3,
         'tl': 3,
         'unplugged': 3,
         'forniture': 3,
         'roly': 3,
         'portal': 3,
         'sponges': 3,
         'penalty': 3,
         'reads': 3,
         'capable': 3,
         'beese': 3,
         'ain': 3,
         'nickis': 3,
         'sams': 3,
         'patrons': 3,
         'sparkles': 3,
         'mokoko': 3,
         'furthest': 3,
    

         'consumption': 2,
         'nighter': 2,
         'modify': 2,
         'allocation': 2,
         'nightstand': 2,
         'affairs': 2,
         'busing': 2,
         'replica': 2,
         'withdrawn': 2,
         'hobby': 2,
         'paella': 2,
         'bathroon': 2,
         'obedient': 2,
         'spaciously': 2,
         'communities': 2,
         'fest': 2,
         'lately': 2,
         'periodically': 2,
         'becuase': 2,
         'wholefood': 2,
         'owning': 2,
         'swan': 2,
         'congress': 2,
         'lugging': 2,
         'choir': 2,
         'somethig': 2,
         'ladders': 2,
         'reeeally': 2,
         'lenient': 2,
         'senior': 2,
         'bussines': 2,
         'approval': 2,
         'anodyne': 2,
         'imbued': 2,
         'kindred': 2,
         'insignificant': 2,
         'tempur': 2,
         'ipods': 2,
         'frantic': 2,
         'tommy': 2,
         'mtr': 2,
         'feautures': 2,
         'riders': 

         'retrieval': 2,
         'wel': 2,
         'models': 2,
         'waa': 2,
         'necessaries': 2,
         'fluctuated': 2,
         'footwear': 2,
         'marching': 2,
         'versatile': 2,
         'boasting': 2,
         'sock': 2,
         'doubling': 2,
         'placr': 2,
         'redevelopment': 2,
         'battling': 2,
         'stunned': 2,
         'bocobar': 2,
         'sweltering': 2,
         'burgeoning': 2,
         'chesterfield': 2,
         'dazzling': 2,
         'conexion': 2,
         'distinctive': 2,
         'comfie': 2,
         'totalmente': 2,
         'tracks': 2,
         'stamp': 2,
         'thousands': 2,
         'mesmerising': 2,
         'capsule': 2,
         'gay': 2,
         'josephine': 2,
         'handly': 2,
         'baldwin': 2,
         'radcliffe': 2,
         'increasing': 2,
         'swiss': 2,
         'convertible': 2,
         'pose': 2,
         'fascination': 2,
         'blessings': 2,
         'innocent':

         'glue': 2,
         'suprised': 2,
         'ceramic': 2,
         'illumination': 2,
         'impractical': 2,
         'elbow': 2,
         'heck': 2,
         'fav': 2,
         'arcades': 2,
         'cyrus': 2,
         'belgium': 2,
         'ther': 2,
         'noisiest': 2,
         'prewarned': 2,
         'entranceway': 2,
         'dials': 2,
         'vertical': 2,
         'passageways': 2,
         'caffes': 2,
         'loudest': 2,
         'advertises': 2,
         'anns': 2,
         'specs': 2,
         'fort': 2,
         'housekeeping': 2,
         'tradewinds': 2,
         'configured': 2,
         'appreciating': 2,
         'blackboy': 2,
         'understairs': 2,
         'repairman': 2,
         'glove': 2,
         'harbouside': 2,
         'optimum': 2,
         'disappoints': 2,
         'flaking': 2,
         'rattled': 2,
         'openable': 2,
         'wagamama': 2,
         'locationgreat': 2,
         'rearranging': 2,
         'insist': 2

         'thunder': 2,
         'muchas': 2,
         'ngaios': 2,
         'aristotle': 2,
         'ni': 2,
         'licensed': 2,
         'awfully': 2,
         'dreary': 2,
         'incidentals': 2,
         'exits': 2,
         'manually': 2,
         'rejected': 2,
         'mothers': 2,
         'euros': 2,
         'unnerving': 2,
         'hmo': 2,
         'mazing': 2,
         'mangia': 2,
         'orchards': 2,
         'accuracy': 2,
         'makeover': 2,
         'overstate': 2,
         'ashers': 2,
         'pitta': 2,
         'susanna': 2,
         'suzannas': 2,
         'overnighter': 2,
         'woof': 2,
         'vanilla': 2,
         'pillowcase': 2,
         'mildly': 2,
         'phillipp': 2,
         'zaks': 2,
         'episodes': 2,
         'sensors': 2,
         'respondes': 2,
         'supporters': 2,
         'mishaps': 2,
         'straightener': 2,
         'heartly': 2,
         'excitable': 2,
         'convience': 2,
         'panes': 2,
 

         'scissored': 1,
         'supplementary': 1,
         'java': 1,
         'makoko': 1,
         'charateristic': 1,
         'stomach': 1,
         'gewn': 1,
         'pawel': 1,
         'suffices': 1,
         'insured': 1,
         'papito': 1,
         'comportable': 1,
         'pootling': 1,
         'honeysuckles': 1,
         'singleroom': 1,
         'leaning': 1,
         'motivated': 1,
         'opting': 1,
         'faciliaties': 1,
         'smala': 1,
         'famille': 1,
         'witz': 1,
         'backgammon': 1,
         'starving': 1,
         'rusic': 1,
         'marlene': 1,
         'sandras': 1,
         'beatrice': 1,
         'ggod': 1,
         'pinpong': 1,
         'germ': 1,
         'accomation': 1,
         'hoat': 1,
         'consist': 1,
         'tennins': 1,
         'paitings': 1,
         'amplifiers': 1,
         'booth': 1,
         'tidbits': 1,
         'passions': 1,
         'fender': 1,
         'rhodes': 1,
         'amp': 1,

         'busroutes': 1,
         'samba': 1,
         'emanates': 1,
         'helpend': 1,
         'eddies': 1,
         'mades': 1,
         'stalla': 1,
         'easyhighly': 1,
         'peoble': 1,
         'comversation': 1,
         'celebrety': 1,
         'sweetmarket': 1,
         'troves': 1,
         'axcess': 1,
         'nomad': 1,
         'kirtan': 1,
         'opportune': 1,
         'kahleen': 1,
         'juste': 1,
         'ephemera': 1,
         'herba': 1,
         'meditating': 1,
         'destined': 1,
         'cutely': 1,
         'oatly': 1,
         'thesmartest': 1,
         'mytime': 1,
         'huzzle': 1,
         'graccie': 1,
         'leafiest': 1,
         'overllooking': 1,
         'xix': 1,
         'emerge': 1,
         'sma': 1,
         'actuallt': 1,
         'giddy': 1,
         'incosy': 1,
         'realizar': 1,
         'hice': 1,
         'talling': 1,
         'recommodation': 1,
         'helos': 1,
         'upp': 1,
         'f

         'inammered': 1,
         'overviewing': 1,
         'cabing': 1,
         'instinctive': 1,
         'sip': 1,
         'craziness': 1,
         'overjoyed': 1,
         'contrived': 1,
         'chaiselone': 1,
         'morocco': 1,
         'deciduous': 1,
         'spidernets': 1,
         'christen': 1,
         'marketa': 1,
         'similarities': 1,
         'cs': 1,
         'conceal': 1,
         'wouldmn': 1,
         'mee': 1,
         'manx': 1,
         'dragonflies': 1,
         'butterflies': 1,
         'eastons': 1,
         'compose': 1,
         'enoug': 1,
         'watertight': 1,
         'bramley': 1,
         'recliner': 1,
         'siblings': 1,
         'hospitaly': 1,
         'barbecued': 1,
         'shawls': 1,
         'stourhead': 1,
         'showerquick': 1,
         'compftable': 1,
         'competes': 1,
         'uninspiring': 1,
         'cultery': 1,
         'nw': 1,
         'adorableand': 1,
         'tomcat': 1,
         'softie':

         'pd': 1,
         'bettergreat': 1,
         'situatedgreat': 1,
         'butprue': 1,
         'personsimon': 1,
         'discotheques': 1,
         'hamsters': 1,
         'wildy': 1,
         'stressbuster': 1,
         'estuary': 1,
         'marians': 1,
         'ambients': 1,
         'bentleighs': 1,
         'instructor': 1,
         'padded': 1,
         'savings': 1,
         'summarised': 1,
         'bullet': 1,
         'kebabs': 1,
         'succinct': 1,
         'quiets': 1,
         'stunner': 1,
         'edt': 1,
         'fantabulous': 1,
         'scaffoldings': 1,
         'antony': 1,
         'decluttered': 1,
         'perfecrly': 1,
         'recomened': 1,
         'scarletts': 1,
         'reccomendable': 1,
         'allotted': 1,
         'merit': 1,
         'kitchenettes': 1,
         'cental': 1,
         'allowance': 1,
         '房间的设施很棒': 1,
         '装修也很有格调': 1,
         '虽然中间发生了一些意外': 1,
         '对于赔偿的金额发生了一些争执': 1,
         '对发生的意外我表示

         'humidifier': 1,
         'aggravate': 1,
         'gardenswhich': 1,
         'exetra': 1,
         'marvelling': 1,
         'otira': 1,
         'oversights': 1,
         'mistakenly': 1,
         'djs': 1,
         'camdem': 1,
         'tracked': 1,
         'babthroom': 1,
         'chucking': 1,
         'defintatley': 1,
         'provocative': 1,
         'regretfully': 1,
         'inefficient': 1,
         'intoxicated': 1,
         'layouts': 1,
         'youu': 1,
         'clo': 1,
         'fanny': 1,
         'trinas': 1,
         'spontinious': 1,
         'commonly': 1,
         'scaredy': 1,
         'axcellent': 1,
         'essay': 1,
         'blames': 1,
         'output': 1,
         'kleenex': 1,
         'amies': 1,
         'veronika': 1,
         'sympathic': 1,
         'experiencia': 1,
         'confortavel': 1,
         'yous': 1,
         'centresharon': 1,
         'fraud': 1,
         'cozied': 1,
         'disciplined': 1,
         'karina':

         'etcsome': 1,
         'gins': 1,
         'protest': 1,
         'brainier': 1,
         'odors': 1,
         'reget': 1,
         'geneology': 1,
         'orphanages': 1,
         'indonesian': 1,
         'sensing': 1,
         'cleat': 1,
         'outreach': 1,
         'bedder': 1,
         'sachs': 1,
         'aspected': 1,
         'coving': 1,
         'jiggle': 1,
         'heeded': 1,
         'mazda': 1,
         'cx': 1,
         'accessorized': 1,
         'honda': 1,
         'lound': 1,
         'peugeot': 1,
         'widows': 1,
         'ercommend': 1,
         'vietnamese': 1,
         'coffeebar': 1,
         'comftorble': 1,
         'conferrable': 1,
         'fennela': 1,
         'woderful': 1,
         'fenell': 1,
         'parle': 1,
         'aussi': 1,
         'francais': 1,
         'ce': 1,
         'qui': 1,
         'les': 1,
         'frenchies': 1,
         'acomidating': 1,
         'rethinking': 1,
         'ely': 1,
         'sice': 1,

         'damped': 1,
         'elana': 1,
         'whining': 1,
         'chocolte': 1,
         'everythinng': 1,
         'shortstay': 1,
         'roomvery': 1,
         'acording': 1,
         'exerienced': 1,
         'na': 1,
         'explainning': 1,
         'fiendly': 1,
         'alternately': 1,
         'digitally': 1,
         'stimulant': 1,
         'antoinio': 1,
         'blokes': 1,
         'locatioon': 1,
         'antonios': 1,
         'shopmobility': 1,
         'collapses': 1,
         'loudness': 1,
         'meditations': 1,
         'yogcon': 1,
         'headspace': 1,
         'marley': 1,
         'straigh': 1,
         'revelant': 1,
         'slaming': 1,
         'busted': 1,
         'dedicate': 1,
         'manege': 1,
         'abundacne': 1,
         'yummie': 1,
         'organisational': 1,
         'granddaughters': 1,
         'wasperfect': 1,
         'perfectlly': 1,
         'neightborough': 1,
         'yhe': 1,
         'ridge': 1,
     

         'centrical': 1,
         'fuzzy': 1,
         'tilll': 1,
         'takir': 1,
         'insuite': 1,
         'keepers': 1,
         'gmt': 1,
         'silviu': 1,
         'tarim': 1,
         'approximative': 1,
         'noize': 1,
         'taric': 1,
         'adventuring': 1,
         'itwe': 1,
         'giuolio': 1,
         'heartlovely': 1,
         'bedexcellent': 1,
         'socketsall': 1,
         'visitins': 1,
         'keda': 1,
         'centrethank': 1,
         'resposenes': 1,
         'britian': 1,
         'lax': 1,
         'defetently': 1,
         'perishable': 1,
         'crumbled': 1,
         'tens': 1,
         'midges': 1,
         'absorb': 1,
         'inspect': 1,
         'xperiemce': 1,
         'eleanor': 1,
         'towls': 1,
         'weekedn': 1,
         'cocked': 1,
         'streight': 1,
         'postcard': 1,
         'hailed': 1,
         'whent': 1,
         'rafals': 1,
         'cosyplace': 1,
         'warmlovely': 1,
  

         'mount': 1,
         'cushioned': 1,
         'beisrol': 1,
         'smacked': 1,
         'ospitality': 1,
         'coty': 1,
         'fountains': 1,
         'comfarteble': 1,
         'helana': 1,
         'atention': 1,
         'ranking': 1,
         'hostsvery': 1,
         'crofthighly': 1,
         'bernarda': 1,
         'myshka': 1,
         'batheoom': 1,
         'vacancies': 1,
         'speeded': 1,
         'advantageous': 1,
         'esay': 1,
         'eary': 1,
         'plasticy': 1,
         'onboard': 1,
         'autistic': 1,
         'proceworthy': 1,
         'smoothed': 1,
         'quitely': 1,
         'performed': 1,
         'wqs': 1,
         'johnathans': 1,
         'jonathons': 1,
         'vibratoons': 1,
         'spiraling': 1,
         'asado': 1,
         'clothed': 1,
         'unluckely': 1,
         'sthe': 1,
         'stressing': 1,
         'planing': 1,
         'replenish': 1,
         'aibnb': 1,
         'mediaeval': 1,
    

         'eyemask': 1,
         'streetlight': 1,
         'litteraly': 1,
         'recreational': 1,
         'stingy': 1,
         'helicopter': 1,
         'whiniest': 1,
         'broadcast': 1,
         'intolerable': 1,
         'wink': 1,
         'sh': 1,
         'explanatory': 1,
         'thanksmarek': 1,
         'darkened': 1,
         'slicker': 1,
         'laundrymat': 1,
         'touchpads': 1,
         'superquicly': 1,
         'tamars': 1,
         'turtle': 1,
         'reland': 1,
         'aardman': 1,
         'placereally': 1,
         'acknowledge': 1,
         '强烈推荐': 1,
         '非常漂亮和干净的房子': 1,
         '离市中心也很近': 1,
         '可以自助check': 1,
         'host本人也非常亲和': 1,
         'olivers': 1,
         'sebastian': 1,
         'spree': 1,
         'sboping': 1,
         'proops': 1,
         'outrageous': 1,
         'moaner': 1,
         'minimarket': 1,
         'meadsthe': 1,
         'nuce': 1,
         'omlet': 1,
         'envelopes': 1,
         'stat

         'easykitchen': 1,
         'equippedthe': 1,
         'hsham': 1,
         'willling': 1,
         'horsham': 1,
         'phillips': 1,
         'everett': 1,
         'accountants': 1,
         'multilingual': 1,
         'fantastico': 1,
         'nices': 1,
         'teller': 1,
         'moder': 1,
         'freaky': 1,
         'coathangers': 1,
         'vll': 1,
         'magnet': 1,
         'skate': 1,
         'thorougly': 1,
         'walkig': 1,
         'endeavor': 1,
         'nourished': 1,
         'creamy': 1,
         'reaffirming': 1,
         'notify': 1,
         'assumptions': 1,
         'instantaneous': 1,
         'undertand': 1,
         'ess': 1,
         'deffonatly': 1,
         'washingmachine': 1,
         'feal': 1,
         'dietry': 1,
         'sones': 1,
         'wolverhampton': 1,
         'pierced': 1,
         'eardrum': 1,
         'clufton': 1,
         'ens': 1,
         'carparking': 1,
         'straightening': 1,
         'circuit

         'bolts': 1,
         'settleinto': 1,
         'attactive': 1,
         'wend': 1,
         'wetting': 1,
         'mateo': 1,
         'arriave': 1,
         'friendsplenty': 1,
         'unfortunatley': 1,
         'lasagne': 1,
         'thave': 1,
         'jenga': 1,
         'alleyways': 1,
         'creaks': 1,
         'mercedes': 1,
         'benz': 1,
         'passengers': 1,
         'resorted': 1,
         'hella': 1,
         'despites': 1,
         'ritz': 1,
         'spoiler': 1,
         'hoste': 1,
         'donated': 1,
         'profits': 1,
         'leukaemia': 1,
         'hostdefinitely': 1,
         'wand': 1,
         'placements': 1,
         'carer': 1,
         'lightbox': 1,
         'spotlessvery': 1,
         'citythe': 1,
         'frosted': 1,
         'ecosystem': 1,
         'carlo': 1,
         'jacobo': 1,
         'squeeky': 1,
         'chinaware': 1,
         'butchershop': 1,
         'ada': 1,
         'yew': 1,
         'cows': 1,
 

In [7]:
# to filter chinese words or get only english words
def get_english_words(documents):
    ''' filters out any non-english words'''
    english_words = [[ w for w in doc if re.match(r'[A-Z, a-z]', w)] for doc in documents]
    return english_words


fitered_english_words = get_english_words(tokenized_texts)
print(fitered_english_words[:2])

[['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus'], ['marcus', 'is', 'brilliant', 'warm', 'and', 'friendly', 'host', 'he', 'picked', 'us', 'up', 'from', 'the', 'railway', 'station', 'he', 'took', 'anne', 'to', 'the', 'doctor', 'and', 'drove', 'us', 'around', 'wherever', 'we', 'needed', 'to', 'go', 'in', 'bristol', 'and', 'dropped', 'us', 'back', 'at', 'the', 'railway', 'station', 'when', 'we', 'were', 'leaving', 'his', 'flat', 'is', 'very'

In [8]:
# see words and their frequency
words_freq=  get_words_frequency(fitered_english_words)
pprint(sorted(words_freq)) #  chinise words are gone.

['aa',
 'aaalso',
 'aagain',
 'aardman',
 'aardvark',
 'aaron',
 'aasy',
 'ab',
 'aback',
 'abandoned',
 'abase',
 'abb',
 'abbey',
 'abbeywood',
 'abbi',
 'abbiamo',
 'abbie',
 'abbots',
 'abby',
 'abc',
 'abd',
 'abdul',
 'abe',
 'aberdeen',
 'abhishek',
 'abi',
 'abid',
 'abide',
 'abilities',
 'ability',
 'abis',
 'abit',
 'able',
 'abled',
 'ablutions',
 'abnb',
 'abo',
 'aboce',
 'abode',
 'abodolutely',
 'abosultely',
 'abound',
 'about',
 'aboutbristol',
 'abouts',
 'abouve',
 'above',
 'abput',
 'abroad',
 'abs',
 'absconding',
 'absence',
 'absense',
 'absent',
 'absoloutely',
 'absoloutley',
 'absoltley',
 'absoluetely',
 'absolulty',
 'absolut',
 'absolute',
 'absolutelly',
 'absolutely',
 'absolutey',
 'absolutley',
 'absolutly',
 'absolutuely',
 'absorb',
 'absorbed',
 'absorbing',
 'absoulte',
 'absurd',
 'absurdly',
 'abt',
 'abundacne',
 'abundance',
 'abundant',
 'abuse',
 'abusing',
 'abusive',
 'abut',
 'abutting',
 'abw',
 'aby',
 'abysmal',
 'ac',
 'academic',
 'a

 'bouquet',
 'bourgeois',
 'bourhood',
 'bourne',
 'bournemouth',
 'bout',
 'boutique',
 'boutiques',
 'bow',
 'bowels',
 'bower',
 'bowie',
 'bowl',
 'bowled',
 'bowls',
 'box',
 'boxed',
 'boxers',
 'boxes',
 'boxing',
 'boxy',
 'boy',
 'boyfriend',
 'boyfriends',
 'boyhood',
 'boys',
 'bpvery',
 'br',
 'brac',
 'brace',
 'bracket',
 'brad',
 'braden',
 'bradley',
 'brain',
 'brainer',
 'brainier',
 'brains',
 'brainstormed',
 'brake',
 'bramley',
 'branch',
 'branches',
 'brand',
 'branded',
 'brandly',
 'brandon',
 'brands',
 'brass',
 'brasserie',
 'brasseye',
 'braucht',
 'bravas',
 'brave',
 'braves',
 'braving',
 'bravo',
 'braxi',
 'brazilian',
 'brch',
 'breach',
 'bread',
 'breads',
 'breafast',
 'break',
 'breakables',
 'breakaway',
 'breakdfast',
 'breakdown',
 'breaker',
 'breakf',
 'breakfast',
 'breakfasted',
 'breakfasthighly',
 'breakfasti',
 'breakfasting',
 'breakfaston',
 'breakfasts',
 'breakfastthank',
 'breakfastwe',
 'breakfasty',
 'breakfeasty',
 'breakfest',


 'curtains',
 'curtesy',
 'curve',
 'curved',
 'curves',
 'curving',
 'curzon',
 'cushion',
 'cushioned',
 'cushions',
 'cushy',
 'custom',
 'customer',
 'customers',
 'customised',
 'customs',
 'cut',
 'cutaway',
 'cute',
 'cutely',
 'cuteness',
 'cuter',
 'cutes',
 'cutest',
 'cutie',
 'cutleries',
 'cutlery',
 'cuts',
 'cutter',
 'cutting',
 'cuttings',
 'cuttlery',
 'cuty',
 'cuz',
 'cv',
 'cvery',
 'cx',
 'cya',
 'cycle',
 'cycled',
 'cyclepaths',
 'cycles',
 'cycleway',
 'cyclical',
 'cycling',
 'cyclist',
 'cyclists',
 'cyclops',
 'cynical',
 'cyppo',
 'cyrus',
 'da',
 'dab',
 'dachshund',
 'dad',
 'daddy',
 'dads',
 'daffodils',
 'daft',
 'dahl',
 'daily',
 'dailyfoodshops',
 'dairy',
 'daisy',
 'dalby',
 'dale',
 'damage',
 'damaged',
 'damages',
 'damaging',
 'damian',
 'damians',
 'damien',
 'damn',
 'damp',
 'damped',
 'dampen',
 'dampened',
 'dampener',
 'damper',
 'dampness',
 'damson',
 'dan',
 'dance',
 'dancer',
 'dancers',
 'dancing',
 'dand',
 'dandy',
 'danger',
 'd

 'favorites',
 'favors',
 'favour',
 'favourable',
 'favoured',
 'favouring',
 'favourite',
 'favourites',
 'favs',
 'fay',
 'faye',
 'fayre',
 'faze',
 'fazed',
 'fc',
 'fdj',
 'fdjer',
 'fds',
 'feal',
 'fealing',
 'fealt',
 'fear',
 'feared',
 'fears',
 'feasible',
 'feasibly',
 'feast',
 'feather',
 'feathers',
 'feature',
 'featured',
 'features',
 'featuring',
 'feautures',
 'feb',
 'febreeze',
 'febreze',
 'february',
 'fecese',
 'fed',
 'federica',
 'federico',
 'fedwa',
 'fee',
 'feed',
 'feedback',
 'feedbacks',
 'feeders',
 'feeding',
 'feeds',
 'feee',
 'feek',
 'feel',
 'feeling',
 'feelings',
 'feels',
 'feep',
 'fees',
 'feet',
 'feets',
 'feisty',
 'fel',
 'felafal',
 'felicity',
 'feline',
 'felines',
 'felipe',
 'felix',
 'feliz',
 'fell',
 'fella',
 'felling',
 'fellow',
 'felt',
 'feltvery',
 'felty',
 'female',
 'females',
 'feminine',
 'fence',
 'fenced',
 'fences',
 'fencing',
 'fender',
 'fenell',
 'fenella',
 'fenellas',
 'feng',
 'fennela',
 'fenomenal',
 'fer

 'impeccably',
 'impeding',
 'imperceptible',
 'imperfections',
 'imperial',
 'impersonal',
 'implements',
 'implied',
 'implies',
 'imply',
 'importance',
 'important',
 'importantly',
 'impose',
 'imposed',
 'imposing',
 'impossible',
 'impossibly',
 'imposter',
 'impractical',
 'impress',
 'impressd',
 'impressed',
 'impresses',
 'impression',
 'impressions',
 'impressive',
 'impressively',
 'impromptu',
 'improv',
 'improve',
 'improved',
 'improvement',
 'improvements',
 'improvers',
 'improves',
 'improving',
 'improvise',
 'imprtant',
 'impulse',
 'in',
 'ina',
 'inability',
 'inaccessible',
 'inaccuracy',
 'inaccurate',
 'inactivity',
 'inadequacy',
 'inadequate',
 'inadvertently',
 'inamazing',
 'inammered',
 'inappropriate',
 'inbetween',
 'inc',
 'incase',
 'incense',
 'incentive',
 'incessant',
 'incessantly',
 'inch',
 'inches',
 'incidence',
 'incident',
 'incidentals',
 'incidents',
 'incite',
 'incl',
 'inclebt',
 'inclement',
 'inclinations',
 'incline',
 'inclined',
 

 'memorial',
 'memories',
 'memorise',
 'memorobilia',
 'memory',
 'men',
 'mena',
 'mended',
 'mendip',
 'mendips',
 'mental',
 'mentally',
 'mention',
 'mentioned',
 'mentioning',
 'mentionned',
 'mentions',
 'menu',
 'menus',
 'meow',
 'meowy',
 'mer',
 'mercedes',
 'mercela',
 'merchant',
 'merchants',
 'merci',
 'mercure',
 'mere',
 'merely',
 'merges',
 'merichard',
 'meridan',
 'meridian',
 'merit',
 'merited',
 'mermaid',
 'mermelade',
 'merriment',
 'merry',
 'merryl',
 'merryls',
 'merryly',
 'merv',
 'merveilleuse',
 'mervin',
 'mervyn',
 'mervyns',
 'meryl',
 'mesmerising',
 'mess',
 'message',
 'messaged',
 'messages',
 'messaging',
 'messed',
 'messege',
 'messenge',
 'messenger',
 'messes',
 'messing',
 'messy',
 'met',
 'metal',
 'metaphor',
 'metaphors',
 'meter',
 'metered',
 'meters',
 'method',
 'methods',
 'meticulous',
 'meticulously',
 'meting',
 'metre',
 'metred',
 'metres',
 'metro',
 'metting',
 'meworth',
 'mews',
 'mexican',
 'mexicana',
 'mexico',
 'mezcal

 'pears',
 'peas',
 'peasy',
 'pebble',
 'pebbles',
 'peceful',
 'peckle',
 'peco',
 'peculiar',
 'ped',
 'pedantic',
 'pedestal',
 'pedestrian',
 'pedestrianised',
 'pedestrians',
 'pedro',
 'pee',
 'peek',
 'peeling',
 'peep',
 'peeps',
 'peer',
 'peers',
 'peesy',
 'pefect',
 'pefectly',
 'pefrectvplace',
 'peggy',
 'pegs',
 'pekin',
 'pembroke',
 'pembrokeshire',
 'pemits',
 'pen',
 'penalised',
 'penalty',
 'pence',
 'penchant',
 'pending',
 'penetrating',
 'penguin',
 'penguins',
 'penney',
 'pennies',
 'pennine',
 'penny',
 'pennys',
 'pens',
 'pensive',
 'pent',
 'penthouse',
 'penumbra',
 'peoble',
 'peoople',
 'people',
 'peopleamazing',
 'peopleand',
 'peoplebest',
 'peoplehaving',
 'peoples',
 'peopletowels',
 'peoplevening',
 'peops',
 'pepper',
 'peppercorns',
 'peppermint',
 'pepperoni',
 'peppers',
 'pepperthere',
 'per',
 'perceived',
 'percent',
 'perch',
 'perched',
 'perchthere',
 'percipient',
 'percy',
 'perectly',
 'perefctly',
 'perefectly',
 'perf',
 'perfact',

 'run',
 'runabout',
 'runch',
 'runcible',
 'rundown',
 'rung',
 'runned',
 'runner',
 'runners',
 'running',
 'runninga',
 'runs',
 'rupert',
 'ruperts',
 'rupsha',
 'rural',
 'rush',
 'rushed',
 'rushes',
 'rushing',
 'rusic',
 'russ',
 'russel',
 'russell',
 'russian',
 'rust',
 'rusted',
 'rustic',
 'rustique',
 'rustle',
 'rustling',
 'rusty',
 'ruth',
 'rutherford',
 'rwa',
 'rwc',
 'rx',
 'ryan',
 'ryanair',
 'rye',
 'rythm',
 'sa',
 'saam',
 'saams',
 'sab',
 'sabina',
 'sabine',
 'sabrina',
 'sac',
 'sach',
 'sachet',
 'sachets',
 'sachs',
 'sacrifice',
 'sacrificed',
 'sacrificing',
 'sad',
 'saddened',
 'sadly',
 'safah',
 'safari',
 'safe',
 'safekey',
 'safely',
 'safer',
 'safes',
 'safesally',
 'safest',
 'safesuper',
 'safety',
 'saffron',
 'saga',
 'sage',
 'saggy',
 'saghi',
 'sahara',
 'said',
 'sailboats',
 'sailing',
 'sailors',
 'sainbury',
 'sainsburies',
 'sainsburry',
 'sainsbury',
 'sainsburys',
 'saint',
 'saints',
 'sais',
 'sake',
 'sakky',
 'saksham',
 's

 'tales',
 'talk',
 'talkative',
 'talked',
 'talking',
 'talkling',
 'talks',
 'tall',
 'taller',
 'talling',
 'tallulah',
 'tally',
 'tam',
 'tamar',
 'tamara',
 'tamars',
 'tame',
 'tamik',
 'tamika',
 'tamikai',
 'tammi',
 'tammie',
 'tammy',
 'tampons',
 'tamra',
 'tamsin',
 'tandoor',
 'tangible',
 'tangled',
 'tango',
 'tank',
 'tanks',
 'tantalising',
 'tanx',
 'tanya',
 'tanyas',
 'tap',
 'tapas',
 'tape',
 'taped',
 'tapes',
 'tapping',
 'taproom',
 'taps',
 'tara',
 'tardiness',
 'tardis',
 'tare',
 'targeting',
 'taric',
 'tariffs',
 'tarik',
 'tariks',
 'tarim',
 'tariq',
 'tarnish',
 'tarnishing',
 'tarps',
 'tarts',
 'tasetful',
 'tash',
 'tasha',
 'task',
 'tasmania',
 'tasmin',
 'taste',
 'tasted',
 'tasteful',
 'tastefull',
 'tastefully',
 'tastely',
 'tastes',
 'tastful',
 'tastfully',
 'tastfuly',
 'tastiest',
 'tasting',
 'tasty',
 'tatami',
 'tattoo',
 'tatty',
 'taught',
 'tavern',
 'taverns',
 'taxi',
 'taxis',
 'taxithere',
 'tay',
 'taying',
 'taylor',
 'tb',


 'whiniest',
 'whining',
 'whip',
 'whipped',
 'whippet',
 'whipping',
 'whips',
 'whirlwind',
 'whish',
 'whishes',
 'whisked',
 'whiskey',
 'whisks',
 'whisper',
 'whist',
 'whistle',
 'whistled',
 'whistles',
 'whistlestop',
 'whistling',
 'whit',
 'whitchurch',
 'white',
 'whiteboard',
 'whitehall',
 'whiteladies',
 'whitelady',
 'whitelands',
 'whites',
 'whith',
 'whitladies',
 'whitleadies',
 'whitout',
 'whitty',
 'whizzes',
 'who',
 'whoever',
 'whole',
 'wholefood',
 'wholefoods',
 'wholehearted',
 'wholeheartedly',
 'wholemeal',
 'wholesome',
 'wholly',
 'whom',
 'whoop',
 'whopping',
 'whort',
 'whose',
 'whould',
 'whsmith',
 'whuteladies',
 'why',
 'wi',
 'wich',
 'wick',
 'wicked',
 'wicker',
 'wide',
 'widely',
 'wider',
 'widescreen',
 'widows',
 'width',
 'wieder',
 'wierd',
 'wiesia',
 'wif',
 'wife',
 'wifi',
 'wig',
 'wiggle',
 'wight',
 'wih',
 'wil',
 'wild',
 'wildebeest',
 'wilderness',
 'wildest',
 'wildlife',
 'wilds',
 'wildy',
 'wilk',
 'wilko',
 'wilks',
 

In [9]:
# Check the most common or frequent words
most_common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
most_common_words[:350]   # first 20 words

[('and', 186280),
 ('the', 154077),
 ('to', 93356),
 ('was', 73952),
 ('in', 64651),
 ('is', 63608),
 ('very', 59606),
 ('great', 49954),
 ('for', 47042),
 ('stay', 43553),
 ('we', 42815),
 ('of', 41167),
 ('with', 37414),
 ('place', 32685),
 ('it', 32402),
 ('location', 30508),
 ('lovely', 30187),
 ('you', 26730),
 ('bristol', 26587),
 ('clean', 25628),
 ('room', 22558),
 ('would', 21626),
 ('host', 20727),
 ('really', 19666),
 ('house', 19121),
 ('nice', 18825),
 ('had', 18530),
 ('comfortable', 18213),
 ('recommend', 18204),
 ('at', 17680),
 ('as', 17458),
 ('good', 16679),
 ('this', 16439),
 ('were', 16193),
 ('flat', 15887),
 ('again', 14698),
 ('our', 14624),
 ('from', 14350),
 ('on', 13921),
 ('but', 13856),
 ('us', 13653),
 ('so', 13059),
 ('home', 13057),
 ('everything', 12855),
 ('there', 12672),
 ('all', 12538),
 ('perfect', 12394),
 ('well', 12204),
 ('my', 11955),
 ('apartment', 11907),
 ('definitely', 11705),
 ('are', 11510),
 ('city', 11453),
 ('easy', 11165),
 ('friendl

In [10]:
# see the rare words 
rare_words = sorted(word_freq.items(), key=lambda x: x[1])   # NB: word_freq is a dictionary colloction
pprint(rare_words[:10])  # first 100 rare words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('boatload', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1)]


In [11]:
# Build the bigram & trigram models  [the model creates 2 or 3 wordy, most common occuring  phares]
bigram = gensim.models.Phrases(fitered_english_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[fitered_english_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[fitered_english_words[0]]])

['our', 'stay', 'with', 'marcus', 'in', 'bristol', 'was', 'fantastic', 'in', 'every', 'way', 'he', 'was', 'great', 'host', 'picking', 'us', 'up', 'at', 'the', 'bus', 'stop', 'recommending', 'places', 'to', 'try', 'leaving', 'plenty', 'of', 'pastries', 'and', 'other', 'breakfast', 'items', 'to', 'enjoy', 'in', 'the', 'morning', 'the', 'flat', 'itself', 'was', 'modern', 'bright', 'clean', 'and', 'spacious', 'and', 'best', 'of', 'all', 'right', 'on', 'bristol', 'lovely', 'harbourside', 'we', 'will', 'definitely', 'stay', 'again', 'next', 'time', 'we', 're', 'in', 'bristol', 'thanks', 'again', 'marcus']


In [12]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
print('number of stop words:',len(stop_words))

# define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    data_out =[]
    for doc in texts:
            data_out.append([lemmatizer.lemmatize(word) for word in doc])
    return data_out 

number of stop words: 179


In [13]:
# remove the stopwords
data_words_nostops = remove_stopwords(fitered_english_words)
print("after removing stop words:",data_words_nostops[0])

# create bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
print("\nbigram data words:",data_words_bigrams[0])

#  lemmatization
lemmatized_data = lemmatize_texts(data_words_bigrams)
print('\n\nAfter lemmatization:',lemmatized_data[0:2])

after removing stop words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']

bigram data words: ['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'us', 'bus', 'stop', 'recommending', 'places', 'try', 'leaving', 'plenty', 'pastries', 'breakfast', 'items', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus']


After lemmatization: [['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plen

In [14]:
# after removing stop words or pre-processing : check the most common and very the rare words
wf = get_words_frequency(lemmatized_data)
most_common_words = sorted(wf.items(), key=lambda x: x[1], reverse=True)
most_common_words[:20]   # first 20 words

[('great', 49961),
 ('stay', 43840),
 ('place', 35320),
 ('location', 30625),
 ('lovely', 30188),
 ('host', 28772),
 ('bristol', 26618),
 ('clean', 25632),
 ('room', 23979),
 ('would', 21626),
 ('really', 19666),
 ('house', 19255),
 ('nice', 18826),
 ('comfortable', 18213),
 ('recommend', 18204),
 ('good', 16698),
 ('flat', 16037),
 ('u', 13653),
 ('home', 13089),
 ('everything', 12855)]

In [15]:
# after removing stop words or pre-processing : check the rare words
wf = get_words_frequency(lemmatized_data)
rare_words = sorted(wf.items(), key=lambda x: x[1])
pprint(rare_words[:13500])   # first 20 words

[('chauffeur', 1),
 ('ireland', 1),
 ('foreward', 1),
 ('markus', 1),
 ('cellular', 1),
 ('hovered', 1),
 ('saunter', 1),
 ('arranges', 1),
 ('availablity', 1),
 ('macus', 1),
 ('beet', 1),
 ('fraternity', 1),
 ('spotlesly', 1),
 ('guast', 1),
 ('vegfest', 1),
 ('deny', 1),
 ('wecolming', 1),
 ('strives', 1),
 ('morethe', 1),
 ('lovliness', 1),
 ('marucs', 1),
 ('esuite', 1),
 ('ridley', 1),
 ('alien', 1),
 ('prequel', 1),
 ('prometheus', 1),
 ('porriage', 1),
 ('britol', 1),
 ('banksies', 1),
 ('folly', 1),
 ('olra', 1),
 ('recomnded', 1),
 ('habla', 1),
 ('espanol', 1),
 ('sociability', 1),
 ('neighborhoody', 1),
 ('reorganized', 1),
 ('ohh', 1),
 ('arnofolni', 1),
 ('uniform', 1),
 ('exhudes', 1),
 ('coexisting', 1),
 ('huffy', 1),
 ('invasion', 1),
 ('cherish', 1),
 ('unpictured', 1),
 ('fernando', 1),
 ('tractor', 1),
 ('giuliana', 1),
 ('plumped', 1),
 ('londoner', 1),
 ('newington', 1),
 ('cookie', 1),
 ('cutter', 1),
 ('vaya', 1),
 ('guapos', 1),
 ('tagged', 1),
 ('airbn', 1),


 ('compleely', 1),
 ('hae', 1),
 ('spur', 1),
 ('snooping', 1),
 ('bournemouth', 1),
 ('classier', 1),
 ('extremelyl', 1),
 ('passionfruit', 1),
 ('prochaine', 1),
 ('sloe', 1),
 ('unforgetable', 1),
 ('agiain', 1),
 ('alkis', 1),
 ('alsion', 1),
 ('writes', 1),
 ('bialetti', 1),
 ('etienne', 1),
 ('eston', 1),
 ('centrer', 1),
 ('calmful', 1),
 ('neigborhod', 1),
 ('hyggelig', 1),
 ('achieves', 1),
 ('pablos', 1),
 ('resrrstsurants', 1),
 ('pb', 1),
 ('characterised', 1),
 ('swapping', 1),
 ('reguralry', 1),
 ('wereawesome', 1),
 ('recommendstions', 1),
 ('towardsthe', 1),
 ('jarod', 1),
 ('beijing', 1),
 ('pekin', 1),
 ('likelihood', 1),
 ('smorgasbord', 1),
 ('wagon', 1),
 ('whipped', 1),
 ('umech', 1),
 ('ifneverything', 1),
 ('noticing', 1),
 ('igor', 1),
 ('promontory', 1),
 ('priciate', 1),
 ('snacky', 1),
 ('aasy', 1),
 ('availabality', 1),
 ('issuing', 1),
 ('itty', 1),
 ('bitty', 1),
 ('humidifier', 1),
 ('aggravate', 1),
 ('gardenswhich', 1),
 ('exetra', 1),
 ('marvelling', 

 ('counterpane', 1),
 ('bodily', 1),
 ('staion', 1),
 ('parkstreet', 1),
 ('dishtowel', 1),
 ('satchel', 1),
 ('toorecommend', 1),
 ('unscrew', 1),
 ('rescaling', 1),
 ('plumb', 1),
 ('imagery', 1),
 ('kitchinette', 1),
 ('caved', 1),
 ('scramble', 1),
 ('promoting', 1),
 ('income', 1),
 ('evolution', 1),
 ('commoditise', 1),
 ('proudness', 1),
 ('matalan', 1),
 ('offs', 1),
 ('zafira', 1),
 ('inamazing', 1),
 ('seeped', 1),
 ('donervan', 1),
 ('naming', 1),
 ('dustballs', 1),
 ('wrapper', 1),
 ('colander', 1),
 ('rep', 1),
 ('hme', 1),
 ('approve', 1),
 ('flatt', 1),
 ('adhd', 1),
 ('poncho', 1),
 ('lazyboy', 1),
 ('immigrated', 1),
 ('ardently', 1),
 ('traval', 1),
 ('sourdoughnuts', 1),
 ('twiglet', 1),
 ('reliant', 1),
 ('unwide', 1),
 ('acedemy', 1),
 ('cafeter', 1),
 ('accesories', 1),
 ('familiarise', 1),
 ('showerhead', 1),
 ('buidling', 1),
 ('horizontal', 1),
 ('caramel', 1),
 ('fiamma', 1),
 ('shuffle', 1),
 ('marthyna', 1),
 ('bussy', 1),
 ('zitto', 1),
 ('bevi', 1),
 ('emm

 ('sonas', 1),
 ('samayas', 1),
 ('idealic', 1),
 ('kindlyfamily', 1),
 ('jinx', 1),
 ('sply', 1),
 ('visittony', 1),
 ('respectfull', 1),
 ('instuctions', 1),
 ('keaton', 1),
 ('alister', 1),
 ('cafitiere', 1),
 ('placeits', 1),
 ('dispite', 1),
 ('crafty', 1),
 ('propertyand', 1),
 ('fantsatic', 1),
 ('niceness', 1),
 ('smarter', 1),
 ('starfished', 1),
 ('emission', 1),
 ('mth', 1),
 ('bottleof', 1),
 ('staystrongly', 1),
 ('occured', 1),
 ('painter', 1),
 ('expensice', 1),
 ('durably', 1),
 ('signposting', 1),
 ('narrowly', 1),
 ('hilli', 1),
 ('awakened', 1),
 ('pffh', 1),
 ('tschh', 1),
 ('wir', 1),
 ('bedd', 1),
 ('dampened', 1),
 ('deemed', 1),
 ('classed', 1),
 ('nella', 1),
 ('uickly', 1),
 ('calldefinitely', 1),
 ('heba', 1),
 ('eyesore', 1),
 ('staydefinitely', 1),
 ('bristolspotless', 1),
 ('hotline', 1),
 ('performer', 1),
 ('eagle', 1),
 ('incoherent', 1),
 ('dumbfounded', 1),
 ('unbelieved', 1),
 ('logo', 1),
 ('elastic', 1),
 ('astounding', 1),
 ('brita', 1),
 ('apeart

 ('downturn', 1),
 ('underselling', 1),
 ('bossed', 1),
 ('raeda', 1),
 ('vacacion', 1),
 ('spankingly', 1),
 ('tiered', 1),
 ('dalby', 1),
 ('millinneal', 1),
 ('writhing', 1),
 ('shoved', 1),
 ('affix', 1),
 ('unseasonably', 1),
 ('sorcha', 1),
 ('forfeited', 1),
 ('mondern', 1),
 ('noticethat', 1),
 ('loccation', 1),
 ('begginig', 1),
 ('privacity', 1),
 ('ineptitude', 1),
 ('crud', 1),
 ('polo', 1),
 ('shield', 1),
 ('amnetities', 1),
 ('quriky', 1),
 ('geroge', 1),
 ('fatigue', 1),
 ('nikkis', 1),
 ('supermarketsfew', 1),
 ('gagety', 1),
 ('gravel', 1),
 ('appalled', 1),
 ('decency', 1),
 ('denial', 1),
 ('innumerable', 1),
 ('lipstick', 1),
 ('fundamental', 1),
 ('diferent', 1),
 ('pert', 1),
 ('mariead', 1),
 ('suz', 1),
 ('processed', 1),
 ('cf', 1),
 ('quirkiest', 1),
 ('dickens', 1),
 ('leek', 1),
 ('smh', 1),
 ('classmate', 1),
 ('kickboard', 1),
 ('photoshoot', 1),
 ('silicone', 1),
 ('bakeware', 1),
 ('skoda', 1),
 ('sapartment', 1),
 ('wonderfulstay', 1),
 ('blighted', 1)

 ('slashed', 2),
 ('unused', 2),
 ('enyoyed', 2),
 ('samirs', 2),
 ('cresentia', 2),
 ('machinery', 2),
 ('remy', 2),
 ('weaker', 2),
 ('unwilling', 2),
 ('darker', 2),
 ('stilysh', 2),
 ('hack', 2),
 ('blustery', 2),
 ('desireable', 2),
 ('felipe', 2),
 ('nancy', 2),
 ('restocking', 2),
 ('wasnot', 2),
 ('bungee', 2),
 ('persistent', 2),
 ('cattle', 2),
 ('surelly', 2),
 ('firestick', 2),
 ('decour', 2),
 ('uno', 2),
 ('lindons', 2),
 ('thanksgiving', 2),
 ('magda', 2),
 ('livingston', 2),
 ('dumped', 2),
 ('comadation', 2),
 ('virginia', 2),
 ('malgorzata', 2),
 ('retribution', 2),
 ('lissie', 2),
 ('dora', 2),
 ('naheed', 2),
 ('federico', 2),
 ('lifelong', 3),
 ('mastered', 3),
 ('awaken', 3),
 ('colouring', 3),
 ('amphitheatre', 3),
 ('impacted', 3),
 ('appletv', 3),
 ('contribute', 3),
 ('airbnbing', 3),
 ('orlas', 3),
 ('switzerland', 3),
 ('margot', 3),
 ('reflective', 3),
 ('excepted', 3),
 ('deodorant', 3),
 ('scrub', 3),
 ('mick', 3),
 ('ciao', 3),
 ('borough', 3),
 ('rome',

In [16]:
def remove_rare_words(texts,words_freq):
    '''removes the rare words'''
    words_data = [[w for w in doc if words_freq[w]>7] for doc in texts]
    return words_data


In [17]:
# remove the rare words from texts
words_data = remove_rare_words(lemmatized_data,wf)
pprint(words_data[:100])

[['stay',
  'marcus',
  'bristol',
  'fantastic',
  'every',
  'way',
  'great',
  'host',
  'picking',
  'u',
  'bus',
  'stop',
  'recommending',
  'place',
  'try',
  'leaving',
  'plenty',
  'pastry',
  'breakfast',
  'item',
  'enjoy',
  'morning',
  'flat',
  'modern',
  'bright',
  'clean',
  'spacious',
  'best',
  'right',
  'bristol',
  'lovely',
  'harbourside',
  'definitely',
  'stay',
  'next',
  'time',
  'bristol',
  'thanks',
  'marcus'],
 ['marcus',
  'brilliant',
  'warm',
  'friendly',
  'host',
  'picked',
  'u',
  'railway_station',
  'took',
  'anne',
  'doctor',
  'drove',
  'u',
  'around',
  'wherever',
  'needed',
  'go',
  'bristol',
  'dropped',
  'u',
  'back',
  'railway_station',
  'leaving',
  'flat',
  'modern',
  'comfortable',
  'clean',
  'well',
  'heated',
  'marcus',
  'provided',
  'u',
  'everything',
  'could',
  'wish',
  'wish',
  'could',
  'stayed',
  'longer'],
 ['mum',
  'angela',
  'stayed',
  'marcus',
  'amazing',
  'apartment',
  'tw

  'going',
  'made',
  'first',
  'stay',
  'air_bnb',
  'pleasant',
  'experience',
  'know',
  'bristol',
  'well',
  'provide',
  'information',
  'go',
  'direction',
  'within',
  'city',
  'apartment',
  'spacious',
  'bedroom',
  'provided',
  'comfortable',
  'great',
  'ensuite',
  'shower',
  'room'],
 ['fact',
  'marcus',
  'awesome',
  'host',
  'welcoming',
  'helpful',
  'lovely',
  'apartment',
  'clean',
  'great',
  'location',
  'would',
  'definitely',
  'recommend'],
 ['marcus',
  'friendly',
  'met',
  'u',
  'arrival',
  'let',
  'u',
  'use',
  'garage',
  'made',
  'u',
  'feel',
  'welcome',
  'room',
  'clean',
  'lovely',
  'comfy',
  'bed',
  'big',
  'ensuite',
  'shower',
  'room',
  'area',
  'close',
  'due',
  'dinner',
  'able',
  'walk',
  'back',
  'easy'],
 ['one',
  'best',
  'best',
  'airbnb',
  'ever',
  'went',
  'marcus',
  'great',
  'host',
  'communication',
  'easy',
  'took',
  'u',
  'airport',
  'car',
  'made',
  'check',
  'really',
 

In [18]:
# create dictionary
id2word = corpora.Dictionary(words_data)
print("number of keys:",len(id2word.keys()))
# view the first word of the dictionary
print(id2word[0])   # see word with key =0

# create corpus
corpus_texts= words_data
print(corpus_texts[:3])

# bag of words or vector spaces
corpus = [id2word.doc2bow(text) for text in corpus_texts]
print(corpus[:2])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

number of keys: 5785
best
[['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'u', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plenty', 'pastry', 'breakfast', 'item', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'definitely', 'stay', 'next', 'time', 'bristol', 'thanks', 'marcus'], ['marcus', 'brilliant', 'warm', 'friendly', 'host', 'picked', 'u', 'railway_station', 'took', 'anne', 'doctor', 'drove', 'u', 'around', 'wherever', 'needed', 'go', 'bristol', 'dropped', 'u', 'back', 'railway_station', 'leaving', 'flat', 'modern', 'comfortable', 'clean', 'well', 'heated', 'marcus', 'provided', 'u', 'everything', 'could', 'wish', 'wish', 'could', 'stayed', 'longer'], ['mum', 'angela', 'stayed', 'marcus', 'amazing', 'apartment', 'two', 'week', 'august', 'relocating', 'bristol', 'lovely', 'experience', 'host', 'apartment', 'extremely', 'confortable', 'located', 'nice', '

[[('best', 1),
  ('breakfast', 1),
  ('bright', 1),
  ('bristol', 3),
  ('bus', 1),
  ('clean', 1),
  ('definitely', 1),
  ('enjoy', 1),
  ('every', 1),
  ('fantastic', 1),
  ('flat', 1),
  ('great', 1),
  ('harbourside', 1),
  ('host', 1),
  ('item', 1),
  ('leaving', 1),
  ('lovely', 1),
  ('marcus', 2),
  ('modern', 1),
  ('morning', 1),
  ('next', 1),
  ('pastry', 1),
  ('picking', 1),
  ('place', 1),
  ('plenty', 1),
  ('recommending', 1),
  ('right', 1),
  ('spacious', 1),
  ('stay', 2),
  ('stop', 1),
  ('thanks', 1),
  ('time', 1),
  ('try', 1),
  ('u', 1),
  ('way', 1)]]

How does LDA work or converge?
`
For each document d, compute P( topic t | document d ) := proportion of words in document d that are assigned to topic t
For each topic t, P( word w | topic t ) := proportion of assignments to topic t that come from word w (across all documents)
For each word w, reassign topic t’, where we choose topic t’ with probability P( topic t’ | word w ) = P( topic t’ | document d ) * P( word w | topic t’ )`

In [19]:
import time
start_time = time.time()
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

done_time = time.time()

pprint(" %.3f secs" % (done_time - start_time))

' 300.139 secs'


In [20]:
# Print the 10 keyword in topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.249*"nice" + 0.219*"good" + 0.062*"value" + 0.043*"shower" + '
  '0.040*"people" + 0.036*"money" + 0.012*"window" + 0.010*"service" + '
  '0.010*"drink" + 0.010*"load"'),
 (1,
  '0.077*"bit" + 0.044*"milk" + 0.040*"booking" + 0.030*"bread" + '
  '0.030*"heating" + 0.025*"another" + 0.023*"fridge" + 0.022*"outstanding" + '
  '0.021*"inside" + 0.020*"others"'),
 (2,
  '0.104*"clean" + 0.094*"lovely" + 0.072*"really" + 0.060*"room" + '
  '0.058*"house" + 0.056*"comfortable" + 0.037*"bed" + 0.034*"home" + '
  '0.030*"friendly" + 0.026*"communication"'),
 (3,
  '0.089*"coffee" + 0.071*"couple" + 0.063*"tea" + 0.039*"especially" + '
  '0.033*"group" + 0.033*"sofa" + 0.024*"decor" + 0.024*"wanted" + '
  '0.023*"basic" + 0.023*"slept"'),
 (4,
  '0.049*"well" + 0.045*"easy" + 0.041*"city" + 0.040*"space" + 0.038*"centre" '
  '+ 0.032*"area" + 0.030*"close" + 0.030*"walk" + 0.028*"quiet" + '
  '0.027*"check"'),
 (5,
  '0.054*"u" + 0.035*"also" + 0.032*"kitchen" + 0.024*"could" + 0.024*

` NB: next steps (a)further preprocesing texts,like removing the firsrt few most common words,(b)interpretation of the topics,(c)visualizing the topics clusters, and (d) documents ranking...`
--------------------
`references`: gensim documentation 