In [1]:
# import libraries we'll use
import spacy # fast NLP
import pandas as pd # dataframes
import langid # language identification (i.e. what language is this?)
from nltk.classify.textcat import TextCat # language identification from NLTK
from matplotlib.pyplot import plot # not as good as ggplot in R :p

In [2]:
# Load the review data from reviews.csv into a DataFrame.
AirData=pd.read_csv("reviews.csv")

In [3]:
# Cast the "comments" column to strings so every row can be handed to the
# text-processing steps that follow.
# NOTE(review): missing values may end up as the literal text "nan" after this
# cast — confirm that is acceptable for the language statistics.
AirData["comments"] = AirData["comments"].astype(str)

In [8]:
# Append a newline followed by the text form of the comments column to
# The_doc.txt.
# The context manager closes the handle even if the write raises, and the
# explicit UTF-8 encoding keeps non-ASCII review text from failing on platforms
# whose default encoding cannot represent it.
# NOTE(review): str() of a Series is its display repr, which pandas may
# abbreviate for long columns — confirm this is the intended file content.
with open("The_doc.txt", 'a', encoding="utf-8") as doc_file:
    doc_file.writelines(['\n', str(AirData["comments"])])

In [9]:
# Classify every comment with langid; each result is a sequence whose first
# item is the language label.
ids_langid = AirData["comments"].apply(langid.classify)

# get just the language label
# (the lambda argument is deliberately not named `tuple`, which would shadow
# the builtin type)
langs = ids_langid.apply(lambda result: result[0])

# how many unique language labels were applied?
print("Number of tagged languages (estimated):")
print(len(langs.unique()))

# percent of the total dataset in English
# Series.sum() counts the matches in one vectorized pass, whereas the builtin
# sum() walks the Series element by element in Python.
print("Percent of data in English (estimated):")
print(((langs == "en").sum() / len(langs)) * 100)

Number of tagged languages (estimated):
49
Percent of data in English (estimated):
95.1476374041251


In [10]:
# Keep only the comments whose detected language label is "en", selecting the
# rows and the column in a single .loc lookup.
English_comment = AirData.loc[langs == 'en', "comments"]

In [11]:
# Number of comments retained after the English-only filter.
len(English_comment)

95768

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: English stop words are removed, and terms are kept
# only if they satisfy the min_df=2 and max_df=0.8 document-frequency bounds.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
# Cast the comments to a unicode array, then learn the vocabulary and build the
# document-term count matrix in one step.
doc_term_matrix = count_vect.fit_transform(
    English_comment.values.astype('U')
)

In [13]:
# Echo the matrix summary (shape, dtype, number of stored elements).
doc_term_matrix

<95768x12472 sparse matrix of type '<class 'numpy.int64'>'
	with 1618138 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the count matrix with a fixed random seed.
# fit() hands back the estimator itself, so the fitted model is bound directly
# and then echoed as the cell's output.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=5,
).fit(doc_term_matrix)
LDA

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [15]:
# Row 0 of components_: the per-word weights of the first topic.
first_topic = LDA.components_[0]

In [16]:
# Indices of the 10 largest weights; argsort is ascending, so the strongest
# word comes last.
top_topic_words = first_topic.argsort()[-10:]

In [17]:
# Resolve the vocabulary once instead of rebuilding it on every loop iteration.
# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only provide get_feature_names(), so use whichever the installed version has.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# Print the words with the highest weights for the topic (strongest last).
for word_index in top_topic_words:
    print(feature_names[word_index])

good
host
house
clean
lovely
place
nice
great
stay
room


In [20]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pandas as pd
import gensim
import pyLDAvis.gensim

In [21]:
# Token pattern: a run of word characters containing no digits, delimited by
# word boundaries.
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
# English stop-word list obtained from the stop_words package.
en_stop = get_stop_words('en')
# WordNet lemmatizer applied to each token during preprocessing.
lemmatizer = WordNetLemmatizer()

In [25]:
# Build one cleaned token list per English comment:
#   lower-case -> regex tokenize -> drop stop words -> lemmatize -> drop
#   single-character tokens.
# The Series is iterated directly (values only) because Series.iteritems() is
# no longer available in current pandas releases.

# Set membership is O(1); the stop-word list would be scanned for every token.
stop_words_lookup = frozenset(en_stop)

# list for tokenized documents in loop
texts = []
for comment in English_comment:
    # clean and tokenize document string
    words = tokenizer.tokenize(str(comment).lower())

    # remove stop words from tokens
    kept_words = [word for word in words if word not in stop_words_lookup]

    # lemmatize tokens
    lemmas = [lemmatizer.lemmatize(word) for word in kept_words]

    # remove words consisting of a single character and store the result
    texts.append([lemma for lemma in lemmas if len(lemma) != 1])

# sample data
print(texts[0])

['stay', 'marcus', 'bristol', 'fantastic', 'every', 'way', 'great', 'host', 'picking', 'bus', 'stop', 'recommending', 'place', 'try', 'leaving', 'plenty', 'pastry', 'breakfast', 'item', 'enjoy', 'morning', 'flat', 'modern', 'bright', 'clean', 'spacious', 'best', 'right', 'bristol', 'lovely', 'harbourside', 'will', 'definitely', 'stay', 'next', 'time', 're', 'bristol', 'thanks', 'marcus']


In [26]:
# Build the id <-> term mapping over all tokenized comments.
dictionary = corpora.Dictionary(texts)
# Encode each tokenized comment as its bag-of-words representation, giving the
# document-term corpus used for training.
corpus = list(map(dictionary.doc2bow, texts))

In [27]:
import pprint

# Train a 15-topic gensim LDA model with 20 passes over the corpus.
# random_state pins the model's random initialisation so repeated runs produce
# the same topics, in line with the fixed seed used for the scikit-learn LDA
# model earlier in the notebook.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
# Show the 5 highest-weighted words of each topic together with the per-topic
# score returned by top_topics().
pprint.pprint(ldamodel.top_topics(corpus, topn=5))

[([(0.13236739, 'great'),
   (0.09151317, 'stay'),
   (0.07870538, 'location'),
   (0.06933419, 'place'),
   (0.035454422, 'lovely')],
  -1.117900633597799),
 ([(0.16229437, 'home'),
   (0.09399064, 'feel'),
   (0.078136645, 'made'),
   (0.068232656, 'welcome'),
   (0.05063699, 'host')],
  -1.167171974436044),
 ([(0.0637953, 'host'),
   (0.057270795, 'great'),
   (0.052149408, 'clean'),
   (0.046336748, 'recommend'),
   (0.04298988, 'room')],
  -1.3575514382791576),
 ([(0.07878998, 'touch'),
   (0.06279669, 'breakfast'),
   (0.041320503, 'coffee'),
   (0.033362906, 'tea'),
   (0.028643062, 'lovely')],
  -1.3729581681238427),
 ([(0.05552737, 'walk'),
   (0.04564894, 'centre'),
   (0.035886075, 'city'),
   (0.035134688, 'close'),
   (0.03029118, 'minute')],
  -1.5407864228030164),
 ([(0.045859598, 'stay'),
   (0.03133759, 'place'),
   (0.028275153, 'will'),
   (0.024770802, 'back'),
   (0.024391266, 'bristol')],
  -1.6266705230696548),
 ([(0.07814484, 'bristol'),
   (0.051285274, 'great'

In [28]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
# Build the topic visualisation from the model, corpus and dictionary; as the
# last expression of the cell it becomes the cell's output.
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)