In [11]:
import numpy as np
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
import matplotlib.pyplot as plt
import pickle

In [3]:
data = pd.read_csv('./part_reviews_fin.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,name,rating,text,text_cleaned
0,0,0031 Boutique Hotel Restaurant,5.0,At 0031 you come to the right place. Everythin...,come right place everyth wish otherwis owner m...
1,1,0031 Boutique Hotel Restaurant,5.0,Just came back from my trip to Cumbuco. It was...,came back trip cumbuco one best holiday ever p...
2,2,0031 Boutique Hotel Restaurant,5.0,I was 6 weeks in Cumbuco. 0031 is the best pla...,week cumbuco best place stay great room nice b...
3,3,0031 Boutique Hotel Restaurant,5.0,everything was very nice. we had a lovely stay...,everyth nice love stay manag janet superkind w...
4,4,0031 Boutique Hotel Restaurant,5.0,For the third time in a row we've been to this...,third time row weve great pousada cumbuco say ...


In [18]:
lem = WordNetLemmatizer()
def preprocess(txt):
    text = txt.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    text_p = re.sub(r'[0-9]','',text_p)
    words = word_tokenize(text_p)
    
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    
    
    stemmed = [lem.lemmatize(word) for word in filtered_words]
    return stemmed

In [19]:
data['text'][:10].apply(preprocess)

0    [come, right, place, everything, wish, otherwi...
1    [came, back, trip, cumbuco, one, best, holiday...
2    [week, cumbuco, best, place, stay, great, room...
3    [everything, nice, lovely, stay, manager, jane...
4    [third, time, row, weve, great, pousada, cumbu...
5    [small, charming, design, hotel, beautiful, ne...
6    [oasis, beautiful, dune, behind, beach, minute...
7    [ive, visiting, cumbuco, since, keep, coming, ...
8    [stayed, feb, cumbuco, brazil, place, surfkite...
9    [hesitate, visit, jannecke, roel, going, find,...
Name: text, dtype: object

In [20]:
data['text_cleaned_lem']= data['text'][:100000].apply(preprocess)

In [21]:
test = pd.DataFrame({'text':data['text_cleaned_lem']})
test.to_csv('')

In [18]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x : " ".join(x))

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,name,rating,text,text_cleaned
0,0,0031 Boutique Hotel Restaurant,5.0,At 0031 you come to the right place. Everythin...,come right place everyth wish otherwis owner m...
1,1,0031 Boutique Hotel Restaurant,5.0,Just came back from my trip to Cumbuco. It was...,came back trip cumbuco one best holiday ever p...
2,2,0031 Boutique Hotel Restaurant,5.0,I was 6 weeks in Cumbuco. 0031 is the best pla...,week cumbuco best place stay great room nice b...
3,3,0031 Boutique Hotel Restaurant,5.0,everything was very nice. we had a lovely stay...,everyth nice love stay manag janet superkind w...
4,4,0031 Boutique Hotel Restaurant,5.0,For the third time in a row we've been to this...,third time row weve great pousada cumbuco say ...


In [11]:
vec = CountVectorizer()
cv = vec.fit_transform(data['text_cleaned'][:100000])

In [12]:
with open('cv.pkl','wb') as f:
    pickle.dump(vec,f)

In [35]:
lda_model = LatentDirichletAllocation(n_components=10,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(cv)

In [22]:

for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([vec.get_feature_names()[i] for i in topic.argsort()[-5:]])
    print('\n')


THE TOP 15 WORDS FOR TOPIC #0
['deer', 'feb', 'wing', 'mexican', 'coloni']


THE TOP 15 WORDS FOR TOPIC #1
['definitli', 'didn', 'kl', 'carolin', 'bush']


THE TOP 15 WORDS FOR TOPIC #2
['sara', 'camelot', 'cleveland', 'savannah', 'exmoor']


THE TOP 15 WORDS FOR TOPIC #3
['ali', 'redwood', 'war', 'truth', 'sam']


THE TOP 15 WORDS FOR TOPIC #4
['goodth', 'asap', 'ragusa', 'expressway', 'rm']


THE TOP 15 WORDS FOR TOPIC #5
['awhil', 'pa', 'gulf', 'du', 'restaurantbar']


THE TOP 15 WORDS FOR TOPIC #6
['merida', 'overcharg', 'understaf', 'sooth', 'dar']


THE TOP 15 WORDS FOR TOPIC #7
['dr', 'lanka', 'hammock', 'sri', 'india']


THE TOP 15 WORDS FOR TOPIC #8
['taksim', 'swimmingpool', 'mosqu', 'istanbul', 'golden']


THE TOP 15 WORDS FOR TOPIC #9
['staff', 'good', 'stay', 'room', 'hotel']




In [4]:
def get_n_topic_words(lda_model , n_top_words , topic_idx):
  n_topic_words = []
  vocab = vec.get_feature_names()
  topic_words_idx = np.argsort(lda_model.components_[topic_idx])[::-1][:n_top_words]
  n_topic_words = [vocab[i] for i in topic_words_idx]
  return n_topic_words

In [5]:
def get_max_topics(probabilities,number_of_topics):
  probabilities = [[index,prob] for index,prob in enumerate(probabilities)]
  probabilities.sort(key = lambda k :k[1],reverse = True)
  probabilities = [i[0] for i in probabilities]
  return probabilities[:number_of_topics]

def get_review_asbects(review , lda_model):
  transformed_review = vec.transform(review)
  prob = lda_model.transform(transformed_review)
  prob = prob.tolist()[0]
  best_topics = get_max_topics(prob,3)
  aspects = []
  top_topic_words = []
  for topic in best_topics:
    x = get_n_topic_words(lda_model,10,topic)
    top_topic_words.extend(x)
  for word in top_topic_words:
    if word in review:
      aspects.append(word)
  return list(set(aspects))

In [8]:
text = data['text_cleaned'][1000010] 
text = preprocess(text)

text

['room',
 'clean',
 'found',
 'huge',
 'spider',
 'bathroom',
 'rail',
 'tub',
 'scari',
 'handicap',
 'sister',
 'inform',
 'handicap',
 'room',
 'good',
 'stop',
 'night',
 'would',
 'stay',
 'one',
 'night']

In [72]:
get_review_asbects(text,lda_model)

['would', 'room', 'clean', 'good', 'stay', 'night', 'one']

In [43]:
with open('lda_cv.pkl','wb') as f:
    pickle.dump((lda_model,lda_output),f)

In [10]:
vec.transform(['test'])

<1x580958 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>