# Yelp Data Clustering Notebook
In this notebook, we are interested in performing clustering to produce the top K words for each cluster.

Logic
1. Read the Review.csv file and convert to Pandas dataframe
2. For each review, split into sentences
3. Do preprocessing on each of the sentences
  - Convert to lowercase
  - Remove weird symbols (Regex patterns)
  - Stopwords
  - Stemming
4. Compute TFIDF matrix using vectorizer for each of the sentences
5. Cluster the sentences (K-means vs. hierarchical clustering)
6. Obtain top 10 words to categorize clusters (Price/Quality/Ambience etc)
7. Publish results via Dashboard on Tableau

## 1. Reading Data
We first have to read the Yelp data. This data is in the csv file: "Review.csv"

In [1]:
import numpy as np
import pandas as pd

# Load the raw Yelp review export (Review.csv must be in the working directory)
data = pd.read_csv(filepath_or_buffer="Review.csv")

# Show the first five rows to sanity-check the parsed columns
data.head(n=5)

Unnamed: 0,BusinessId,CoolCount,FunnyCount,NotRecommended,Rating,Text,UsefulCount,UserId,_id
0,long-beach-seafood-singapore,1,0,False,5,Been coming here for more than a decade. Long ...,1,mMjxhRn4h0LD1_jI3RT4cQ,ENx8ZXpulmX5_AfCOp_U3A
1,long-beach-seafood-singapore,3,0,False,5,I've been coming here with my boyfriend to onl...,4,tAr3zFVXoM1K2PrbyCdcTA,Tjd32RIUDeOJrYpJ9J6_GA
2,long-beach-seafood-singapore,2,1,False,5,MUST GO! Your life isn't complete without goin...,1,1-5YghuD_7sLuD01f6hrJw,wYuRHgUSBTm6D8B-HrSpNg
3,long-beach-seafood-singapore,0,0,False,3,"Excellent service, our waiter was always aroun...",0,ZAaT3T_Yd1re0Nt_MOW7AA,lisryzBzdpf4KNXfx6-3Fg
4,long-beach-seafood-singapore,0,0,False,4,Try the Chilli crab. Its gravy was on the swee...,1,oRMf6hs2lgU3bIr_fF_5IA,thceOS2dozAixh8Qu09gOg


### Filter out the columns that we are interested in

This includes _id, UserId, BusinessId, Rating, Text from the review table

--> Store it in a dataframe object

In [2]:
# Keep only the identifier, rating and free-text columns used by the later cells
selected_columns = ['_id','UserId','BusinessId','Rating','Text']
df1 = data[selected_columns]
print(df1)


                          _id                  UserId  \
0      ENx8ZXpulmX5_AfCOp_U3A  mMjxhRn4h0LD1_jI3RT4cQ   
1      Tjd32RIUDeOJrYpJ9J6_GA  tAr3zFVXoM1K2PrbyCdcTA   
2      wYuRHgUSBTm6D8B-HrSpNg  1-5YghuD_7sLuD01f6hrJw   
3      lisryzBzdpf4KNXfx6-3Fg  ZAaT3T_Yd1re0Nt_MOW7AA   
4      thceOS2dozAixh8Qu09gOg  oRMf6hs2lgU3bIr_fF_5IA   
5      7yAZ_47K_aJQlWGM79mYCA  oeAhRa8yFa9jtrhaHnOyxQ   
6      NtBCMPJH_n62YA9Zy83jlw  8aBKh52ePGd3OG8di20wdw   
7      5uOt1hm1LxR-H2NJEs1Vgw  pMltyeuU4SjT6Y8YVy7HBA   
8      qWZ0yoP48ZLoGvj9QVc4TA  fZaWWxwwHGtH-8Vhx7qE_Q   
9      MuIvIPQiYBXJDhicWjSX_g  m7rbZtL4b8du326Ng43SiA   
10     EphGs01wv824ChsIm-bNxA  Q1oMmm7tKPOzA7_gqcV4zg   
11     sVMv2XaWvEn3gGi8dnO3lA  Vtdm-QpN5yYxT-O00cuqFg   
12     Z79DT0ECtm-1YDXFE9fEzA  wwdamcY73iJTyhl-3gGOJw   
13     LeFrcXzAtIG26HKtYdpD6w  11R2R4nyRpF6nqlXH-JeGQ   
14     i57jsdlQtxiSYWovohElAg  ITWi6Z_nZ8mmiSzpSUrDKg   
15     CQ1njdQNZV-Z06VwDeC_5w  3S53VaMDPy8Lywk2TuPb4A   
16     mbIUMsDU3CjAcq9bgwfwrA  

### Create data folder for easy processing
Data are stored as lists

In [3]:
# Make a directory to store the corpus files
# Creating individual review files 
# The Data folder is deleted and recreated by createFolder on every run, so stale sentence files never accumulate
import os
import shutil
import timeit
from nltk.tokenize import sent_tokenize
import collections

def createFolder(directory):
    """Recreate ``directory`` as an empty folder.

    Any existing tree at that path is removed first, then the directory is
    created again. An ``OSError`` raised by either step is not propagated; a
    message is printed instead.
    """
    try:
        already_present = os.path.exists(directory)
        if already_present:
            # Wipe the previous contents so the folder starts out empty
            shutil.rmtree(directory)
        if os.path.exists(directory):
            return
        os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
        

# Start from an empty ./Data/ corpus folder inside the current working directory
createFolder('./Data/')

# Parallel per-sentence accumulators: CreateCorpusFromDataFrame appends to all of
# them in lockstep, so position k in every list describes the same sentence.
allSentenceData, allSentenceReviewId = [], []
allSentenceBusinessId, allSentenceRating = [], []

# File name written for each sentence ("<reviewId>#<n>.txt"), in the same order
reviewSentenceList = []

def CreateCorpusFromDataFrame(corpusfolder,df1):
    """Split every review into sentences and write one text file per sentence.

    Each sentence is written to ``<corpusfolder>/<reviewId>#<i>.txt`` and is
    also appended to the module-level accumulators ``reviewSentenceList``,
    ``allSentenceData``, ``allSentenceReviewId``, ``allSentenceBusinessId``
    and ``allSentenceRating``, so position k of every list refers to the same
    sentence.

    Args:
        corpusfolder: Directory that receives the sentence files; it must
            already exist because files are opened directly inside it.
        df1: DataFrame providing the columns ``_id``, ``BusinessId``,
            ``Rating`` and ``Text``.

    Returns:
        collections.OrderedDict mapping ``str(reviewId)`` to the number of
        sentences found in that review, in DataFrame row order.
    """
    reviewPartDictionary = collections.OrderedDict()

    for _, row in df1.iterrows():
        reviewId = row['_id']
        businessId = row['BusinessId']
        rating = row['Rating']
        body = row['Text']

        # A missing review body comes back from pandas as NaN (a float), which
        # sent_tokenize cannot process; record such reviews with zero sentences.
        sentenceList = sent_tokenize(body) if isinstance(body, str) else []
        reviewPartDictionary[str(reviewId)] = len(sentenceList)

        for i, sentence in enumerate(sentenceList):
            fname = str(reviewId) + '#' + str(i) + '.txt'

            # 'w' keeps a re-run from appending the sentence a second time, the
            # context manager closes the handle even if the write fails, and an
            # explicit encoding avoids platform-dependent UnicodeEncodeErrors.
            with open(os.path.join(corpusfolder, fname), 'w', encoding='utf-8') as corpusfile:
                corpusfile.write(sentence)

            # Update the accumulators only after the file was written so the
            # lists never reference a sentence file that does not exist.
            reviewSentenceList.append(fname)
            allSentenceData.append(sentence)
            allSentenceReviewId.append(reviewId)
            allSentenceBusinessId.append(businessId)
            allSentenceRating.append(rating)

    return reviewPartDictionary
        
# Build the sentence corpus under ./Data and report the wall-clock time taken
start_time = timeit.default_timer()
reviewDict = CreateCorpusFromDataFrame(corpusfolder='./Data', df1=df1)
elapsed = timeit.default_timer() - start_time
print("Time Taken: %s" % elapsed)

Time Taken: 32.239083377004135


In [4]:
#Function To Preprocess allSentenceData
import nltk
import re
import timeit
# Timer for this cell; it only covers the setup below and the function definitions
start_time = timeit.default_timer()

# NLTK English stop-word list.
# NOTE(review): stop_list is not referenced by any other visible cell; the
# vectorizers below pass stop_words='english' to scikit-learn instead.
stop_list = nltk.corpus.stopwords.words('english')
# Shared Porter stemmer instance used by tokenize_and_stem
stemmer = nltk.stem.porter.PorterStemmer()

def tokenize_and_stem(text):
    """Return the stems of the purely alphabetic tokens found in ``text``.

    The text is split into sentences and then into word tokens, so punctuation
    becomes its own token. A token is kept only when its lower-cased form
    consists solely of the letters a-z; numbers, punctuation and mixed tokens
    are dropped. Kept tokens are stemmed with the module-level ``stemmer``.
    """
    alphabetic = re.compile('^[a-z]+$')
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # The case-insensitive check runs on a lowered copy; the original
            # token is what gets handed to the stemmer.
            if alphabetic.search(word.lower()):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lower-cased, purely alphabetic tokens found in ``text``.

    Applies the same sentence/word tokenisation and a-z-only filter as
    ``tokenize_and_stem`` but skips stemming, so both functions yield the same
    number of tokens for the same input.
    """
    alphabetic = re.compile('^[a-z]+$')
    lowered_tokens = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # Drop anything that is not made up of letters only (numbers, punctuation, ...)
    return [token for token in lowered_tokens if alphabetic.search(token.lower())]

# Report how long the setup and definitions in this cell took
elapsed = timeit.default_timer() - start_time
print("Time Taken: %s" % elapsed)

Time Taken: 0.0030276400066213682


In [5]:
#
#  Dataframe of vocabulary - Stemmed, tokenized
#

import timeit
start_time = timeit.default_timer()

# Flat token lists over the whole corpus. Both tokenizers apply the same filter,
# so entry k of the stemmed list is the stem of entry k of the tokenized list.
totalvocab_stemmed = []
totalvocab_tokenized = []
for sentence in allSentenceData:
    allwords_stemmed = tokenize_and_stem(sentence)
    totalvocab_stemmed += allwords_stemmed

    allwords_tokenized = tokenize_only(sentence)
    totalvocab_tokenized += allwords_tokenized

for label, vocab in (("=====  Tonkenized ==== ", totalvocab_tokenized),
                     ("=====  Stemmed ==== ", totalvocab_stemmed)):
    print(label)
    print(len(vocab))

# Lookup table from stem (index, with duplicates) to the surface word it came from
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)

elapsed = timeit.default_timer() - start_time
print("Time Taken: %s" % elapsed)

=====  Tonkenized ==== 
1893531
=====  Stemmed ==== 
1893531
Time Taken: 74.23346280799888


In [6]:
#Testing out other python library with kmeans and viz
#https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
#https://www.kaggle.com/dipikabaad0107/elbow-curve-for-text-clustering
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import nltk
import re
import pylab as pl

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA 

from sklearn.feature_extraction.text import TfidfVectorizer

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Optional cap on the number of sentences to vectorise (None = use every sentence)
dataLimit = None
smallData = allSentenceData[0:dataLimit]

# TF-IDF over stemmed uni-, bi- and tri-grams.
# NOTE(review): stop_words='english' is matched against the tokens produced by
# tokenize_and_stem, i.e. against stems; stop words whose stem differs from the
# listed form may not be removed - confirm this is acceptable.
vectorizer = TfidfVectorizer(stop_words='english',use_idf=True,tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = vectorizer.fit_transform(smallData)

# Feature (n-gram) names in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Preview only the first rows: calling .toarray() on the full sentence x n-gram
# matrix would allocate a dense array far larger than memory on the full corpus.
PREVIEW_ROWS = 12
tfidf_dataframe = pd.DataFrame(tfidf_matrix[:PREVIEW_ROWS].toarray(), columns = terms)
print(tfidf_dataframe)





         aa  aa friend  aa good  aaaaaaaaaaaaargh  aaaaaaaaaaaaargh minut  \
0       0.0        0.0      0.0               0.0                     0.0   
1       0.0        0.0      0.0               0.0                     0.0   
2       0.0        0.0      0.0               0.0                     0.0   
3       0.0        0.0      0.0               0.0                     0.0   
4       0.0        0.0      0.0               0.0                     0.0   
5       0.0        0.0      0.0               0.0                     0.0   
6       0.0        0.0      0.0               0.0                     0.0   
7       0.0        0.0      0.0               0.0                     0.0   
8       0.0        0.0      0.0               0.0                     0.0   
9       0.0        0.0      0.0               0.0                     0.0   
10      0.0        0.0      0.0               0.0                     0.0   
11      0.0        0.0      0.0               0.0                     0.0   

## Computing of Elbow Curve

In [None]:

# https://www.kaggle.com/dipikabaad0107/elbow-curve-for-text-clustering
import timeit
start_time = timeit.default_timer()

# Fit K-means for k = 1..10 and record the inertia of each fit; the "elbow" in
# the resulting curve is used to choose the number of clusters.
K = range(1,11)
kmeans_options = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)
distortions = []
for i in K:
    kmeans = KMeans(n_clusters=i, **kmeans_options).fit(tfidf_matrix)
    distortions.append(kmeans.inertia_)

pl.plot(K,distortions)
pl.title('ELBOW')
pl.xlabel('Number of Clusters')
pl.ylabel('distortions')
pl.show()

elapsed = timeit.default_timer() - start_time
print("Time Taken: %s" % elapsed)

# Clustering Methods
## (A) K-Means Clustering
Based on the above elbow curve, we determine the optimal number to be 6 clusters

In [None]:
# Based on the elbow method, I determine the optimal number to be 4/5

from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 8min 11s, sys: 20.9 s, total: 8min 32s
Wall time: 4min 25s


In [None]:
# Print every sentence together with the cluster id it was assigned to
for position, sentence in enumerate(smallData):
    print("Cluster: " + str(clusters[position]) +" " + sentence)

In [None]:
# ===================
# K MEANS CLUSTERING
# ===================

import pandas as pd


# Position of each sentence inside its review, recovered from the
# "<reviewId>#<n>.txt" file names recorded while the corpus was written.
# (This list was referenced below but never defined, which raised a NameError.)
allSentenceDataSentenceNum = [
    int(name.rsplit('#', 1)[1][:-len('.txt')])
    for name in reviewSentenceList[0:dataLimit]
]

reviews = {
    'businessId': allSentenceBusinessId[0:dataLimit],
    'rating': allSentenceRating[0:dataLimit],
    'sentence': allSentenceData[0:dataLimit],
    'sentenceNumber': allSentenceDataSentenceNum,
    'reviewId': allSentenceReviewId[0:dataLimit],
    'cluster': clusters[0:dataLimit],
}

# One row per sentence, indexed by its cluster id
frame = pd.DataFrame(reviews, index = clusters[0:dataLimit], columns = ['rating', 'sentence', 'sentenceNumber', 'reviewId', 'businessId', 'cluster'])

# Write the export in a single call. The default mode 'w' replaces any previous
# output.csv, so no os.remove() is needed (it failed when the file was absent),
# and the row-by-row append loop is avoided.
frame.to_csv("output.csv", header=False)

print("DONE")

DONE


In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# Feature indices of every centroid, sorted by descending centroid weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms_words = []
terms_values = {}

# One representative surface word per stem: its first occurrence in the corpus.
# Building this lookup once avoids scanning the duplicated index for every term.
first_word_for_stem = vocab_frame.loc[~vocab_frame.index.duplicated(keep='first'), 'words']

for i in range(num_clusters):
    print("Loading Cluster %d words..." %i)
    # A term may be a 1- to 3-gram of stems; translate each stem and join them,
    # so a bigram such as "chilli crab" is no longer reported as just "chilli".
    wordList = [
        ' '.join(first_word_for_stem[stem] for stem in terms[ind].split(' '))
        for ind in order_centroids[i, :10]
    ]
    terms_values[i] = wordList

# One column per cluster, ten rows (rank 0 = heaviest term)
terms_df = pd.DataFrame(terms_values)
FILE_NAME = "output2.csv"
print("Exporting to csv file: %s" %FILE_NAME)

terms_df.to_csv(FILE_NAME)

print("Complete")
    

Top terms per cluster:

Loading Cluster 0 words...

Loading Cluster 1 words...

Loading Cluster 2 words...

Loading Cluster 3 words...

Loading Cluster 4 words...

Loading Cluster 5 words...

Exporting to csv file: output2.csv
Complete


### Compute Top Words
#### Method 1: TFIDF

In [None]:
cluster_frequency_dict = {}
top_n = 10

for cluster_id in range(num_clusters):

    sub_frame = frame.loc[frame['cluster'] == cluster_id]
    sub_data = sub_frame['sentence'].tolist()

    if not sub_data:
        # Nothing to vectorise for an empty cluster
        cluster_frequency_dict[cluster_id] = []
        continue

    # Cluster-local names on purpose: rebinding `vectorizer` / `tfidf_matrix`
    # here would replace the corpus-wide objects that later cells still read.
    cluster_vectorizer = TfidfVectorizer(stop_words='english',use_idf=True,tokenizer=tokenize_and_stem)
    cluster_tfidf = cluster_vectorizer.fit_transform(sub_data)

    # get_feature_names() was removed in scikit-learn 1.2
    if hasattr(cluster_vectorizer, 'get_feature_names_out'):
        features = cluster_vectorizer.get_feature_names_out()
    else:
        features = cluster_vectorizer.get_feature_names()

    # Rank terms by their total TF-IDF weight inside the cluster. Sorting by
    # idf_ (as before) returns the rarest terms, not the most characteristic.
    term_scores = np.asarray(cluster_tfidf.sum(axis=0)).ravel()
    indices = np.argsort(term_scores)[::-1]
    top_features = [str(features[j]) for j in indices[:top_n]]

    cluster_frequency_dict[cluster_id] = top_features

# Build column-wise from Series so clusters with fewer than top_n terms are
# padded with NaN instead of raising on unequal lengths.
cluster_frequency_df = pd.DataFrame(
    {cid: pd.Series(words, dtype=object) for cid, words in cluster_frequency_dict.items()}
)
print(cluster_frequency_df)

FILE_NAME = "output_tfidf.csv"
print("Exporting to csv file: %s" %FILE_NAME)

cluster_frequency_df.to_csv(FILE_NAME)

print("Complete")



#### Method 2: Term Frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cluster_frequency_dict = {}
top_n = 10

for i in range(num_clusters):
    # Sentences assigned to cluster i
    sub_frame = frame.loc[frame['cluster'] == i]
    sub_data = sub_frame['sentence'].tolist()

    vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize_and_stem)
    count_matrix = vectorizer.fit_transform(sub_data)

    # Column sums give the total number of occurrences of every term in the cluster
    sum_words = count_matrix.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)

    # Keep just the term of the top_n most frequent (term, count) pairs
    words_top_n_freq = [word for word, _count in words_freq[:top_n]]
    cluster_frequency_dict[i] = words_top_n_freq

cluster_frequency_df = pd.DataFrame(cluster_frequency_dict)
print(cluster_frequency_df)

FILE_NAME = "output_count.csv"
print("Exporting to csv file: %s" %FILE_NAME)

cluster_frequency_df.to_csv(FILE_NAME)

print("Complete")

### Compute per-restaurant frequency of the top 100 words
#### Get Top 100 words in all documents

In [None]:
# Review-level inputs.
# NOTE(review): `allReviewText` / `allReviewBusinessId` were used here without
# being defined in any visible cell; they are assumed to be the raw review texts
# and their business ids taken from df1 - confirm against the intended source.
allReviewText = df1['Text'].fillna('').astype(str).tolist()
allReviewBusinessId = df1['BusinessId'].tolist()

# Get top n words over all clustered sentences
sub_data = frame['sentence'].tolist()

vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize_and_stem)
count_matrix = vectorizer.fit_transform(sub_data)

# Column sums give the corpus-wide number of occurrences of every stem
sum_words = count_matrix.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

top_n = 100

words_top_n_freq = [word for word, _count in words_freq[:top_n]]
print(words_top_n_freq)

# Count the top-n stems in every review by reusing the fitted vocabulary.
# Only the top_n columns are densified: the previous dense review x (1..3)-gram
# matrix was built solely to read these unigram columns and does not fit in
# memory on the full corpus. Rows are labelled with the review's business id.
top_columns = [vectorizer.vocabulary_[word] for word in words_top_n_freq]
review_counts = vectorizer.transform(allReviewText)
count_dataframe = pd.DataFrame(review_counts[:, top_columns].toarray(), columns = words_top_n_freq, index = allReviewBusinessId)

print("Complete")



#### Group Restaurant Word Frequency Count

In [None]:
# Sum the per-review counts of the top words for every restaurant (row label)
top_word_counts = count_dataframe[words_top_n_freq]
count_dataframe_grouped = top_word_counts.groupby(count_dataframe.index).sum()

FILE_NAME = "output_restaurant_to_topics.csv"
print("Exporting to csv file: %s" %FILE_NAME)

count_dataframe_grouped.to_csv(FILE_NAME)

print("Complete")

## (B) Hierarchical Document Clustering

In [None]:
# Import the cosine similarity helper
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between documents: 1 - cosine similarity.
# NOTE(review): this expects tfidf_matrix to be the sentence-level TF-IDF matrix;
# verify that no earlier cell has rebound the name. The result has one row and
# one column per document, so it grows quadratically - consider a small dataLimit.
similarity = cosine_similarity(tfidf_matrix)
dist = 1 - similarity



#### Save dendrogram into file

In [None]:
#https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform

# ward() interprets a 2-D array as raw observation vectors; pre-computed
# distances have to be passed in condensed (1-D) form. checks=False tolerates the
# tiny non-zero diagonal that floating-point error leaves in 1 - cosine similarity.
# NOTE(review): Ward linkage is formally defined for Euclidean distances; confirm
# that cosine distance is acceptable here.
linkage_matrix = ward(squareform(dist, checks=False))

fig, ax = pl.subplots(figsize=(15, 30)) # set size
# Draw on the axes created above; labels are limited the same way as the
# clustered sentences so both have the same length.
dendrogram_data = dendrogram(linkage_matrix, orientation="right", labels=reviewSentenceList[0:dataLimit], ax=ax)

# Hide x-axis ticks and labels. Booleans are required: the strings 'off' are
# truthy in current matplotlib and would leave the ticks switched on.
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # labels along the bottom edge are off

pl.tight_layout() #show plot with tight layout
pl.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters