In [1]:
import numpy as np
import pandas as pd

#Import the data file
data = pd.read_csv("Review.csv")

#Preview the data
data.head()

Unnamed: 0,BusinessId,CoolCount,FunnyCount,NotRecommended,Rating,Text,UsefulCount,UserId,_id
0,long-beach-seafood-singapore,1,0,False,5,Been coming here for more than a decade. Long ...,1,mMjxhRn4h0LD1_jI3RT4cQ,ENx8ZXpulmX5_AfCOp_U3A
1,long-beach-seafood-singapore,3,0,False,5,I've been coming here with my boyfriend to onl...,4,tAr3zFVXoM1K2PrbyCdcTA,Tjd32RIUDeOJrYpJ9J6_GA
2,long-beach-seafood-singapore,2,1,False,5,MUST GO! Your life isn't complete without goin...,1,1-5YghuD_7sLuD01f6hrJw,wYuRHgUSBTm6D8B-HrSpNg
3,long-beach-seafood-singapore,0,0,False,3,"Excellent service, our waiter was always aroun...",0,ZAaT3T_Yd1re0Nt_MOW7AA,lisryzBzdpf4KNXfx6-3Fg
4,long-beach-seafood-singapore,0,0,False,4,Try the Chilli crab. Its gravy was on the swee...,1,oRMf6hs2lgU3bIr_fF_5IA,thceOS2dozAixh8Qu09gOg


In [2]:
#Select only those important columns for our project
df1 = data[['_id','UserId','BusinessId','Rating','Text']]
print(df1)


                          _id                  UserId  \
0      ENx8ZXpulmX5_AfCOp_U3A  mMjxhRn4h0LD1_jI3RT4cQ   
1      Tjd32RIUDeOJrYpJ9J6_GA  tAr3zFVXoM1K2PrbyCdcTA   
2      wYuRHgUSBTm6D8B-HrSpNg  1-5YghuD_7sLuD01f6hrJw   
3      lisryzBzdpf4KNXfx6-3Fg  ZAaT3T_Yd1re0Nt_MOW7AA   
4      thceOS2dozAixh8Qu09gOg  oRMf6hs2lgU3bIr_fF_5IA   
5      7yAZ_47K_aJQlWGM79mYCA  oeAhRa8yFa9jtrhaHnOyxQ   
6      NtBCMPJH_n62YA9Zy83jlw  8aBKh52ePGd3OG8di20wdw   
7      5uOt1hm1LxR-H2NJEs1Vgw  pMltyeuU4SjT6Y8YVy7HBA   
8      qWZ0yoP48ZLoGvj9QVc4TA  fZaWWxwwHGtH-8Vhx7qE_Q   
9      MuIvIPQiYBXJDhicWjSX_g  m7rbZtL4b8du326Ng43SiA   
10     EphGs01wv824ChsIm-bNxA  Q1oMmm7tKPOzA7_gqcV4zg   
11     sVMv2XaWvEn3gGi8dnO3lA  Vtdm-QpN5yYxT-O00cuqFg   
12     Z79DT0ECtm-1YDXFE9fEzA  wwdamcY73iJTyhl-3gGOJw   
13     LeFrcXzAtIG26HKtYdpD6w  11R2R4nyRpF6nqlXH-JeGQ   
14     i57jsdlQtxiSYWovohElAg  ITWi6Z_nZ8mmiSzpSUrDKg   
15     CQ1njdQNZV-Z06VwDeC_5w  3S53VaMDPy8Lywk2TuPb4A   
16     mbIUMsDU3CjAcq9bgwfwrA  

In [3]:
#Create a class for a review 
class Review:
  def __init__(self, reviewId, userId, businessId, rating, text):
    self.reviewId = reviewId
    self.userId = userId
    self.businessId = businessId
    self.rating = rating
    self.text = text
#Do we need a review object? Will it make it easier?

In [4]:
#Logic to Follow
#1. From Pandas dataframe, convert to a corpus of reviews
#2. For each review, split into sentences
#3. Do preprocessing on each of the sentences? 
#    -convert to lowercase
#    -remove weird symbols (Regrex patterns)
#    -stopwords
#    -stemming / lemmatization
#4. Convert each sentences into a vector model
#5. Cluster the sentences (Kmeans VS Hierarchial)
#6. Categorize the clusters (Price/Quality/Ambience etc)
#7. Publish results via Dashboard(???)

# The following statement imports the NLTK package.
#import nltk
# The following statement imports a class called .
#from nltk.corpus import sent_tokenize

In [6]:
# Make a directory to store the corpus files
# Creating individual review files 
# Need to delete Data folder before adding the files again**
import os
import shutil
import timeit

def createFolder(directory):
    try:
        if os.path.exists(directory):
            shutil.rmtree(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
        

#Create a folder as a corpus
# Creates a folder in the current directory called Data
createFolder('./Data/')


def CreateCorpusFromDataFrame(corpusfolder,df1):
    for index, r in df1.iterrows():
        reviewId = r['_id']
        businessId = r['BusinessId']
        userId = r['UserId']
        rating = r['Rating']
        body = r['Text']
        fname=str(businessId) + '_' + str(rating) + '_' + str(reviewId)+'.txt'
        corpusfile=open(corpusfolder+'/'+fname,'a')
        corpusfile.write(str(body))
        corpusfile.close()
        
start_time = timeit.default_timer()
CreateCorpusFromDataFrame('./Data',df1)
elapsed = timeit.default_timer() - start_time
print("Time Taken: "+ str(elapsed))

Time Taken: 7.352071699000135


In [7]:
#Prepare the corpus
#This code will take a long while to run...
import timeit
start_time = timeit.default_timer()

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from preprocess import *
#Load all the files from our Data folder
corpus = load_corpus('Data')

#This is the docs returned after stop_words/stemming etc..
docs = corpus2docs(corpus)

#This is the most important vocabulary/dictionary which gives us a list of all the 'unique' words
dictionary = gensim.corpora.Dictionary(docs)

#This is an array list of each tfidf[vec] (vector docment against tfidf)
vecs = docs2vecs(docs, dictionary)
print(len(docs))

elapsed = timeit.default_timer() - start_time
print("Time Taken: "+ str(elapsed))

11356
Time Taken: 76.9327795170002


In [7]:
#Perform K-means clustering 
# Need to know how many k clusters here (???)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import k_means
start_time = timeit.default_timer()

num_tokens = len(dictionary.token2id)
clusters = k_means.k_means(vecs, num_tokens, 2)

elapsed = timeit.default_timer() - start_time
print("Time Taken: "+ str(elapsed))

Time Taken: 56.82802796499891


In [8]:
#Testing Kmeans Clustering

fids = corpus.fileids()

#The below prints the file ids in each cluster

cluster1 = clusters[0]
print("Cluster 1:", [fids[d] for d in cluster1])

cluster2 = clusters[1]
print("Cluster 2:", [fids[d] for d in cluster2])

Cluster 1: ['%E4%BE%86%E5%A7%90%E5%AE%B6-singapore_5_1WvbTEh-Br-VbjS7k2GEeQ.txt', '%E5%90%83-western-singapore_4_7tJLnJLJQIzMEz1UxkR44g.txt', '%E5%9D%97%E4%B8%89%E7%82%B9%E5%BF%83-1-30-dim-sum-singapore_3_Zx0O0ZeXsyG3luSDT8C97Q.txt', '%E6%97%BA%E8%A7%92%E9%BB%9E%E5%BF%83-mongkok-dim-sum-singapore_4_PPUYMVzfMKcTkcEtjCh5_w.txt', '%E6%97%BA%E8%A7%92%E9%BB%9E%E5%BF%83-mongkok-dim-sum-singapore_4_aLBTHnw5-TcwhDL3880M6g.txt', '%E6%97%BA%E8%A7%92%E9%BB%9E%E5%BF%83-mongkok-dim-sum-singapore_5_pxKh2T9u9luSEOg22Z-K9A.txt', '10-scotts-singapore_4_ETQM2MmSWPO_fJjWRMuBlw.txt', '10-scotts-singapore_4_vyzP3NVqB7uBqzrgBaUT6w.txt', '101-seafood-gourmet-singapore_4_wDpsXnU4MSRI9Ku662YPpg.txt', '1036-live-seafood-singapore_3_Vu-A8Q-0w5W4PsuWeTyR-g.txt', '1036-live-seafood-singapore_4_Cz3Jo0dpgJBKK9sP3_OFWw.txt', '109-teochew-yong-tau-foo-singapore-3_4_uZXSg_hePmXy-gMw6kc_0g.txt', '116-roasted-meat-noodles-singapore-2_4_HciTiESotKja7mYCjUnnCw.txt', '128-fish-porridge-singapore_4_qdQmgRV3v2twO8sL4g8wTQ.txt

In [13]:
#Testing out other python library with kmeans and viz
#https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

import pylab as pl

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer

#Technically we already have the tfidf 
tfidf = models.TfidfModel(corpus)

Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in Nc]
kmeans
score = [kmeans[i].fit(tfidf[vecs]).score(tfidf[vecs]) for i in range(len(kmeans))]
score
pl.plot(Nc,score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

NameError: name 'tfidf' is not defined