In [1]:
import csv
import nltk
import gensim

from operator import itemgetter

from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

## Libraries

We will use two popular libraries to make our job much easier. [NLTK](http://www.nltk.org/) is an NLP Swiss Army knife, while [Gensim](https://radimrehurek.com/gensim/apiref.html) is more focused on topic modelling and vector models.


## Look at the data

In [2]:
# data location
DATA_FILE_NAME = 'E-restaurants-reviews.csv'

In [3]:
# Read the whole CSV into memory as a list of rows (each row is a list of
# strings); row 0 is the header row.
# newline='' lets the csv module handle line endings itself, which matters
# here because the review text contains embedded newlines.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(DATA_FILE_NAME, newline='', encoding='utf-8') as dataFile:
    dataTable = list(csv.reader(dataFile))

In [4]:
len(dataTable)

22666

In [5]:
dataTable[:2]

[['review_id', 'date', 'text', 'stars', 'type', 'business_id'],
 ['H7eJZ9azd1eH5minOhc-uw',
  '2008-07-06',
  "This Bar Restaurant is a wide open airy space within the Apex |International Hotel.We booked a table for Fathers Day Lunch. I have to say the food was excellent, the service was very good & value for money on the Sunay Lunch menu was excellent. The soup was piping hot, the Roast beef melted in the mouth! Didn't have room for puds but they looked fantastic",
  '5',
  'review',
  '-3pfhzz9CB7F2DpbF1Ko7Q']]

## Text preprocessing

First we need to split the text into individual words. Merely splitting by white space is not sufficient, but we leave the details to NLTK.

We also have to deal with capitalization (we don't want two different tags, _Restaurant_ and _restaurant_). For simplicity, we just lowercase all the text. A true-caser or lemmatizer would be more appropriate, since _McDonald's_ looks better than _mcdonald's_ and _Apple_ means something different from _apple_.

In [6]:
# Install tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/eleanor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Lowercase and tokenise column 2 (the review text) of every row, so that
# texts[i] corresponds to dataTable[i].
# NOTE(review): dataTable[0] is the CSV header row, so texts[0] is just the
# tokenised column name 'text' and is counted as a document. Skipping it would
# shift every index used in the cells below, so it is left in place here.
texts = [nltk.word_tokenize(text.lower()) for text in map(itemgetter(2), dataTable)]

In [8]:
dataTable[3][2]

"Had dinner here in december last year, it was close to my friends apt and we just went there because we didn't want to go far.\nFood was nice, a bit pricey though, I had Orzo pasta, sun blushed tomato, rocket, parmesan & pine nuts, the pesto on it could've had less oil on it as I found quite greasy even for a pesto based sauce. It came with a nice salad so this was a good surprise. \nWe had a nice rose wine and service was alright."

In [9]:
texts[3][:10]

['had', 'dinner', 'here', 'in', 'december', 'last', 'year', ',', 'it', 'was']

## Create bag-of-words representation

Bag-of-words views a text as a V-dimensional vector, where V is the size of the vocabulary. The individual dimensions of the vector can be set in various ways; one of the simplest is to use _v[i]_ = frequency of word _i_ in the document. The corpus is then represented as a _D×V_ matrix, where D is the number of documents.

In gensim, we use `Dictionary` to assign IDs to words; these IDs are then used as vector indices.

In [10]:
dictionary = Dictionary(texts)

str(dictionary)

"Dictionary(51190 unique tokens: ['text', 'this', 'bar', 'restaurant', 'is']...)"

## Prune dictionary

Before we create the actual vectors, we can reduce dimensionality by removing non-informative words (prepositions etc.) and punctuation. A good heuristic is to remove words which are too common and words which are very rare (probably typos).

In [11]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# Stop list: NLTK's English stop words plus a handful of punctuation tokens
# (and the empty string).
stoplist = set(stopwords.words('english')).union(
    ['', ',', '.', '!', '?', ';', ':', '/', '\'', '\'s', '(', ')', '+', '-'])

# Ids of all dictionary tokens whose lowercased form is in the stop list.
stopIds = [dictionary.token2id[tok] for tok in dictionary.token2id if tok.lower() in stoplist]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eleanor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Remove stop words first, then tokens that are very rare or very common.
# Both filter_* calls modify `dictionary` in place (the printed sizes shrink
# without any reassignment).
# NOTE(review): stopIds was computed before any filtering; if gensim renumbers
# ids after a filter, re-running this cell on its own could drop the wrong
# tokens — rebuild `dictionary` and `stopIds` first.
dictionary.filter_tokens(bad_ids=stopIds)
print(dictionary)
# NOTE(review): per the gensim docs, no_below=2 keeps tokens that occur in at
# least 2 documents, no_above=0.75 drops tokens that occur in more than 75% of
# documents, and keep_n=None disables the vocabulary-size cap — confirm
# against the installed gensim version.
dictionary.filter_extremes(no_below=2, no_above=0.75, keep_n=None)
print(dictionary)

Dictionary(51035 unique tokens: ['text', 'bar', 'restaurant', 'wide', 'open']...)
Dictionary(24462 unique tokens: ['text', 'bar', 'restaurant', 'wide', 'open']...)


In [13]:
# Create bag-of-words representation
docsBow = [dictionary.doc2bow(doc) for doc in texts]

In [14]:
docsBow[1][:10]

[(1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1)]

In [15]:
dictionary[1716], dictionary[6805]

('quiffs', '..i')

## Compute TF-IDF

We will use Gensim to compute TF-IDF for us. It returns the result as a matrix with the same dimensions, but with different values: _m\[d\]\[i\]_ = TF-IDF of word _i_ in document _d_.

In [16]:
modelTfidf = TfidfModel(docsBow, normalize=False)

In [17]:
docsTfidf = modelTfidf[docsBow]

In [18]:
docsTfidf[3][:5]

[(15, 0.9449252800474218),
 (17, 1.8945950048668478),
 (18, 1.1321541082717443),
 (19, 3.888926254780156),
 (30, 1.160472160469467)]

## Get the tags

Finally, we can retrieve the tags for a given document by selecting the words with the highest TF-IDF scores.

In [19]:
def getTags(doc, dictionary, count=5):
    """Return the `count` highest-scoring tokens of a document.

    Args:
        doc: iterable of (token id, score) pairs, e.g. one TF-IDF document.
        dictionary: mapping from token id to token string, indexable as
            `dictionary[token_id]` (a gensim `Dictionary` or a plain dict).
        count: maximum number of tags to return.

    Returns:
        List of token strings ordered by descending score; entries with equal
        scores keep their relative order from `doc`.
    """
    ranked = sorted(doc, key=itemgetter(1), reverse=True)
    # Truncate before translating ids to strings, so only the tokens that are
    # actually returned get looked up in the dictionary.
    return [dictionary[tokenId] for tokenId, _score in ranked[:count]]

In [20]:
' '.join(texts[1]), getTags(docsTfidf[1], dictionary, 8)

("this bar restaurant is a wide open airy space within the apex |international hotel.we booked a table for fathers day lunch . i have to say the food was excellent , the service was very good & value for money on the sunay lunch menu was excellent . the soup was piping hot , the roast beef melted in the mouth ! did n't have room for puds but they looked fantastic",
 ['fathers',
  'apex',
  'puds',
  'piping',
  'melted',
  'airy',
  'excellent',
  'within'])

In [21]:
' '.join(texts[3]), getTags(docsTfidf[3], dictionary, 8)

("had dinner here in december last year , it was close to my friends apt and we just went there because we did n't want to go far . food was nice , a bit pricey though , i had orzo pasta , sun blushed tomato , rocket , parmesan & pine nuts , the pesto on it could 've had less oil on it as i found quite greasy even for a pesto based sauce . it came with a nice salad so this was a good surprise . we had a nice rose wine and service was alright .",
 ['pesto', 'orzo', 'apt', 'blushed', 'pine', 'december', 'nuts', 'parmesan'])

## Add bigrams

Single-word tags are not quite enough. For some improvement, let's add bigrams which do not contain stopwords.

In [28]:
def addBigrams(words, stoplist):
    """Append stopword-free bigrams to a token list.

    Every pair of adjacent tokens is joined with '_' unless the lowercased
    form of either token is in `stoplist`.

    Args:
        words: list of token strings, in document order.
        stoplist: container of lowercase tokens that may not be part of a bigram.

    Returns:
        A new list: the original tokens followed by the bigrams, in order of
        appearance. `words` itself is not modified.
    """
    joined = []
    for left, right in zip(words, words[1:]):
        if left.lower() in stoplist or right.lower() in stoplist:
            continue
        joined.append(left + '_' + right)
    return words + joined

In [29]:
docsTxtBi = [addBigrams(d, stoplist) for d in texts]
dictionaryBi = Dictionary(docsTxtBi)
print(dictionaryBi)

Dictionary(307951 unique tokens: ['text', 'this', 'bar', 'restaurant', 'is']...)


In [30]:
# Same pruning as for the unigram dictionary, now on the unigram+bigram
# vocabulary. The stop-list step removes the same number of tokens as before
# (155, per the printed sizes), i.e. only single-word entries match it.
stopIdsBi = [tokId for tok, tokId in dictionaryBi.token2id.items() if tok.lower() in stoplist]
# Both filter_* calls modify `dictionaryBi` in place.
dictionaryBi.filter_tokens(bad_ids=stopIdsBi)
print(dictionaryBi)
# Note the stricter no_below=3 here, versus no_below=2 for the unigram
# dictionary.
dictionaryBi.filter_extremes(no_below=3, no_above=0.75, keep_n=None)
print(dictionaryBi)

Dictionary(307796 unique tokens: ['text', 'bar', 'restaurant', 'wide', 'open']...)
Dictionary(49897 unique tokens: ['text', 'bar', 'restaurant', 'wide', 'open']...)


In [31]:
docsBowBi = [dictionaryBi.doc2bow(d) for d in docsTxtBi]
modelTfidfBi = TfidfModel(docsBowBi, normalize=False)
docsTfidfBi = modelTfidfBi[docsBowBi]

In [32]:
' '.join(texts[1]), getTags(docsTfidfBi[1], dictionaryBi, 10)

("this bar restaurant is a wide open airy space within the apex |international hotel.we booked a table for fathers day lunch . i have to say the food was excellent , the service was very good & value for money on the sunay lunch menu was excellent . the soup was piping hot , the roast beef melted in the mouth ! did n't have room for puds but they looked fantastic",
 ['day_lunch',
  'bar_restaurant',
  'apex',
  'airy_space',
  'good_&',
  'puds',
  'wide_open',
  'looked_fantastic',
  'roast_beef',
  'piping_hot'])

In [33]:
' '.join(texts[3]), getTags(docsTfidfBi[3], dictionaryBi, 10)

("had dinner here in december last year , it was close to my friends apt and we just went there because we did n't want to go far . food was nice , a bit pricey though , i had orzo pasta , sun blushed tomato , rocket , parmesan & pine nuts , the pesto on it could 've had less oil on it as i found quite greasy even for a pesto based sauce . it came with a nice salad so this was a good surprise . we had a nice rose wine and service was alright .",
 ['pesto',
  'blushed_tomato',
  'less_oil',
  'sun_blushed',
  'parmesan_&',
  'quite_greasy',
  'orzo',
  'found_quite',
  'nice_salad',
  'apt'])

## Test the 'pipeline' on previously unseen text

Transform the text to a vector in our TF-IDF space and extract the highest scoring uni/bigrams.

In [34]:
# Helper function to transform plain text into a vector in our TF-IDF space
def makeDoc(text, dictionary, stoplist, model):
    """Convert raw text to its representation under `model`.

    The text is lowercased and tokenised with NLTK, extended with bigrams via
    `addBigrams`, converted to bag-of-words with `dictionary.doc2bow`, and
    finally indexed into `model`.

    Args:
        text: the raw input string.
        dictionary: object providing `doc2bow(tokens)`.
        stoplist: stop words passed through to `addBigrams`.
        model: object indexable with a bag-of-words vector (e.g. a TF-IDF model).

    Returns:
        Whatever `model[bow]` yields for the document's bag-of-words vector.
    """
    lowered = text.lower()
    unigrams = nltk.word_tokenize(lowered)
    unigramsAndBigrams = addBigrams(unigrams, stoplist)
    return model[dictionary.doc2bow(unigramsAndBigrams)]

In [35]:
makeDoc('I love cats. I make burgers out of them', dictionaryBi, stoplist, modelTfidfBi)

[(314, 3.1610413832193607),
 (436, 3.2551379727182637),
 (1374, 4.747143003652985),
 (9827, 10.298317190917858)]

In [36]:
dictionaryBi[19944]

'better_burgers'

In [37]:
# Previously unseen text
getTags(makeDoc('''Beautiful inside. Food was quick to come out. We had a good view of the bar from above and the gorgeous glass chandelier
made into the shape of a guitar. Cocktail prices only showed it including the free glass 
which ends up meaning a mocktail costs you £12. So make sure you check out the smaller drinks menu 
at your table which shows the prices with or without the souvenir hard rock cafe glass. ''', 
                dictionaryBi, stoplist, modelTfidfBi), dictionaryBi, 10)

['glass',
 'beautiful_inside',
 'cocktail_prices',
 'chandelier',
 'mocktail',
 'free_glass',
 'guitar',
 'souvenir',
 'good_view',
 'rock_cafe']