[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]

In [1]:
import pandas as pd
import numpy as np
import random


## Read CSV File

In [2]:
# CSV of reviews to analyse -- point this at another file to switch restaurant
id_name = 'The Prince of Wales, London.csv'
data = pd.read_csv(id_name)

# Restaurant name taken from the first row of the file
filename = data.loc[0, 'Restaurant_name']

# The test set receives 1/divide_set of the reviews
divide_set = 3



## If you want to use an existing classifier to read a new review file, load the classifier here
## If you want to save a new classifier, change save_classifier to True

In [3]:
# Path of a previously pickled classifier to load; leave as '' to train a new one
classifier_id = ''
# When True, the save cell pickles the classifier to disk
save_classifier = True

## Label Good and Bad reviews

In [4]:
# 4* and 5* ratings count as positive reviews; 3* and below count as negative
rating = pd.to_numeric(data['review_rating'])
review_text = data['review_content']
positive_reviews = review_text[rating > 3]
negative_reviews = review_text[rating <= 3]
print('Positive: ', len(positive_reviews), 'Negative: ', len(negative_reviews))

Positive:  2283 Negative:  22


# Import tokenizer style and 'Part of Speech' tagging package

In [22]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.tag import pos_tag
import nltk
# Fetch the NLTK resources used by the cells below.
# NOTE(review): the resource-to-function mapping in these comments is assumed -- confirm for your NLTK version.
nltk.download('punkt')  # presumably the sentence/word model behind word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind pos_tag
nltk.download('stopwords')  # corpus read through nltk.corpus.stopwords
nltk.download('wordnet')  # presumably the database behind WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/annapeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annapeng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annapeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annapeng/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Make positive/negative token lists

In [12]:
# Tokenise every review; each entry of the resulting lists is the token list of one review.
# Alternative kept for reference: RegexpTokenizer(r'\w+') would take out the symbols instead.
positive_tokens = [word_tokenize(review) for review in positive_reviews]
negative_tokens = [word_tokenize(review) for review in negative_reviews]

# Checking the length of tokens still match with the no. of reviews

In [13]:
# Sanity check: there should be one token list per review
print(
    'Positive Tokens: ', len(positive_tokens),
    'Negative Tokens: ', len(negative_tokens),
)

Positive Tokens:  2283 Negative Tokens:  22


# print some of Part-of-Speech in tokens

In [16]:
# Tag the first positive review to inspect what the POS tagger produces
first_review_tags = pos_tag(positive_tokens[0])
print(first_review_tags)

[('Friendly', 'JJ'), ('staff', 'NN'), (',', ','), ('great', 'JJ'), ('atmosphere', 'NN'), ('and', 'CC'), ('even', 'RB'), ('better', 'JJR'), ('food', 'NN'), ('and', 'CC'), ('drink', 'NN'), ('.', '.'), ('Massive', 'JJ'), ('Sunday', 'NNP'), ('roast', 'NN'), ('for', 'IN'), ('£12', 'NNP'), ('-', ':'), ('bargain', 'NN'), ('.', '.')]


### Import stemming or lemmatizer, Stopwords, regular expression and special strings

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re, string
from nltk.corpus import stopwords
# English stop words plus two extra tokens to be filtered out during cleaning
add_words = ["...", "'"]
stop_words = stopwords.words('english') + add_words

## Examples of lemmatizer and stemmer
``` python
lemmitizer = WordNetLemmatizer()
stemmer = PorterStemmer()

words = ['good', 'best', 'excellent', 'better', 'swim', 'swam']
for w in words:
    print(w, ':', lemmitizer.lemmatize(w,'a'))
    print(w, ':', stemmer.stem(w))
    print('--------')
```

In [20]:
def cleaned_words(tokens, stop_words):
    """Lemmatize one tokenized review and drop punctuation and stop words.

    Parameters
    ----------
    tokens : list of str
        Word tokens of a single review.
    stop_words : iterable of str
        Words to discard. They are compared against the lower-cased lemma,
        so entries must be lower-case to match.

    Returns
    -------
    list of str
        Lower-cased lemmas, in their original order, excluding punctuation
        tokens and stop words.
    """
    # Build the lemmatizer and the stop-word lookup once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_set = set(stop_words)

    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        # Map the tagger's tag onto the part-of-speech codes passed to the lemmatizer.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:  # every other tag is lemmatized as an adjective
            pos = 'a'
        word = lemmatizer.lemmatize(token, pos)
        lowered = word.lower()

        # `word not in string.punctuation` is a substring test against the punctuation string.
        if len(word) > 0 and word not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [23]:
# Run every tokenized review through cleaned_words, keeping one cleaned list per review
positive_cleaned_tokens_list = [
    cleaned_words(review_tokens, stop_words) for review_tokens in positive_tokens
]
negative_cleaned_tokens_list = [
    cleaned_words(review_tokens, stop_words) for review_tokens in negative_tokens
]

In [24]:
# Peek at up to ten cleaned negative reviews
negative_preview = negative_cleaned_tokens_list[:10]
print(negative_preview)

[['food', 'pretty', 'good', 'beer', 'awful', 'beer', 'taste', 'like', 'watered-down', 'even', "n't", 'manage', 'drink', 'pint'], ['walk', 'place', 'way', 'imperial', 'war', 'museum', 'look', 'good', 'outside', 'lot', 'food', 'tout', 'decent', 'look', 'menu', 'way', 'back', 'elephant', 'castle', 'tube', 'think'], ['food', 'pretty', 'good', 'beer', 'awful', 'beer', 'taste', 'like', 'watered-down', 'even', "n't", 'manage', 'drink', 'pint'], ['walk', 'place', 'way', 'imperial', 'war', 'museum', 'look', 'good', 'outside', 'lot', 'food', 'tout', 'decent', 'look', 'menu', 'way', 'back', 'elephant', 'castle', 'tube', 'think'], ['think', 'go', 'great', 'night', 'read', 'review', '’', 'lively', 'expect', 'sunday', '’', 'big', 'deal', 'scouted', 'outside', 'free', 'shot', 'advertise', 'staff'], ['food', 'good', 'place', 'homey', 'service', 'bit', 'spotty', 'great', 'want', 'drink', 'read', 'book'], ['food', 'good', 'price', 'high', 'london', 'lack', 'service', 'waited', 'long', 'someone', 'come',

# Extracting the most frequent words for sentiment analyses

In [25]:
# Get word frequency

def get_all_words(cleaned_tokens_list):
    """Flatten a list of token lists into a single list of tokens.

    The order is preserved: all tokens of the first inner list come first,
    followed by those of the second, and so on.
    """
    return [word for review in cleaned_tokens_list for word in review]

# Flatten the cleaned reviews of each class into one long word list
all_neg_words = get_all_words(negative_cleaned_tokens_list)
all_pos_words = get_all_words(positive_cleaned_tokens_list)

from nltk import FreqDist

# Count how often each word occurs per class
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

# Keep the 30 most frequent words of each class and show them
main_pos_words = freq_dist_pos.most_common(30)
main_neg_words = freq_dist_neg.most_common(30)

print(main_pos_words)
print(main_neg_words)

[('food', 1851), ('good', 1519), ('great', 1257), ('pub', 1133), ('friendly', 1085), ('service', 807), ('place', 790), ('staff', 740), ('roast', 549), ('best', 539), ('dinner', 515), ('london', 440), ('atmosphere', 439), ('fish', 426), ('prince', 423), ('chip', 420), ('lunch', 419), ('wales', 412), ('recommend', 408), ('local', 399), ('drink', 391), ('family', 388), ('sunday', 387), ('well', 381), ('pie', 350), ('also', 350), ('amaze', 341), ('greet', 327), ('visit', 310), ('lovely', 270)]
[('food', 19), ('good', 8), ('place', 8), ('museum', 7), ('go', 7), ('us', 7), ('drink', 6), ('look', 6), ('pub', 6), ("n't", 5), ('way', 5), ('sunday', 5), ('come', 5), ('beer', 4), ('like', 4), ('imperial', 4), ('war', 4), ('’', 4), ('order', 4), ('roast', 4), ('beef', 4), ('awful', 3), ('taste', 3), ('even', 3), ('outside', 3), ('back', 3), ('think', 3), ('great', 3), ('staff', 3), ('want', 3)]


In [26]:
def get_dict_for_model(cleaned_tokens_list):
    """Lazily yield one feature dict per token list.

    Each yielded dict maps every token of the corresponding list to True.
    """
    for review_tokens in cleaned_tokens_list:
        yield {word: True for word in review_tokens}

# get_dict_for_model returns a one-shot generator. Materialise it as a list so the
# feature dicts survive more than one pass; otherwise re-running a later cell that
# iterates over them would silently see an empty sequence.
positive_tokens_for_model = list(get_dict_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_dict_for_model(negative_cleaned_tokens_list))

```python
for neg_dict in negative_tokens_for_model:  
    print(neg_dict)
```

In [27]:
# Attach the sentiment label to every feature dict
pos_dataset = [(features, "Positive") for features in positive_tokens_for_model]
neg_dataset = [(features, "Negative") for features in negative_tokens_for_model]

random.shuffle(neg_dataset)
random.shuffle(pos_dataset)

# Index at which each class switches from training data to held-out test data
train_fraction = 1 - 1/divide_set
pos_split = round(len(pos_dataset) * train_fraction)
neg_split = round(len(neg_dataset) * train_fraction)

positive_dataset = pos_dataset[:pos_split]
negative_dataset = neg_dataset[:neg_split]

# Everything past the split point of either class goes into the test set
test_set = pos_dataset[pos_split:] + neg_dataset[neg_split:]


In [28]:
# Report how many examples ended up in each split
print(
    'Positive Training Data:', len(positive_dataset),
    'Negative Training Data:', len(negative_dataset),
    'Test_dataset:', len(test_set),
)

Positive Training Data: 1522 Negative Training Data: 15 Test_dataset: 768


In [29]:
# Use the training portion of each class as it is
positive_train = positive_dataset
negative_train = negative_dataset

# Merge both classes, then randomise the order of the training and test examples
training_set = [*positive_train, *negative_train]
random.shuffle(training_set)
random.shuffle(test_set)

## Alternatively this code trains equal number of positive sentiment and negative sentiment by sampling with replacement

```python

training_no = 5000
index1 = np.random.choice(len(positive_dataset), training_no)
index2 = np.random.choice(len(negative_dataset), training_no)

positive_train =[positive_dataset[i] for i in index1]
negative_train =[negative_dataset[i] for i in index2]
training_set = positive_train + negative_train
random.shuffle(training_set)
```

In [30]:
def import_classifier(classifier):
    """Load a pickled classifier from disk.

    Parameters
    ----------
    classifier : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files that you
    created yourself or otherwise trust.
    """
    import pickle
    # The context manager closes the file on every path; previously the handle was never closed.
    with open(classifier, 'rb') as f:
        return pickle.load(f)

In [31]:
from nltk import NaiveBayesClassifier
from nltk import classify
# Train a fresh Naive Bayes model unless classifier_id names a saved classifier to load
if classifier_id == '':
    classifier = NaiveBayesClassifier.train(training_set)
else:
    classifier = import_classifier(classifier_id)

print(classifier.labels())
print("Accuracy is:", classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
classifier.show_most_informative_features(20)

['Positive', 'Negative']
Accuracy is: 0.75390625
Most Informative Features
                 thought = True           Negati : Positi =     95.2 : 1.0
                    onto = True           Negati : Positi =     95.2 : 1.0
                purchase = True           Negati : Positi =     95.2 : 1.0
                 kitchen = True           Negati : Positi =     95.2 : 1.0
                   gravy = True           Negati : Positi =     95.2 : 1.0
                 explore = True           Negati : Positi =     95.2 : 1.0
               elsewhere = True           Negati : Positi =     95.2 : 1.0
                     due = True           Negati : Positi =     95.2 : 1.0
                   homey = True           Negati : Positi =     95.2 : 1.0
           unfortunately = True           Negati : Positi =     95.2 : 1.0
               direction = True           Negati : Positi =     95.2 : 1.0
                thursday = True           Negati : Positi =     95.2 : 1.0
                 someone 

### Confusion Matrix

In [32]:
from nltk.metrics import ConfusionMatrix
# Split the held-out (features, label) pairs into model inputs and reference labels
test_tag = [features for features, _label in test_set]
test_label = [label for _features, label in test_set]

# Predict a label for every test example and tabulate reference vs. prediction
model_label = classifier.classify_many(test_tag)
cm = ConfusionMatrix(test_label, model_label)
cm_table = cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9)
print(cm_table)

         |      P      N |
         |      o      e |
         |      s      g |
         |      i      a |
         |      t      t |
         |      i      i |
         |      v      v |
         |      e      e |
---------+---------------+
Positive | <74.5%> 24.6% |
Negative |      .  <0.9%>|
---------+---------------+
(row = reference; col = test)



## Save the classifier

In [33]:
if save_classifier:
    import pickle
    # Build the output path in its own variable: overwriting `filename` made this cell
    # non-idempotent (a second run would write '<name>.pickle.pickle').
    classifier_path = filename + '.pickle'
    # The context manager closes the file even if pickling raises.
    with open(classifier_path, 'wb') as f:
        pickle.dump(classifier, f)

### just randomly exploring how to use ngram. irrelevant to the exercise

```python
from nltk import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 6
sixgrams = ngrams(sentence.split(), n)

for grams in sixgrams:
    print(grams)
    
```