In [None]:
import pandas as pd
import numpy as np
import random


## Read CSV File

In [None]:
# Change this to the review file you want to read
id_name = 'The Prince of Wales, London.csv'

# Load the reviews; later cells read the 'Restaurant_name', 'review_rating'
# and 'review_content' columns of this frame.
data = pd.read_csv(id_name)
# Restaurant name from the first row; used as the base name when the classifier is pickled.
filename = data['Restaurant_name'][0]
divide_set = 3 # proportion of test set = 1/divide_set



## To classify a new review file with an existing classifier, set `classifier_id` to the path of its pickle file
## To save a newly trained classifier, change `save_classifier` to True

In [None]:
# Path of a pickled classifier to load; leave as '' to train a new one from the review file.
classifier_id = ''
# When True, the classifier is pickled to '<Restaurant_name>.pickle' at the end of the notebook.
save_classifier = True

## Label Good and Bad reviews

In [None]:
# Coerce the rating column to a numeric dtype before thresholding it.
rating = pd.to_numeric(data['review_rating'])
# 4* and 5* count as good reviews, 3* and below as bad ones.
positive_reviews = data.loc[rating > 3, 'review_content']
negative_reviews = data.loc[rating <= 3, 'review_content']
print('Positive: ', len(positive_reviews), 'Negative: ', len(negative_reviews))

# Import the tokenizers and the 'Part of Speech' tagging package

In [None]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.tag import pos_tag

# Make positive/negative token lists

In [None]:
# Tokenize every review with word_tokenize; each review becomes one list of tokens,
# so the outer lists stay aligned with the review Series.
# (RegexpTokenizer(r'\w+') could be used here instead to strip the symbols.)
positive_tokens = [word_tokenize(review) for review in positive_reviews]
negative_tokens = [word_tokenize(review) for review in negative_reviews]

# Checking the length of tokens still match with the no. of reviews

In [None]:
# Sanity check: one token list per review, so these counts should equal the review counts.
print('Positive Tokens: ', len(positive_tokens), 'Negative Tokens: ', len(negative_tokens))

# print some of Part-of-Speech in tokens

In [None]:
# Show the (token, tag) pairs produced by the POS tagger for the first positive review.
print(pos_tag(positive_tokens[0]))

### Import stemming or lemmatizer, Stopwords, regular expression and special strings

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re, string
from nltk.corpus import stopwords
# Extra tokens to discard on top of NLTK's English stop-word list.
add_words = ["...", "'"]
stop_words = stopwords.words('english') + add_words

## Examples of lemmatizer and stemmer
``` python
lemmitizer = WordNetLemmatizer()
stemmer = PorterStemmer()

words = ['good', 'best', 'excellent', 'better', 'swim', 'swam']
for w in words:
    print(w, ':', lemmitizer.lemmatize(w,'a'))
    print(w, ':', stemmer.stem(w))
    print('--------')
```

In [None]:
def cleaned_words(tokens, stop_words):
    """Lemmatize one tokenized review and drop punctuation and stop words.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single review; they are POS-tagged here to pick the lemma type.
    stop_words : iterable of str
        Words to discard. Matching is done against the lower-cased lemma, so
        entries should be lower case.

    Returns
    -------
    list of str
        Lower-cased lemmas of the surviving tokens, in their original order.
    """
    lemmatizer = WordNetLemmatizer()  # built once per call instead of once per token
    stop_set = set(stop_words)  # constant-time lookups instead of scanning a list per token
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:  # every other tag is lemmatized as an adjective
            pos = 'a'
        word = lemmatizer.lemmatize(token, pos)
        lowered = word.lower()

        # `in string.punctuation` is a substring test against the punctuation characters.
        if len(word) > 0 and word not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [None]:
# Clean every tokenized review, keeping one cleaned token list per review.
positive_cleaned_tokens_list = [cleaned_words(review_tokens, stop_words) for review_tokens in positive_tokens]
negative_cleaned_tokens_list = [cleaned_words(review_tokens, stop_words) for review_tokens in negative_tokens]

In [None]:
# Peek at the cleaned tokens of the first ten negative reviews.
print(negative_cleaned_tokens_list[:10])

# Extracting the most frequent words for sentiment analyses

In [None]:
# Get word frequency

def get_all_words(cleaned_tokens_list):
    """Flatten a list of token lists into a single list, preserving order."""
    return [token for review_tokens in cleaned_tokens_list for token in review_tokens]

from nltk import FreqDist

# Pool the cleaned tokens of each class, then count how often every word occurs.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

# The 30 most frequent words of each class.
main_pos_words = freq_dist_pos.most_common(30)
main_neg_words = freq_dist_neg.most_common(30)

print(main_pos_words)
print(main_neg_words)

In [None]:
def get_dict_for_model(cleaned_tokens_list):
    """Lazily yield one ``{token: True}`` feature dict per cleaned review."""
    for review_tokens in cleaned_tokens_list:
        yield dict.fromkeys(review_tokens, True)

# Materialize the feature dicts as lists. A bare generator can be consumed only once, so
# re-running a later cell (or looping over it just to inspect the dicts) would silently
# leave it empty and produce empty datasets.
positive_tokens_for_model = list(get_dict_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_dict_for_model(negative_cleaned_tokens_list))

```python
for neg_dict in negative_tokens_for_model:  
    print(neg_dict)
```

In [None]:
# Attach the class label to every feature dict.
pos_dataset = [(features, "Positive") for features in positive_tokens_for_model]
neg_dataset = [(features, "Negative") for features in negative_tokens_for_model]

random.shuffle(neg_dataset)
random.shuffle(pos_dataset)

# Hold out the last 1/divide_set of each class for testing; the rest is used for training.
train_fraction = 1-1/divide_set
pos_split = round(len(pos_dataset)*train_fraction)
neg_split = round(len(neg_dataset)*train_fraction)

positive_dataset = pos_dataset[:pos_split]
negative_dataset = neg_dataset[:neg_split]

test_set = pos_dataset[pos_split:] + neg_dataset[neg_split:]


In [None]:
# Report how many examples are in each class's training split and in the combined test set.
print('Positive Training Data:', len(positive_dataset), 'Negative Training Data:', len(negative_dataset),
     'Test_dataset:', len(test_set))

In [None]:
# Train on every available example of both classes (no re-balancing of the classes).
positive_train = positive_dataset
negative_train = negative_dataset
training_set = positive_train + negative_train

# Shuffle both sets in place so that neither is ordered by class.
random.shuffle(training_set)
random.shuffle(test_set)

## Alternatively, this code trains on an equal number of positive and negative reviews by sampling each class with replacement

```python

training_no = 5000
index1 = np.random.choice(len(positive_dataset), training_no)
index2 = np.random.choice(len(negative_dataset), training_no)

positive_train =[positive_dataset[i] for i in index1]
negative_train =[negative_dataset[i] for i in index2]
training_set = positive_train + negative_train
random.shuffle(training_set)
```

In [None]:
def import_classifier(classifier):
    """Load a previously pickled classifier from disk.

    Parameters
    ----------
    classifier : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The object that was pickled into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading; only open
    pickle files that come from a trusted source.
    """
    import pickle
    # The context manager guarantees the file handle is closed, even if unpickling raises.
    with open(classifier, 'rb') as f:
        return pickle.load(f)

In [None]:
from nltk import NaiveBayesClassifier
from nltk import classify
# Train a fresh Naive Bayes model unless the path of a saved classifier was supplied.
if classifier_id == '':
    classifier = NaiveBayesClassifier.train(training_set)
else:
    classifier = import_classifier(classifier_id)
print(classifier.labels())
print("Accuracy is:", classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so it must not be wrapped in print() (that would add a stray "None" line).
classifier.show_most_informative_features(20)

### Confusion Matrix

In [None]:
from nltk.metrics import ConfusionMatrix
# Separate the held-out examples into their feature dicts and their true labels.
test_tag = [features for features, _ in test_set]
test_label = [label for _, label in test_set]
# Predict a label for every test review and tabulate true vs. predicted labels.
model_label = classifier.classify_many(test_tag)
cm = ConfusionMatrix(test_label, model_label)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

## Save the classifier

In [None]:
# Pickle the classifier to '<Restaurant_name>.pickle' when saving is enabled.
if save_classifier:
    import pickle
    # Append the extension only once, so re-running this cell does not
    # write to 'name.pickle.pickle'.
    if not filename.endswith('.pickle'):
        filename = filename + '.pickle'
    # The context manager closes the file even if pickling fails part-way through.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

### just randomly exploring how to use ngram. irrelevant to the exercise

```python
from nltk import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 6
sixgrams = ngrams(sentence.split(), n)

for grams in sixgrams:
    print(grams)
    
```