# Read the CSV files

In [1]:
import pandas as pd
import numpy as np
import random
#training_no = 5000 # iterative training randomly selected from training set
data = pd.read_csv('The Golden Chippy.csv')
filename = data['Restaurant_name'][0]
divide_set = 3 # proportion of test set = 1/divide_set
classifier_id = ''
save_classifier = True

# Label Good and Bad reviews

In [2]:
rating = pd.to_numeric(data['review_rating'])
positive_reviews = data['review_title'][rating > 3]
negative_reviews = data['review_title'][rating <= 3]
print('Positive: ', len(positive_reviews), 'Negative: ', len(negative_reviews))

Positive:  2600 Negative:  55


# Import tokenizer style and 'Part of Word' taggig package

In [3]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.tag import pos_tag

# Make positive/negative token lists

In [4]:
positive_tokens = list()
negative_tokens = list()
#tokenizer = RegexpTokenizer(r'\w+') # regular expression to take out the symbols

for pos_sentence  in positive_reviews:
    add_pos_sentence = word_tokenize(pos_sentence)
    positive_tokens.append(add_pos_sentence)
    
for neg_sentence in negative_reviews:
    add_neg_sentence = word_tokenize(neg_sentence)
    negative_tokens.append(add_neg_sentence)

# Checking the length of tokens still match with the no. of reviews

In [5]:
print('Positive Tokens: ', len(positive_tokens), 'Negative Tokens: ', len(negative_tokens))

Positive Tokens:  2600 Negative Tokens:  55


# Part-of-Speech in tokens

print(pos_tag(positive_tokens[0]))

# Import stemming or lemmatizer
# Import Stopwords
# import regular expression and special strings

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re, string
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [7]:
lemmitizer = WordNetLemmatizer()
stemmer = PorterStemmer()

words = ['good', 'best', 'excellent', 'better', 'swim', 'swam']
for w in words:
    print(w, ':', lemmitizer.lemmatize(w, 'a'))
    print(w, ':', stemmer.stem(w))
    print('--------')

good : good
good : good
--------
best : best
best : best
--------
excellent : excellent
excellent : excel
--------
better : good
better : better
--------
swim : swim
swim : swim
--------
swam : swam
swam : swam
--------


In [8]:
def cleaned_words (tokens, stop_words): # lemmatize sentence, omit punctuation and stop words such as preposition
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else: # all the rest tagged with a
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        word = lemmatizer.lemmatize(token, pos)
        
        if len(word) > 0 and word not in string.punctuation and word.lower() not in stop_words:
            cleaned_tokens.append(word.lower())
    return cleaned_tokens

In [9]:
positive_cleaned_tokens_list=list()
negative_cleaned_tokens_list=list()
for tokens in positive_tokens:
    positive_cleaned_tokens_list.append(cleaned_words(tokens, stop_words))

for tokens in negative_tokens:
    negative_cleaned_tokens_list.append(cleaned_words(tokens, stop_words))

# Extracting the most frequent words for sentiment analyses (or all words) 

In [25]:
# Get word frequecy

def get_all_words(cleaned_tokens_list): 
    all_words = []
    for tokens in cleaned_tokens_list:
        for token in tokens:
            all_words.append(token)
    return all_words

all_neg_words = get_all_words(negative_cleaned_tokens_list)
all_pos_words = get_all_words(positive_cleaned_tokens_list)

from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)
main_pos_words = freq_dist_pos.most_common(30)
main_neg_words = freq_dist_neg.most_common(30)

#freq_pos_words = [main_pos_words[i][0] for i in range(0, len(main_pos_words))]
#freq_neg_words = [main_neg_words[i][0] for i in range(0, len(main_neg_words))]

print(freq_dist_pos.most_common(30))
print(freq_dist_neg.most_common(30))

['fish', 'best', 'chip', 'chips', 'london', 'fresh', 'food', 'good', 'chippy', 'great', 'ever', 'amazing', '...', 'shop', 'n', 'tasty', 'one', 'life', 'birthday', 'cook', 'south', 'thank', 'amzing', 'halal', 'chris', 'variety', 'adam', 'catalina', 'greenwich/stamford', 'ct']
['fish', 'chip', 'food', 'average', 'good', 'staff', 'bad', 'friendly', 'ok', 'chippy', 'nothing', 'review', 'rude', 'rat', 'star', '...', 'london', 'disappointed', 'great', 'fresh', 'special', 'nice', 'non', 'chicken', 'cooked', 'dry', 'many', 'shin', 'quite', 'spot']


In [10]:
def get_dict_for_model(cleaned_tokens_list):
    for list_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in list_tokens)

positive_tokens_for_model = get_dict_for_model(positive_cleaned_tokens_list) 
negative_tokens_for_model = get_dict_for_model(negative_cleaned_tokens_list) 

for neg_dict in negative_tokens_for_model:  
    print(neg_dict)

In [11]:
pos_dataset = [(dict_word, "Positive") 
               for dict_word in positive_tokens_for_model]
neg_dataset = [(dict_word, "Negative") 
               for dict_word in negative_tokens_for_model]
random.shuffle(neg_dataset)
random.shuffle(pos_dataset)
all_data = neg_dataset + pos_dataset
positive_dataset = pos_dataset[0:round(len(pos_dataset)*(1-1/divide_set))]
negative_dataset = neg_dataset[0:round(len(neg_dataset)*(1-1/divide_set))]
test_dataset = pos_dataset[round(len(pos_dataset)*(1-1/divide_set)): len(pos_dataset)] + neg_dataset[round(len(neg_dataset)*(1-1/divide_set)): len(neg_dataset)]


In [12]:
print('Positive Training Data: ', len(positive_dataset), 'Negative Training Data: ', len(negative_dataset),
     'Test_dataset: ', len(test_dataset))

Positive Training Data:  1733 Negative Training Data:  37 Test_dataset:  885


In [13]:
positive_train=[]
negative_train=[]
training_set = []

#index1 = np.random.choice(len(positive_dataset), training_no)
#index2 = np.random.choice(len(negative_dataset), training_no)

positive_train = positive_dataset#[positive_dataset[i] for i in index1]
negative_train = negative_dataset#[negative_dataset[i] for i in index2]
training_set = positive_train + negative_train
random.shuffle(training_set)


In [14]:
def import_classifier(classifier):
    import pickle
    f = open(classifier, 'rb')
    classifier = pickle.load(f)
    return classifier
    #f.close()

In [15]:
from nltk import NaiveBayesClassifier
from nltk import classify
classifier = NaiveBayesClassifier.train(training_set) if classifier_id =='' else import_classifier(classifier_id)

print("Accuracy is:", classify.accuracy(classifier, test_dataset))
print(classifier.show_most_informative_features(20))

Accuracy is: 0.9254237288135593
Most Informative Features
                 average = True           Negati : Positi =    136.9 : 1.0
                     bag = True           Negati : Positi =     45.6 : 1.0
                    term = True           Negati : Positi =     45.6 : 1.0
                     bit = True           Negati : Positi =     45.6 : 1.0
                    rest = True           Negati : Positi =     45.6 : 1.0
                    many = True           Negati : Positi =     45.6 : 1.0
                    know = True           Negati : Positi =     45.6 : 1.0
                    deal = True           Negati : Positi =     45.6 : 1.0
                 nothing = True           Negati : Positi =     45.6 : 1.0
                  highly = True           Negati : Positi =     45.6 : 1.0
            disappointed = True           Negati : Positi =     45.6 : 1.0
                    long = True           Negati : Positi =     45.6 : 1.0
                    time = True           

In [27]:
NaiveBayesClassifier.prob_classify(training_set)

TypeError: prob_classify() missing 1 required positional argument: 'featureset'

In [16]:
if save_classifier == True:
    import pickle
    seperator = ''
    filename = seperator.join([filename, '.pickle'])
    f = open(filename, 'wb')
    pickle.dump(classifier, f)
    f.close()

from nltk import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 6
sixgrams = ngrams(sentence.split(), n)

for grams in sixgrams:
    print(grams)