## Importing necessary libraries

In [25]:
import pandas as pd
import re
import inflect
import numpy as np
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Read data into pandas DataFrames
reviews = pd.read_csv('./train_labelled.csv')
# reviews = pd.read_csv('./train_cleaned.csv')

In [3]:
reviews.isnull().any()

Restaurant    False
Review        False
Label         False
Stars         False
Date          False
dtype: bool

In [4]:
reviews.shape

(1926, 5)

In [5]:
reviews.head()

Unnamed: 0,Restaurant,Review,Label,Stars,Date
0,Happy Tummy,"Fresh ingredients, friendly peeps and so much ...",1,5,2016-04-06T00:00:00
1,Cibo Italiano,A small selection of Italian wines by the glas...,1,4,2015-12-24T00:00:00
2,Yan kee Noodle House,The plus point is that the price remains the s...,1,4,2018-12-28T00:00:00
3,Clinton Street Baking Company & Restaurant,Same for more?I ordered what I thought was the...,1,2,2018-03-03T00:00:00
4,Song Fa Bak Kut Teh,I will probably get that again.Have been very ...,5,5,2019-01-05T00:00:00


## Data Preprocessing

In [6]:
# Cleaning the Data
def clean(data):

    # Removing leading and trailing white spaces
    data = str(data).strip()
    
    # Converting all text to lower case
    data = data.lower() 
    
    # add space for punctuation
    translator = re.compile('[%s]' % re.escape(string.punctuation))
    translator.sub(' ', data)

    # replace $ sign with 'dollar'
    data = data.replace('$', 'dollar ')
    #remove punct
    data = re.sub('[^A-Za-z0-9]+', ' ', data)
    
    #keep english words
    data = re.sub(r'[^\x00-\x7F]+', '', data)
    
    #keep printable
    data = re.sub(f'[^{re.escape(string.printable)}]', '', data)

    # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(data)

    # Converting all text to base form
    lemmatizer = WordNetLemmatizer()
    tokens_lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
            
    # Removing Punctuation
#     translator = str.maketrans('', '', string.punctuation)
#     data = data.translate(translator)

         # Converting all numbers to words
#    word = data.split(' ')
#     p = inflect.engine()
#     word = ' '.join([i for i in word if not i.isdigit()])
#     word = re.sub(r'\d+', p.number_to_words(word) , word)    

    word = ' '.join([i for i in tokens_lemmatized])

    return word

# clean text data
reviews["Review_clean"] = reviews["Review"].apply(lambda x: clean(x))
reviews

Unnamed: 0,Restaurant,Review,Label,Stars,Date,Review_clean
0,Happy Tummy,"Fresh ingredients, friendly peeps and so much ...",1,5,2016-04-06T00:00:00,fresh ingredient friendly peep and so much che...
1,Cibo Italiano,A small selection of Italian wines by the glas...,1,4,2015-12-24T00:00:00,a small selection of italian wine by the glass...
2,Yan kee Noodle House,The plus point is that the price remains the s...,1,4,2018-12-28T00:00:00,the plus point is that the price remains the s...
3,Clinton Street Baking Company & Restaurant,Same for more?I ordered what I thought was the...,1,2,2018-03-03T00:00:00,same for more i ordered what i thought wa thei...
4,Song Fa Bak Kut Teh,I will probably get that again.Have been very ...,5,5,2019-01-05T00:00:00,i will probably get that again have been very ...
5,Brawn & Brains Coffee,I don't get why it can't just be full service....,1,3,2019-12-17T00:00:00,i don t get why it can t just be full service ...
6,No. 18 Zion Road Char Kway Teow,I stood in line patiently and got a large plat...,1,3,2016-11-21T00:00:00,i stood in line patiently and got a large plat...
7,Lawry's The Prime Rib Singapore,Price is on the high side.,1,4,2013-08-13T00:00:00,price is on the high side
8,Old Airport Road Food Centre,And yay for mantous.. 50cents each.,1,4,2013-03-05T00:00:00,and yay for mantous 50cents each
9,Dolce Vita,Good things never come cheap and I have to agr...,1,4,2012-08-23T00:00:00,good thing never come cheap and i have to agre...


In [7]:
# To get unique restaurant names
restaurant_names = []
for rest in reviews["Restaurant"]: # each restaurant name
    if rest not in restaurant_names: 
        restaurant_names.append(rest)

print(len(restaurant_names)) # number of restaurants

501


In [34]:
# To get restaurant name and respective reviews (separately including date)
reviewsByRestaurant = dict.fromkeys(restaurant_names, "")
for rest in restaurant_names: 
    filtered= []
    reviewsByRestaurant[rest] = [] # to contain a list of sentences for that restaurant    
    filtered = reviews[reviews['Restaurant'] == rest]
    for i in range(len(filtered)): 
        review_info = {} # to contain key value pairs of label: sentence, date: date
        review_info['Review'] = filtered.iloc[i]['Review_clean']
        review_info['Date'] = filtered.iloc[i]['Date']
        review_info['Label'] = filtered.iloc[i]['Label']
        reviewsByRestaurant[rest].append(review_info)

print(reviewsByRestaurant)



In [35]:
# To get df of restaurant and labels with date
rows = []

for rest in restaurant_names:
    for review_info in reviewsByRestaurant[rest]:
        review_info['Restaurant'] = rest
        rows.append(review_info)

reviews_by_restaurant = pd.DataFrame(rows)
reviews_by_restaurant = reviews_by_restaurant[['Restaurant', 'Review', 'Date', 'Label']]

reviews_by_restaurant.head()

Unnamed: 0,Restaurant,Review,Date,Label
0,Happy Tummy,fresh ingredient friendly peep and so much che...,2016-04-06T00:00:00,1
1,Cibo Italiano,a small selection of italian wine by the glass...,2015-12-24T00:00:00,1
2,Cibo Italiano,cultural relevant singaporean cuisine in very ...,2018-08-19T00:00:00,4
3,Cibo Italiano,generous with the clam,2016-11-28T00:00:00,1
4,Yan kee Noodle House,the plus point is that the price remains the s...,2018-12-28T00:00:00,1


In [36]:
reviews_by_restaurant.head()

Unnamed: 0,Restaurant,Review,Date,Label
0,Happy Tummy,fresh ingredient friendly peep and so much che...,2016-04-06T00:00:00,1
1,Cibo Italiano,a small selection of italian wine by the glass...,2015-12-24T00:00:00,1
2,Cibo Italiano,cultural relevant singaporean cuisine in very ...,2018-08-19T00:00:00,4
3,Cibo Italiano,generous with the clam,2016-11-28T00:00:00,1
4,Yan kee Noodle House,the plus point is that the price remains the s...,2018-12-28T00:00:00,1


In [None]:
reviews_by_restaurant.to_csv("C:\Users\seagate pc\Desktop/BT4222/Group Project/Data_Mining/Project/SentimentAnalysis/data_sentiment.csv")

In [10]:
# To get restaurant name and respective reviews (concatenated)
reviewsByRestaurant = dict.fromkeys(restaurant_names, "")
for rest in restaurant_names: 
    filtered= []
    reviewsByRestaurant[rest] = {} # create a dictionary for sentences (key = label)
    labels = []
    for label in reviews['Label'][reviews['Restaurant'] == rest]:
        if label not in labels:
            labels.append(label)
#     for cat in labels: 
    for i in range(1, 6):
        reviewsByRestaurant[rest][i] = '' # to compile sentences for a label in a restaurant
        for j in labels:
            if i == j:
                filtered= reviews['Review_clean'][(reviews['Restaurant'] == rest) & (reviews['Label'] == j)] # obtain reviews for a label in a restaurant
                for rev in filtered:
                    reviewsByRestaurant[rest][j] += rev

reviewsByRestaurant

{'Happy Tummy': {1: 'fresh ingredient friendly peep and so much cheaper than salad stop a single get you a base of romaine and 5 topping for dollar 6 80',
  2: '',
  3: '',
  4: '',
  5: ''},
 'Cibo Italiano': {1: 'a small selection of italian wine by the glass and beer a well a a good wine listgenerous with the clam',
  2: '',
  3: '',
  4: 'cultural relevant singaporean cuisine in very sweet old conservation bldg',
  5: ''},
 'Yan kee Noodle House': {1: 'the plus point is that the price remains the same at dollar 4 per bowl',
  2: '',
  3: '',
  4: '',
  5: ''},
 'Clinton Street Baking Company & Restaurant': {1: 'same for more i ordered what i thought wa their signature omelette because it had their name on itit looked awful and for dollar 18 i wa not happyall were excellent and the portion are massive you certainly won t go hungry',
  2: 'each component from the just cooked spinach to the creamy hollandaise sauce to the rich crab cake wa just plain deliciousthe jalapeo cornbread wa 

In [11]:
# To get df of restaurant and labels with date
reviews_by_restaurant = pd.DataFrame.from_dict(reviewsByRestaurant)
reviews_by_restaurant = reviews_by_restaurant.transpose()
reviews_by_restaurant = reviews_by_restaurant.reset_index()
# reviews_by_restaurant = reviews_by_restaurant.rename(columns={"index": "Restaurant"})
reviews_by_restaurant.columns =['Restaurant', 'Value', 'Taste', 'Service', 'Ambience', 'Others'] 

reviews_by_restaurant

Unnamed: 0,Restaurant,Value,Taste,Service,Ambience,Others
0,Happy Tummy,fresh ingredient friendly peep and so much che...,,,,
1,Cibo Italiano,a small selection of italian wine by the glass...,,,cultural relevant singaporean cuisine in very ...,
2,Yan kee Noodle House,the plus point is that the price remains the s...,,,,
3,Clinton Street Baking Company & Restaurant,same for more i ordered what i thought wa thei...,each component from the just cooked spinach to...,service wa prompt but i guess a true test of t...,,place is a staplewould go back once in a while
4,Song Fa Bak Kut Teh,a large portion is around dollar 10 singaporea...,we ordered some vegetable for the table bowl o...,can go up to an hour waitqueue can get pretty ...,located at the bukit timah saddle club we take...,i will probably get that again have been very ...
5,Brawn & Brains Coffee,i don t get why it can t just be full service ...,,,this place is pretty cute and definitely remin...,
6,No. 18 Zion Road Char Kway Teow,i stood in line patiently and got a large plat...,if you want solid noodle you can go here and i...,,,
7,Lawry's The Prime Rib Singapore,price is on the high side,a cut above the rest lawry s the prime rib wha...,our main course came soon after the chef rolle...,place is always crowded with poor ventilation,it turned out to be hilarious indeed and then ...
8,Old Airport Road Food Centre,and yay for mantous 50cents eachhighly recomme...,lastly grabbed a hot red paste bun from the on...,plenty of seating both inside and out,this is one of the many hawker center in singa...,with endless option such a prawn noodle wonton...
9,Dolce Vita,good thing never come cheap and i have to agre...,,the lady at the station wa super friendly and ...,very good ambience a well with pool side view ...,


## Classification

In [13]:
X = reviews.Review_clean # feature matrix
y = reviews.Stars # response vector

X.head()

0    fresh ingredient friendly peep and so much che...
1    a small selection of italian wine by the glass...
2    the plus point is that the price remains the s...
3    same for more i ordered what i thought wa thei...
4    i will probably get that again have been very ...
Name: Review_clean, dtype: object

In [14]:
print(X[1])

a small selection of italian wine by the glass and beer a well a a good wine list


In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [16]:
# Define a function that accepts a vectorizer, a model and calculates the accuracy
def tokenize_predict(vect, model):
    
    # create document-term matrices using the vectorizer
    x_train_dtm = vect.fit_transform(x_train)
    x_test_dtm = vect.transform(x_test)
    
    # print the number of features that were generated
    print('Features: ', x_train_dtm.shape[1])
    
    model.fit(x_train_dtm, y_train)
    
    trainpred = model.predict(x_train_dtm) # for train data
    testpred = model.predict(x_test_dtm) # for test data
    
    # print the accuracy of predictions
    print('Train Accuracy: ', metrics.accuracy_score(y_train, trainpred))
    print('Test Accuracy: ', metrics.accuracy_score(y_test, testpred))
    print('Train AUROC: ', metrics.roc_auc_score(y_train, trainpred))
    print('Test AUROC: ', metrics.roc_auc_score(y_test, testpred))

In [17]:
# Lemmatize words
class LemmaTokenizer:
     def __init__(self):
         self.wnl = WordNetLemmatizer()

     def __call__(self, doc):
         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [22]:
# Import and instantiate Count Vectorizer
vect = CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=LemmaTokenizer(),
                vocabulary=None)

In [26]:
logreg = LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
# (setting multi_class=”ovr”)
# (setting multi_class="multinomial")

In [27]:
dtm = vect.fit_transform(reviews_by_restaurant['compiled'])
dtm.shape
dtm

KeyError: 'compiled'

In [None]:
features = vect.get_feature_names()
features

## Word Embeddings

In [12]:
from gensim.models import Word2Vec

all_sentences = [nltk.sent_tokenize(txt) for txt in x_train] # splits into sentences
all_sentences

NameError: name 'x_train' is not defined

In [None]:
all_words = []

# for sentence in all_sentences: 
#     for text in sentence:
        
all_words = [nltk.word_tokenize(text) for sentence in all_sentences for text in sentence ]
all_words

In [None]:
word2vec = Word2Vec(text, min_count=1,size= 50,workers=3, window =3, sg = 1)

In [None]:
vocabulary = word2vec.wv
print(vocabulary)

## Getting TFIDF scores

In [None]:
# To get frequent words using TFIDF

word_scores = {}

def summarize(review_id): # works for each row

    # choose a random review that is at least 400 characters - keep looping until review has > 400 words 
#     review_length = 0
#     while review_length < 400:
#         review_id = np.random.randint(0, data.shape[0])
    review_text = reviews_by_restaurant.loc[review_id, 'compiled']
#         review_length = len(review_text)
    
    # create a list of all unique words in the review (minus stop words) using CountVectorizer
    vect = CountVectorizer(stop_words='english')
    vect.fit([review_text])
    unique_words = vect.get_feature_names()

    # create a dictionary of words and their TF-IDF scores
#     word_scores = {}
    for word in unique_words:
        word_scores[word] = dtm[review_id, features.index(word)]
    
    # print words with the top 5 TF-IDF scores
    print('TOP TF-IDF SCORING WORDS:')
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[0:5]
    for word, score in top_scores:
        print(word)
    
    # print 5 random words (for comparison)
    print('\n' + 'RANDOM WORDS:')
    random_words = np.random.choice(list(word_scores.keys()), size=5, replace=False)
    for word in random_words:
        print(word)
    
    # print the review
    print('\n' + review_text)

In [None]:
for i in range(len(reviews_by_restaurant)): 
    summarize(i)

In [None]:
word_scores