# Importing Required Libraries

In [13]:
!pip install wordcloud
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
import nltk
# Fetch the NLTK data packages in a single call: the stopword list, WordNet,
# the Punkt tokenizer models, the Open Multilingual WordNet and the VADER
# lexicon. Packages that are already present are reported as up-to-date.
nltk.download([
    'stopwords', 'wordnet',
    'punkt', 'omw-1.4', 'vader_lexicon'
])
%matplotlib inline



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BUSINESS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BUSINESS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BUSINESS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\BUSINESS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\BUSINESS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
#Converting a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn import metrics

## Loading the Data

In [15]:
# Read the reviews CSV from the notebook's working directory (relative path).
data = pd.read_csv("tourist_accommodation_reviews.csv")
# (rows, columns) of the loaded frame.
data.shape

(53644, 5)

In [16]:
# Preview the first five rows to check the columns and the review text.
data.head()

Unnamed: 0,ID,Review Date,Location,Hotel/Restaurant name,Review
0,rn579778340,Reviewed 1 week ago,Kathu,Thong Dee The Kathu Brasserie,Just been for sunday roast lamb and beef truly...
1,rn576350875,Reviewed 3 weeks ago,Kathu,Thong Dee The Kathu Brasserie,"Quietly set off the main road, nice atmosphere..."
2,rn574921678,Reviewed 4 weeks ago,Kathu,Thong Dee The Kathu Brasserie,I made a reservation for a birthday two days i...
3,rn572905503,"Reviewed April 12, 2018",Kathu,Thong Dee The Kathu Brasserie,We visit here regularly and never fail to be i...
4,rn572364712,"Reviewed April 10, 2018",Kathu,Thong Dee The Kathu Brasserie,Visited this wonderful place on my travels and...


In [17]:
# Number of distinct values in the 'Hotel/Restaurant name' column.
data['Hotel/Restaurant name'].nunique()

537

In [18]:
# Number of distinct values in the 'Location' column.
data['Location'].nunique()

25

## Check for Null values 

In [19]:
# Count missing values per column.
data.isnull().sum()

ID                       0
Review Date              0
Location                 0
Hotel/Restaurant name    0
Review                   0
dtype: int64

## Check on Total Reviews per Restaurant

In [20]:
# Count the reviews for each hotel/restaurant and list the most-reviewed first.
# The restaurant name is moved out of the group index into a regular column
# before sorting by the count.
HotelReviewCount = (
    data
    .groupby(['Hotel/Restaurant name'])
    .agg(Total_Reviews=('Review', 'count'))
    .reset_index()
    .sort_values(by="Total_Reviews", ascending=False)
)
HotelReviewCount

Unnamed: 0,Hotel/Restaurant name,Total_Reviews
110,Da Mario,279
298,No.6 Restaurant,200
383,Sabai Sabai,200
483,The Pizza Company,198
470,The Family Restaurant,196
...,...,...
433,Sultan's Grill Authentic Turkish & Indian Cuisine,91
0,+39 Italian Street Food,91
93,Chez Nicolas,90
115,Dada Yura Restaurant,80


## Pre-Processing reviews to get a cleaned and processed version to analyze

In [21]:
#Pre-Processing reviews to get a cleaned and processed version to analyze
!pip install contractions
!pip install spacy
import re

import contractions
import spacy
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [22]:
# Work on a separate one-column frame that holds only the review text.
reviews = pd.DataFrame(data['Review'])

In [23]:
# Show the review-only frame (the notebook renders a truncated preview).
reviews

Unnamed: 0,Review
0,Just been for sunday roast lamb and beef truly...
1,"Quietly set off the main road, nice atmosphere..."
2,I made a reservation for a birthday two days i...
3,We visit here regularly and never fail to be i...
4,Visited this wonderful place on my travels and...
...,...
53639,"I love this small restaurant, for the great fo..."
53640,We stopped at this restaurant after shopping a...
53641,"Great times ,This is one of the best restauran..."
53642,"Bite in, it has become my favorite restaurant ..."


In [24]:
# Download and install the small English spaCy pipeline so that
# spacy.load('en_core_web_sm') can find it.
spacy.cli.download('en_core_web_sm')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
# spaCy English pipeline; spacy_lemmatize_text reads this module-level object.
nlp = spacy.load('en_core_web_sm')
# Porter stemmer instance.
# NOTE(review): `ps` is not referenced by any cleaning function in this
# section - confirm it is used later in the notebook or remove it.
ps = nltk.porter.PorterStemmer()

## Defining Functions for Data Cleaning and Pre-processing

In [26]:
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    This is a thin wrapper, so the rest of the pipeline does not call the
    ``contractions`` package directly.
    """
    expanded = contractions.fix(text)
    return expanded


def spacy_lemmatize_text(text):
    """Replace every token of ``text`` with its lemma and join them with spaces.

    The text is parsed with the module-level ``nlp`` pipeline. A token whose
    lemma is the literal ``'-PRON-'`` keeps its surface form instead.
    NOTE(review): ``'-PRON-'`` is presumably the pronoun placeholder emitted by
    older spaCy releases - confirm against the installed version.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    When ``remove_digits`` is true, digits are deleted as well.

    NOTE(review): matched characters are removed outright, not replaced by a
    space, so words separated only by punctuation are fused together
    (e.g. "excellent...out" becomes "excellentout").
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)


def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Drop stopword tokens from ``text`` and join the remaining tokens with spaces.

    Parameters:
        text: String to filter; it is tokenised with ``nltk.word_tokenize``.
        is_lower_case: If true, tokens are compared to the stopwords as-is.
            Otherwise each token is lower-cased for the comparison only; kept
            tokens retain their original casing.
        stopwords: Iterable of words to remove. When omitted (or an empty
            container), NLTK's English stopword list is used.

    Returns:
        The filtered text as a single space-separated string.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    # Materialise the stopwords once as a set: membership tests become
    # constant-time instead of a list scan per token, and one-shot iterables
    # (e.g. generators) are no longer exhausted by the first lookup.
    stopword_set = set(stopwords)

    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(kept_tokens)

## Preliminary Data Pre-Processing Steps

In [27]:
def text_pre_processor(text, contraction_expansion=True,
                       text_lower_case=True, text_lemmatization=True,
                       special_char_removal=True, remove_digits=True, stopword_removal=True,
                       stopword_list=None):
    """Clean a single review string through a configurable sequence of steps.

    Steps run in this fixed order, each one only if its flag is true:
    lower-casing, contraction expansion, lemmatisation, special-character
    removal, stopword removal. Runs of spaces are then collapsed and the
    result is stripped.

    Parameters:
        text: Raw review text (a ``str``).
        contraction_expansion: Expand contractions via ``expand_contractions``.
        text_lower_case: Lower-case the text first.
        text_lemmatization: Lemmatise via ``spacy_lemmatize_text``.
        special_char_removal: Strip non-alphanumeric characters via
            ``remove_special_characters``.
        remove_digits: Passed to ``remove_special_characters``; only relevant
            when ``special_char_removal`` is true.
        stopword_removal: Remove stopwords via ``remove_stopwords``.
        stopword_list: Custom stopwords forwarded to ``remove_stopwords``;
            ``None`` lets that function fall back to its default list.

    Returns:
        The cleaned text.
    """
    # lowercase the text
    if text_lower_case:
        text = text.lower()

    # expand contractions
    if contraction_expansion:
        text = expand_contractions(text)

    # lemmatize text
    if text_lemmatization:
        text = spacy_lemmatize_text(text)

    # remove special characters and/or digits
    if special_char_removal:
        text = remove_special_characters(text, remove_digits=remove_digits)

    # remove stopwords
    if stopword_removal:
        # Forward the caller's list; it was previously accepted but never
        # used. is_lower_case is left at its default because lemmatisation
        # may re-introduce capitalised tokens after the lower-casing step.
        text = remove_stopwords(text, stopwords=stopword_list)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    return text.strip()

In [28]:
# Clean every review with the default pipeline settings. Each row goes through
# the spaCy pipeline inside text_pre_processor, so this cell is the slow one.
reviews['Processed_review'] = reviews['Review'].apply(text_pre_processor)

In [29]:
# Compare the raw and processed text side by side for the first five reviews.
reviews.head()

Unnamed: 0,Review,Processed_review
0,Just been for sunday roast lamb and beef truly...,sunday roast lamb beef truly excellentout coud...
1,"Quietly set off the main road, nice atmosphere...",quietly set main road nice atmosphere immacula...
2,I made a reservation for a birthday two days i...,make reservation birthday two day advance assu...
3,We visit here regularly and never fail to be i...,visit regularly never fail impress quality pre...
4,Visited this wonderful place on my travels and...,visit wonderful place travel complain anything...


In [30]:
# Save the raw and processed review columns to a CSV in the working directory
# (relative path); index=False leaves the row index out of the file.
reviews.to_csv("Raw_CleanedReviews.csv",index=False)