**Save the processed data in .csv format; otherwise a memory error occurs during model building.**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Libraries for Text data
import spacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the hotel reviews CSV and preview the first rows
data = pd.read_csv(filepath_or_buffer='hotel_reviews.csv')
data.head(n=5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


# Text Preprocessing

In [4]:
# Work on an independent (deep) copy so the loaded frame is not modified
data1 = data.copy(deep=True)
data1.head(n=5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
# Number of whitespace-separated tokens in each review
data1['word_len_review'] = data1['Review'].map(lambda review: len(review.split()))

# Number of characters in each review
data1['string_len_review'] = data1['Review'].map(len)

# Normalise the text: lower-case it, turn every character outside a-z into a
# space, then squeeze runs of spaces down to a single space.
data1['cleaned'] = (
    data1['Review']
    .map(lambda review: review.lower())
    .map(lambda review: re.sub('[^a-z]', ' ', review))
    .map(lambda review: re.sub(' +', ' ', review))
)
data1.head()

Unnamed: 0,Review,Rating,word_len_review,string_len_review,cleaned
0,nice hotel expensive parking got good deal sta...,4,87,593,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,250,1689,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,217,1427,nice rooms not experience hotel monaco seattle...
3,"unique, great stay, wonderful time hotel monac...",5,89,600,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,191,1281,great stay great stay went seahawk game awesom...


In [6]:
def lemma(text):
    """Lemmatize a review and drop English stop words.

    The text is tokenized with NLTK's ``word_tokenize``; tokens found in
    NLTK's English stop-word list are discarded and the remaining tokens are
    passed through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        Review text to process.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces. Empty input yields an
        empty string (the previous implementation returned ``None`` because
        its loop body never ran).
    """
    wordnet = WordNetLemmatizer()
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return ' '.join(wordnet.lemmatize(word) for word in tokens if word not in stop_words)

In [7]:
# Import from the public ``tqdm.notebook`` module; ``tqdm._tqdm_notebook`` is a
# private, deprecated alias of it.
from tqdm.notebook import tqdm_notebook

# Register the ``progress_apply`` family of methods on pandas objects.
tqdm_notebook.pandas()

In [8]:
# Lemmatize every cleaned review, showing a progress bar while it runs
data1['lemmatization'] = data1['cleaned'].progress_apply(lemma)
data1.head()

  0%|          | 0/20491 [00:00<?, ?it/s]

Unnamed: 0,Review,Rating,word_len_review,string_len_review,cleaned,lemmatization
0,nice hotel expensive parking got good deal sta...,4,87,593,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,250,1689,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,217,1427,nice rooms not experience hotel monaco seattle...,nice room experience hotel monaco seattle good...
3,"unique, great stay, wonderful time hotel monac...",5,89,600,unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,191,1281,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...


In [9]:
# Star ratings grouped by the sentiment class they stand for
pos = [5, 4]
neg = [1, 2]
neu = [3]


def sentiment(rating):
    """Return "positive", "negative" or "neutral" for a rating, else None."""
    for bucket, name in ((pos, "positive"), (neg, "negative"), (neu, "neutral")):
        if rating in bucket:
            return name
    return None


def label(rating):
    """Return 1, -1 or 0 for a positive, negative or neutral rating, else None."""
    for bucket, code in ((pos, 1), (neg, -1), (neu, 0)):
        if rating in bucket:
            return code
    return None

# Derive the sentiment class of each review from its star rating
data1['Sentiment'] = data1['Rating'].map(sentiment)
data1.head()

Unnamed: 0,Review,Rating,word_len_review,string_len_review,cleaned,lemmatization,Sentiment
0,nice hotel expensive parking got good deal sta...,4,87,593,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,positive
1,ok nothing special charge diamond member hilto...,2,250,1689,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,negative
2,nice rooms not 4* experience hotel monaco seat...,3,217,1427,nice rooms not experience hotel monaco seattle...,nice room experience hotel monaco seattle good...,neutral
3,"unique, great stay, wonderful time hotel monac...",5,89,600,unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...,positive
4,"great stay great stay, went seahawk game aweso...",5,191,1281,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...,positive


In [10]:
# Number of reviews in each sentiment class, most frequent first
data1['Sentiment'].value_counts(sort=True, ascending=False)

positive    15093
negative     3214
neutral      2184
Name: Sentiment, dtype: int64

In [11]:
# Encode the sentiment class as an integer: positive -> 1, negative -> -1, neutral -> 0
data1['label'] = data1['Sentiment'].map(
    {
        'positive': 1,
        'negative': -1,
        'neutral': 0,
    }
)
data1.head()

Unnamed: 0,Review,Rating,word_len_review,string_len_review,cleaned,lemmatization,Sentiment,label
0,nice hotel expensive parking got good deal sta...,4,87,593,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,positive,1
1,ok nothing special charge diamond member hilto...,2,250,1689,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,negative,-1
2,nice rooms not 4* experience hotel monaco seat...,3,217,1427,nice rooms not experience hotel monaco seattle...,nice room experience hotel monaco seattle good...,neutral,0
3,"unique, great stay, wonderful time hotel monac...",5,89,600,unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...,positive,1
4,"great stay great stay, went seahawk game aweso...",5,191,1281,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...,positive,1


In [12]:
# Write the preprocessed review frame to disk as CSV
data1.to_csv(path_or_buf='hotel_review_final.csv')

# Convert Text Data Into Numerical Data

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Build TF-IDF features over the lemmatized reviews, keeping at most 5000 terms.
tv = TfidfVectorizer(max_features=5000)
X = tv.fit_transform(data1['lemmatization']).toarray()

# ``get_feature_names`` was removed in scikit-learn 1.2; prefer its replacement
# ``get_feature_names_out`` and fall back only on releases that predate it.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = tv.get_feature_names_out()
else:
    feature_names = tv.get_feature_names()

X = pd.DataFrame(X, columns=feature_names)
X

Unnamed: 0,aaa,abc,ability,abit,able,abroad,absolute,absolutely,absolutley,absolutly,...,young,younger,yr,yuck,yum,yummy,yunque,zero,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.140137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04139,0.0,0.0,...,0.025268,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Put the label and the two length features in front of the TF-IDF columns
final_data = pd.concat(
    objs=[data1[['label', 'word_len_review', 'string_len_review']], X],
    axis='columns',
)
final_data

Unnamed: 0,label,word_len_review,string_len_review,aaa,abc,ability,abit,able,abroad,absolute,...,young,younger,yr,yuck,yum,yummy,yunque,zero,zone,zoo
0,1,87,593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1,250,1689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,217,1427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,89,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,191,1281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,1,109,733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.140137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20487,1,39,306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20488,-1,63,443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20489,-1,781,5557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025268,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Write the combined feature table to disk as CSV
final_data.to_csv(path_or_buf='model_building_dataset.csv')

In [20]:
from sklearn.preprocessing import MinMaxScaler

# Scaler that maps each feature onto the range 0 to 1
minmax = MinMaxScaler(feature_range=(0, 1))

In [21]:
# Scale the two review-length features into the range 0 to 1.
# A dedicated scaler object is used (instead of rebinding ``minmax`` from the
# scaler to its own output) so the fitted scaler stays available and this cell
# can be re-run without raising AttributeError.
length_columns = ['word_len_review', 'string_len_review']
length_scaler = MinMaxScaler(feature_range=(0, 1))
minmax = pd.DataFrame(
    length_scaler.fit_transform(data1[length_columns]),
    columns=length_columns,
    index=data1.index,  # keep rows aligned with data1 for the later concat
)
minmax

Unnamed: 0,word_len_review,string_len_review
0,0.041580,0.040797
1,0.126299,0.122241
2,0.109148,0.102772
3,0.042620,0.041317
4,0.095634,0.091922
...,...,...
20486,0.053015,0.051200
20487,0.016632,0.019469
20488,0.029106,0.029650
20489,0.402287,0.409675


In [22]:
# Combine the label, the scaled length features and the TF-IDF columns
final_data = pd.concat(
    objs=[data1['label'], minmax, X],
    axis='columns',
)
final_data

Unnamed: 0,label,word_len_review,string_len_review,aaa,abc,ability,abit,able,abroad,absolute,...,young,younger,yr,yuck,yum,yummy,yunque,zero,zone,zoo
0,1,0.041580,0.040797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1,0.126299,0.122241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.109148,0.102772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.042620,0.041317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.095634,0.091922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,1,0.053015,0.051200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.140137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20487,1,0.016632,0.019469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20488,-1,0.029106,0.029650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20489,-1,0.402287,0.409675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025268,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Write the scaled feature table to disk as CSV
final_data.to_csv(path_or_buf='model_building_dataset1.csv')