In [None]:
import numpy as np
import pandas as pd

Read the hotel reviews CSV into a pandas DataFrame

In [None]:
# Absolute path of the hotel-reviews CSV, named once so it is easy to change.
DATA_PATH = '/content/tripadvisor_hotel_reviews.csv'

# Parse the CSV into a DataFrame and preview its first rows.
data_frame = pd.read_csv(DATA_PATH)
data_frame.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data_frame.shape

(20491, 2)

In [5]:
def rating(rate):
    """Convert a numeric rating into a coarse sentiment label.

    Args:
        rate: A numeric rating value.

    Returns:
        "Positive" when 3 < rate <= 5, "Negative" when 0 < rate <= 3,
        and None for any value outside the (0, 5] range.
    """
    # Anything outside (0, 5] has no label.
    if not (0 < rate <= 5):
        return None
    return "Positive" if rate > 3 else "Negative"

# Label each review from its 'Rating' value and keep the labels in a new 'State' column.
state_labels = data_frame['Rating'].apply(rating)
data_frame['State'] = state_labels
data_frame.head()

Unnamed: 0,Review,Rating,State
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Negative
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive


In [6]:
# Re-check the dimensions now that the 'State' column has been added.
data_frame.shape

(20491, 3)

In [7]:
# Count how many reviews carry each 'State' label.
data_frame['State'].value_counts()

Positive    15093
Negative     5398
Name: State, dtype: int64

# **Sentiment Analysis**

In [8]:
import nltk
# Download the 'vader_lexicon' NLTK resource; the following cell builds a
# VADER SentimentIntensityAnalyzer.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# A single analyzer instance is reused for every review.
sentAnalyze = SentimentIntensityAnalyzer()

# Run polarity_scores on each review text and store whatever it returns
# in the 'Scores' column.
data_frame['Scores'] = data_frame['Review'].apply(sentAnalyze.polarity_scores)
data_frame.head()

Unnamed: 0,Review,Rating,State,Scores
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co..."
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com..."
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp..."
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com..."
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co..."


In [26]:
# Pull the 'compound' entry out of each per-review score mapping into its own column.
compound_scores = data_frame['Scores'].apply(lambda scores: scores['compound'])
data_frame['Compound'] = compound_scores
data_frame.head()

Unnamed: 0,Review,Rating,State,Scores,Compound
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797


In [27]:
# Re-check the dimensions after adding the 'Scores' and 'Compound' columns.
data_frame.shape

(20491, 5)

In [28]:
# Binarise the compound score: values >= 0 (including exactly 0) become
# 'Positive', everything else becomes 'Negative'.
is_non_negative = data_frame['Compound'] >= 0
data_frame['Compound State'] = is_non_negative.map({True: 'Positive', False: 'Negative'})
data_frame.head()

Unnamed: 0,Review,Rating,State,Scores,Compound,Compound State
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747,Positive
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787,Positive
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889,Positive
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797,Positive


In [29]:
# Re-check the dimensions after adding the 'Compound State' column.
data_frame.shape

(20491, 6)

Example: scoring a single sentence with the VADER analyzer

In [30]:
# A hand-written sentence used to inspect the analyzer's raw output.
Example='Hotel was nice and I enjoyed'
# Displays the full score mapping ('neg', 'neu', 'pos', 'compound') for the sentence.
sentAnalyze.polarity_scores(Example)

{'neg': 0.0, 'neu': 0.33, 'pos': 0.67, 'compound': 0.7269}

# **Text Classification**

In [31]:
# Features are the review texts; targets are the rating-derived 'State' labels.
X, Y = data_frame['Review'], data_frame['State']

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for evaluation.
# random_state pins the shuffle so the same split is produced on every run;
# stratify=Y keeps the Positive/Negative proportions equal in the train and
# test partitions, which matters because the two classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [34]:
# Two-stage text classifier: TF-IDF vectorisation feeding a linear SVM.
textclf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both stages on the training reviews and their labels.
textclf.fit(X_train, y_train)

In [35]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out reviews and print the accuracy score
# against the true labels.
pred = textclf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

0.8978529603122967


Testing the classifier on new, hand-written reviews

In [37]:
# Wrap the sentence in a list so it is passed to the pipeline as one document.
new_reveiw = ["Hotel was perfect ,i liked it"]
textclf.predict(new_reveiw)

array(['Positive'], dtype=object)

In [42]:
# Wrap the sentence in a list so it is passed to the pipeline as one document.
new_review2 = ["i hated it"]
textclf.predict(new_review2)

array(['Negative'], dtype=object)