Using the hotel reviews dataset, create a sentiment analysis model using at least one of the methods described this week (you’re welcome to create more than one). Be sure to have three data slices - train, validation, and test as specified in the text. 

In [2]:
import pandas as pd
review_dataset = pd.read_csv(r"C:\Users\bharo\Downloads\hotel-reviews.csv\hotel-reviews.csv")
review_dataset

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
...,...,...,...,...,...
38927,id49253,We arrived late at night and walked in to a ch...,Edge,Desktop,happy
38928,id49254,The only positive impression is location and p...,InternetExplorer,Mobile,not happy
38929,id49255,Traveling with friends for shopping and a show...,Firefox,Mobile,not happy
38930,id49256,The experience was just ok. We paid extra for ...,Chrome,Desktop,not happy


In [3]:
# removing  columns
df = review_dataset.drop(columns = ['User_ID','Browser_Used','Device_Used'])
df


Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy
...,...,...
38927,We arrived late at night and walked in to a ch...,happy
38928,The only positive impression is location and p...,not happy
38929,Traveling with friends for shopping and a show...,not happy
38930,The experience was just ok. We paid extra for ...,not happy


In [7]:
# cleaning the data
import string
import re
def text_clean(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub('[%s]'% re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('\n','',text)
    
    return text

cleaned = lambda x: text_clean(x)

df['cleaned_description'] = pd.DataFrame(df.Description.apply(cleaned))
df

                  

Unnamed: 0,Description,Is_Response,cleaned_description
0,The room was kind of clean but had a VERY stro...,not happy,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,not happy,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,not happy,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,happy,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,not happy,my girlfriends and i stayed here to celebrate ...
...,...,...,...
38927,We arrived late at night and walked in to a ch...,happy,we arrived late at night and walked in to a ch...
38928,The only positive impression is location and p...,not happy,the only positive impression is location and p...
38929,Traveling with friends for shopping and a show...,not happy,traveling with friends for shopping and a show...
38930,The experience was just ok. We paid extra for ...,not happy,the experience was just ok we paid extra for a...


In [21]:
# training the model
from sklearn.model_selection import train_test_split

x = df.cleaned_description
y = df.Is_Response

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.1, random_state = 225)

print('x_train:', len(x_train))
print('y_train:', len(y_train))
print('x_test:', len(x_test))
print('y_test:', len(y_test))


x_train: 35038
y_train: 35038
x_test: 3894
y_test: 3894


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics


tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs", max_iter=1000 )
model = Pipeline ([('vectorizer',tvec),('classifier',clf2)]) # creating a pipeline
model.fit(x_train,y_train)


prediction = model.predict(x_test)
metrics.confusion_matrix(prediction, y_test)

array([[2421,  297],
       [ 150, 1026]], dtype=int64)

In [36]:
# model prediction
from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Accuracy : ',accuracy_score(prediction, y_test))
print('Precision : ',precision_score(prediction, y_test, average ='weighted'))
print('Recall : ',recall_score(prediction, y_test, average = 'weighted'))

Accuracy :  0.8852080123266564
Precision :  0.8914801157152283
Recall :  0.8852080123266564


In [40]:
# testing the model with new review
example = ['it was good']
result = model.predict(example)
print(result)

['happy']
