In [1]:
#Installing packages
!pip install sentence_transformers
!pip install scikit_plot
!pip install numpy pandas matplotlib



In [2]:
#Importing the basic libraries used throughout the notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the hotel-review dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("Hotels Dataset.csv")

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,label,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [5]:
# List the available column names.
df.columns

Index(['label', 'hotel', 'polarity', 'source', 'text'], dtype='object')

In [6]:
# Keep only the review text and its label. Take an explicit copy so that the
# columns added in later cells are written to an independent frame.
df=df[['text', 'label']].copy()

In [7]:
# Preview the reduced frame (text and label only).
df.head()

Unnamed: 0,text,label
0,We stayed for a one night getaway with family ...,truthful
1,Triple A rate with upgrade to view room was le...,truthful
2,This comes a little late as I'm finally catchi...,truthful
3,The Omni Chicago really delivers on all fronts...,truthful
4,I asked for a high floor away from the elevato...,truthful


In [8]:
# Count missing values per column.
df.isna().sum()

text     0
label    0
dtype: int64

In [9]:
# Summarise dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1600 non-null   object
 1   label   1600 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB


In [10]:
# Distinct class labels present in the data.
df['label'].unique()

array(['truthful', 'deceptive'], dtype=object)

In [11]:
# (number of rows, number of columns)
df.shape

(1600, 2)

In [12]:
# Class balance: number of reviews per label.
df['label'].value_counts()

label
truthful     800
deceptive    800
Name: count, dtype: int64

In [13]:
# Numeric code for each class label: truthful -> 0, deceptive -> 1.
label_map=dict(truthful=0, deceptive=1)

In [14]:
# Translate every string label to its numeric code (None for a label missing from label_map).
mapped_label=[label_map.get(label) for label in df['label']]

In [15]:
# Attach the numeric codes as a new column alongside the original string label.
df['mapped_label']=mapped_label

In [16]:
# Spot-check five random rows (no seed is passed, so the rows differ between runs).
df.sample(5)

Unnamed: 0,text,label,mapped_label
391,We booked this hotel for the second leg of our...,truthful,0
1100,I stay in Chicago all the time on business and...,truthful,0
570,The Affinia Manhattan is fantastic! My husband...,deceptive,1
1359,I received the type of room that I had reserve...,deceptive,1
783,"I have to admit, I was a little leery about vi...",deceptive,1


In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers with scikit-learn's LabelEncoder.
# The printed codes start with 1 for the leading (truthful) rows.
label_values=df['label']
encoder=LabelEncoder()
encoded_label=encoder.fit_transform(label_values)
print(encoded_label)

[1 1 1 ... 0 0 0]


In [18]:
# The encoder output above codes truthful as 1; flip the binary codes so that
# 0 = truthful and 1 = deceptive, the same convention as label_map.
df['encoded_label']=1-encoded_label

In [19]:
# Spot-check on random rows that encoded_label agrees with mapped_label.
df.sample(4)

Unnamed: 0,text,label,mapped_label,encoded_label
1056,We enjoyed the Hotel Monaco. Great location fo...,truthful,0,0
14,I got a Sunday night stay for only $50 off of ...,truthful,0,0
432,"If you're looking for luxuary, look no more. T...",deceptive,1,1
701,Last week I stayed at the Hilton Chicago for 4...,deceptive,1,1


In [20]:
#Separate the model input (review text) from the target (0/1 label)
X, y = df['text'], df['encoded_label']

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_trn, X_tst, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [23]:
from sentence_transformers import SentenceTransformer

In [24]:
#Load a pre-trained, transformer-based sentence-embedding model for feature extraction:
# BERT - Bidirectional Encoder Representations from Transformers

#Other text-representation techniques include GloVe, Word2Vec, TF-IDF, etc.,
#but this notebook uses a BERT-based Sentence-Transformers model.
# NOTE(review): this checkpoint is reportedly marked as deprecated on its model card - confirm before reuse.

bert_model=SentenceTransformer('bert-base-nli-mean-tokens')

In [25]:
#Encode the training reviews into sentence embeddings and persist them to disk
# NOTE(review): the output location is a hard-coded, user-specific absolute path.
train_texts=X_trn.tolist()
train_embeddings=bert_model.encode(train_texts)

np.save(r'/Users/beenish/Desktop/Project/xtrn.npy', train_embeddings)


In [26]:
#Encode the held-out test reviews into sentence embeddings and persist them to disk
# NOTE(review): the output location is a hard-coded, user-specific absolute path.
test_texts=X_tst.tolist()
test_embeddings=bert_model.encode(test_texts)

np.save(r'/Users/beenish/Desktop/Project/xtst.npy', test_embeddings)

In [27]:
# Reload the embeddings written by the two cells above. They are saved with
# np.save as ordinary arrays (assumed numeric, as returned by encode()), so
# pickle support stays disabled (NumPy's default): loading pickled data can
# execute arbitrary code.
X_train=np.load(r'/Users/beenish/Desktop/Project/xtrn.npy')
X_test=np.load(r'/Users/beenish/Desktop/Project/xtst.npy')

In [28]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel; other kernels can be chosen via `kernel=`.
svc_classifier=SVC(kernel="linear")


In [29]:
# Fit the classifier on the training embeddings and their 0/1 labels.
svc_classifier.fit(X_train,y_train)

In [30]:
# Predict a 0/1 label for every held-out test embedding.
pred=svc_classifier.predict(X_test)

In [31]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [32]:
from numpy import interp

In [33]:
# Test-set accuracy in percent. scikit-learn metrics take (y_true, y_pred) in
# that order; accuracy is symmetric, so the value is unchanged, but the
# conventional order avoids errors if this call is adapted for other metrics.
accuracy_score(y_test, pred)*100

83.75

In [34]:
import pickle

# Serialise the fitted classifier to the working directory for later reuse.
# Only the SVM is stored here; the sentence-embedding model is not part of the file.
with open('trained_fakereview_model.pkl', mode='wb') as model_file:
    pickle.dump(svc_classifier, model_file)

In [35]:
#First review check
# NOTE(review): this is a headphone review, while the classifier was fitted on
# hotel reviews, so the prediction is outside the training domain.

new_review=["The battery is long lasting and the case is sturdy, it has worked for me for almost a year now. The voice and sound quality is amazing, it's almost noise cancelling, and is worth the money for sure, it has a sleek finish and the buttons on the side are quite accessible. The mic quality although could've been improved as the voice isn't clear when we speak with the in-built mic of the headphones."]

# Embed the review with the same sentence-embedding model used for training.
new_review_embedding=bert_model.encode(new_review)

prediction=svc_classifier.predict(new_review_embedding)

# predict() returns one label per input; read the single result explicitly
# instead of relying on the truth value of an array comparison.
# Label convention: 0 = truthful, 1 = deceptive.
if prediction[0]==0:
    print("The review is 'Truthful'")
else:
    print("The review is 'Deceptive'")


The review is 'Deceptive'


In [36]:
#Second review check
# NOTE(review): this is a delivery/product review, while the classifier was
# fitted on hotel reviews, so the prediction is outside the training domain.

new_review=["Nice and fast delivery within the promise time. The highlight is open delivery so we can cross check while delivery the product and no Plastic bags for packing.."]

# Embed the review with the same sentence-embedding model used for training.
new_review_embedding=bert_model.encode(new_review)

prediction=svc_classifier.predict(new_review_embedding)

# predict() returns one label per input; read the single result explicitly
# instead of relying on the truth value of an array comparison.
# Label convention: 0 = truthful, 1 = deceptive.
if prediction[0]==0:
    print("The review is 'Truthful'")
else:
    # Spelling of the user-facing message corrected (was 'Decptive').
    print("The review is 'Deceptive'")


The review is 'Truthful'
