In [1]:
import numpy as np
import pandas as pd
import re
import nltk
#nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer    
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
import pickle
from sklearn import svm

In [3]:
dataset = pd.read_csv('deceptive-opinion.csv')
X=dataset["text"]
y = np.where(dataset['deceptive']=='truthful', 1, 0)

In [4]:
#Cleaning the text
def cleaning(X):
    corpus=[]
    for i in range(0,len(X)):
        review = re.sub('[^a-zA-Z]',' ',X[i])
        review=review.lower()
        review=review.split()
        ps=PorterStemmer()
        review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review=' '.join(review)
        corpus.append(review)
    return corpus

In [5]:
corpus = cleaning(X)
cv=TfidfVectorizer()
cv.fit(corpus)
x=cv.transform(corpus).toarray()
le=LabelEncoder()
y=le.fit_transform(y)  

In [6]:
pickle.dump(cv, open("vectorizer.h5", 'wb'))

In [7]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

In [8]:
nb=GaussianNB()
nb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [9]:
rf=RandomForestClassifier(n_estimators=100,criterion='entropy')
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
svm = svm.SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
knn=KNeighborsClassifier().fit(X_train, y_train)
lg=LogisticRegression().fit(X_train, y_train)



In [12]:
#predition
pred_nb=nb.predict(X_test)
print('Accuracy score: {}'.format(accuracy_score(y_test, pred_nb)))
print('Precision score: {}'.format(precision_score(y_test, pred_nb)))
print('Recall score: {}'.format(recall_score(y_test, pred_nb)))
print('F1 score: {}'.format(f1_score(y_test, pred_nb)))

pred_rf=rf.predict(X_test)
print('\nAccuracy score: {}'.format(accuracy_score(y_test, pred_rf)))
print('Precision score: {}'.format(precision_score(y_test, pred_rf)))
print('Recall score: {}'.format(recall_score(y_test, pred_rf)))
print('F1 score: {}'.format(f1_score(y_test, pred_rf)))

pred_svm= svm.predict(X_test)
print('\nAccuracy score: {}'.format(accuracy_score(y_test, pred_svm)))
print('Precision score: {}'.format(precision_score(y_test, pred_svm)))
print('Recall score: {}'.format(recall_score(y_test, pred_svm)))
print('F1 score: {}'.format(f1_score(y_test, pred_svm)))


pred_lg= lg.predict(X_test)
print('\nAccuracy score: {}'.format(accuracy_score(y_test, pred_lg)))
print('Precision score: {}'.format(precision_score(y_test, pred_lg)))
print('Recall score: {}'.format(recall_score(y_test, pred_lg)))
print('F1 score: {}'.format(f1_score(y_test, pred_lg)))


pred_knn= knn.predict(X_test)
print('\nAccuracy score: {}'.format(accuracy_score(y_test, pred_knn)))
print('Precision score: {}'.format(precision_score(y_test, pred_knn)))
print('Recall score: {}'.format(recall_score(y_test, pred_knn)))
print('F1 score: {}'.format(f1_score(y_test, pred_knn)))

Accuracy score: 0.634375
Precision score: 0.6319444444444444
Recall score: 0.5870967741935483
F1 score: 0.608695652173913

Accuracy score: 0.871875
Precision score: 0.8518518518518519
Recall score: 0.8903225806451613
F1 score: 0.8706624605678234

Accuracy score: 0.875
Precision score: 0.8807947019867549
Recall score: 0.8580645161290322
F1 score: 0.869281045751634

Accuracy score: 0.875
Precision score: 0.8807947019867549
Recall score: 0.8580645161290322
F1 score: 0.869281045751634

Accuracy score: 0.9
Precision score: 0.9019607843137255
Recall score: 0.8903225806451613
F1 score: 0.8961038961038961

Accuracy score: 0.721875
Precision score: 0.775
Recall score: 0.6
F1 score: 0.6763636363636363


In [13]:
print(nb.predict(X_test[0:30]))
print(rf.predict(X_test[0:30]))
print(svm.predict(X_test[0:30]))
print(lg.predict(X_test[0:30]))
print(knn.predict(X_test[0:30]))

print("\n"+str(y_test[0:30]))

[1 1 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0]
[1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1]
[1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1]
[1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1]
[1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1]
[0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1]

[1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1]


In [14]:
print(nb.predict_proba(X_test[5:8]))
print(rf.predict_proba(X_test[5:8]))
print(svm.predict_proba(X_test[5:8]))
print(lg.predict_proba(X_test[5:8]))
print(knn.predict_proba(X_test[5:8]))

[[0. 1.]
 [0. 1.]
 [1. 0.]]
[[0.59 0.41]
 [0.3  0.7 ]
 [0.72 0.28]]
[[0.8929425  0.1070575 ]
 [0.01256114 0.98743886]
 [0.97183386 0.02816614]]
[[0.61030914 0.38969086]
 [0.23485585 0.76514415]
 [0.7481794  0.2518206 ]]
[[0.6 0.4]
 [0.8 0.2]
 [0.8 0.2]]


In [15]:
rf_model = 'rf.h5'
svm_model = 'svm.h5'
lg_model = 'lg.h5'
knn_model = 'knn.h5'
pickle.dump(rf, open(rf_model, 'wb'))
pickle.dump(svm, open(svm_model, 'wb'))
pickle.dump(lg, open(lg_model, 'wb'))
pickle.dump(knn, open(knn_model, 'wb'))


In [16]:
load_rf = pickle.load(open(rf_model, 'rb'))
load_svm = pickle.load(open(svm_model, 'rb'))
load_lg = pickle.load(open(lg_model, 'rb'))
load_knn = pickle.load(open(knn_model, 'rb'))

In [17]:
def preparing(X):
    review = re.sub('[^a-zA-Z]',' ',X)
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    review=[review]
    return review

In [18]:
fake_review="My family and I are huge fans of this place. The staff is super nice, and the food is great. The chicken is very good, and the garlic sauce is perfect. Ice cream topped with fruit is delicious too. Highly recommended!"

In [19]:
fake_review2="Well presented in green and compostable  take out containers. the food had texture, it was hot, it was a nice portion but was lacking in the most relevant, flavor. The soy based sauce didn't do it for either even when I doctored it with hot sauce.I was hoping for a Thai flavor experience with a hint of lemongrass and galanga. Nope, boiled chicken served with rice and a nice hot cup of chicken broth. The broth was the best part."

In [23]:
review_test = preparing(fake_review)
review_test = cv.transform(review_test).toarray()
print(load_rf.predict(review_test))
print(load_rf.predict_proba(review_test))
print(load_svm.predict(review_test))
print(load_svm.predict_proba(review_test))
print(load_lg.predict(review_test))
print(load_lg.predict_proba(review_test))
print(load_knn.predict(review_test))
print(load_knn.predict_proba(review_test))

[0]
[[0.52 0.48]]
[1]
[[0.05722872 0.94277128]]
[1]
[[0.31167623 0.68832377]]
[0]
[[0.8 0.2]]
