The previous experiment attempted to achieve high accuracy by using better features. These features were taken from a publication that claimed they worked. Since we could not replicate the results in the last experiment, here we will try using more data. We should be using a balanced dataset, now that we have x in the negative set we can reduce the positive set to the same number.

In [1]:
from protos import review_set_pb2, review_pb2
review_set = review_set_pb2.ReviewSet()
with open("data/yelpZip", 'rb') as f:
  review_set.ParseFromString(f.read())
print(len(review_set.reviews))

608598


In [4]:
from sklearn.utils import shuffle

fake_reviews = list(filter(lambda x: x.label, review_set.reviews))
counter_fake = len(fake_reviews)
genuine_reviews = []
unused_genuine_reviews = []
counter_genuine = 0
for review in shuffle(review_set.reviews):
  if review.label == True:
    continue
  if counter_genuine <= counter_fake:
    genuine_reviews.append(review)
    counter_genuine += 1
  else:
    unused_genuine_reviews.append(review)
  
concatted_reviews = fake_reviews + genuine_reviews
print("fake:", len(fake_reviews))
print("real:", len(genuine_reviews))
print("all:", len(concatted_reviews))
print("unused real:", len(unused_genuine_reviews))

fake: 80466
real: 80467
all: 160933
unused real: 447665


In [5]:
from exp2_feature_extraction import find_words
all_reviews = [(x, find_words(x.review_content)) for x in concatted_reviews]

In [6]:
from exp2_feature_extraction import structural_features
features_structural = [structural_features(x) for x in all_reviews]

In [7]:
############################
##### DUPLICATION CELL #####
############################

import nltk

def pos_features(words, pos_tagger):
  tag_map = {
    "CD":  0, "DT":  0, "EX":  0, "FW":   0, "IN":  0, "JJ":  0, "JJR": 0, "JJS": 0, "LS":   0,
    "MD":  0, "NN":  0, "NNP": 0, "NNPS": 0, "NNS": 0, "PDT": 0, "POS": 0, "PRP": 0, "PRP$": 0,
    "RB":  0, "RBR": 0, "RBS": 0, "RP":   0, "SYM": 0, "TO":  0, "UH":  0, "VB":  0, "VBD":  0,
    "VBG": 0, "VBN": 0, "VBP": 0, "VBZ":  0, "WDT": 0, "WP":  0, "WP$": 0, "WRB": 0, "CC":   0
  }
  tags = pos_tagger.pos_tag(words)
  total = 0
  for tag in tags:
    key = tag[1]
    if key in tag_map:
      tag_map[key] += 1
      total += 1
  if total == 0:
    return [0] * 36
  order = ["CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS",
           "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
           "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "CC"]
  return [tag_map[x]/total for x in order]

In [8]:
features_pos = [pos_features(x[1], nltk) for x in all_reviews]

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()
from exp2_feature_extraction import sentiment_features
features_sentiment = [sentiment_features(x[1], sentiment_analyzer) for x in all_reviews]



In [10]:
############################
##### DUPLICATION CELL #####
############################

from exp2_feature_extraction import preprocess_words
import gensim

def get_topic_features_maker(reviews):
  num_topics = 10
  preprocessed_words = [preprocess_words(x[1]) for x in reviews]
  
  dictionary = gensim.corpora.Dictionary(preprocessed_words)
  dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
  bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_words]
  lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=2)
    
  for index, topic in lda_model.show_topics(formatted=False, num_words=3):
    print('{}: {}'.format(index, [w[0] for w in topic]))
  
  def make_topic_features(review_words):
    topics = lda_model.get_document_topics(dictionary.doc2bow(preprocess_words(review_words)))
    return topic_features(topics, num_topics)
  return make_topic_features

topic_features_maker = get_topic_features_maker(all_reviews)

0: ['place', 'like', 'food']
1: ['pizza', 'best', 'slice']
2: ['chicken', 'sauc', 'good']
3: ['order', 'tabl', 'come']
4: ['dish', 'order', 'dessert']
5: ['brunch', 'coffe', 'egg']
6: ['food', 'great', 'place']
7: ['ramen', 'menu', 'nice']
8: ['burger', 'beer', 'drink']
9: ['good', 'place', 'food']


In [12]:
from exp2_feature_extraction import topic_features
features_topic = [topic_features_maker(x[1]) for x in all_reviews]

In [13]:
############################
##### DUPLICATION CELL #####
############################

from exp2_feature_extraction import find_capitalised_word_ratio
from exp2_feature_extraction import max_date_occurrences
import statistics
import functools

def reviewer_features(review, reviews_by_reviewer):
  reviews = reviews_by_reviewer[review.user_id]
  max_reviews_in_day = max_date_occurrences(reviews)
  average_review_length = functools.reduce(lambda total, review: total + len(review.review_content), reviews, 0) / len(reviews)
  ratings_stdev = 0 if len(reviews) == 1 else statistics.stdev([x.rating for x in reviews])
  return (max_reviews_in_day, average_review_length, ratings_stdev)

In [14]:
from exp2_feature_extraction import reviews_by_reviewer
reviews_reviewer_map = reviews_by_reviewer([x[0] for x in all_reviews])
features_reviewer = [reviewer_features(x[0], reviews_reviewer_map) for x in all_reviews]

In [15]:
from scipy.sparse import coo_matrix, hstack
predictor_features = hstack([coo_matrix(features_structural), coo_matrix(features_sentiment), coo_matrix(features_pos),
                             coo_matrix(features_topic), coo_matrix(features_reviewer)])

In [16]:
targets = [x[0].label for x in all_reviews]

In [17]:
from sklearn.naive_bayes import MultinomialNB
cnb = MultinomialNB()

In [18]:
cnb.fit(predictor_features, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
from sklearn.model_selection import cross_validate
cross_validate(cnb, predictor_features, targets, cv=10, return_train_score=False)

{'fit_time': array([0.10538673, 0.1095612 , 0.11662054, 0.11348391, 0.10610676,
        0.10550308, 0.10498762, 0.10460353, 0.10838938, 0.10805106]),
 'score_time': array([0.00495863, 0.00488496, 0.00506663, 0.00478959, 0.00477529,
        0.00490165, 0.00547576, 0.00493288, 0.00593281, 0.00481367]),
 'test_score': array([0.59108985, 0.5968684 , 0.594383  , 0.59587424, 0.60500808,
        0.58226668, 0.60939539, 0.60514541, 0.58861546, 0.59364902])}

In [None]:
# TODO, what's the point of having the num_intervals?
def print_num_fake_for_set(test_set, start, interval, num_intervals):
  for i in range(1, num_intervals):
    unused_reviews = [(x, find_words(x.review_content)) for x in test_set[start:start+interval]]

    unused_structural = [structural_features(x) for x in unused_reviews]

    unused_sentiment = [sentiment_features(x[1], sentiment_analyzer) for x in unused_reviews]

    unused_pos = [pos_features(x[1], nltk) for x in unused_reviews]

    unused_topic = [topic_features_maker(x[1]) for x in unused_reviews]

    unused_reviewer_map = reviews_by_reviewer([x[0] for x in unused_reviews])
    unused_reviewer = [reviewer_features(x[0], unused_reviewer_map) for x in unused_reviews]

    unused_features = hstack([coo_matrix(unused_structural), coo_matrix(unused_sentiment), coo_matrix(unused_pos),
                              coo_matrix(unused_topic), coo_matrix(unused_reviewer)])

    results = cnb.predict(unused_features)
    count = len([x for x in results if x])

    print(count, "out of", len(results))
    start+=interval
    
print("Fake reviews:")
print_num_fake_for_set(fake_reviews, 0, 1000, 5)
print("Genuine reviews:")
print_num_fake_for_set(genuine_reviews, 0, 1000, 5)
print("Unused genuine reviews:")
print_num_fake_for_set(unused_genuine_reviews, 0, 1000, 5)

Fake reviews:
799 out of 1000
708 out of 1000
674 out of 1000
712 out of 1000
Genuine reviews:
560 out of 1000
550 out of 1000
568 out of 1000
539 out of 1000
Unused genuine reviews:


As we can see the classifier much is more accurate at guessing fake reviews than genuine reviews. This is probably because it is biased towards choosing fake for any arbitrary review. Since we have already balanced our data set, we can help fix this situation by changing the weights of our model, so that they are more fair towards genuine reviews.