The previous experiment attempted to achieve high accuracy by using better features. These features were taken from a publication that claimed they worked. Since we could not replicate those results in the last experiment, here we will try using more data instead. We should also use a balanced dataset: now that the negative set is large enough, we can reduce the positive set to the same number of reviews (80,466 per class in the run below).

In [2]:
from protos import review_set_pb2, review_pb2
# Parse the serialized ReviewSet message stored at data/yelpZip and report
# how many reviews it contains.
review_set = review_set_pb2.ReviewSet()
with open("data/yelpZip", mode="rb") as proto_file:
  review_set.ParseFromString(proto_file.read())
print(len(review_set.reviews))

608598


In [14]:
from itertools import islice

# A truthy label marks a review as fake; every fake review is kept.
fake_reviews = [review for review in review_set.reviews if review.label]
counter_fake = len(fake_reviews)

# Balance the classes by keeping at most `counter_fake` genuine reviews.
# islice stops as soon as enough have been collected and also behaves
# sensibly when there are no fake reviews at all (it yields nothing).
# NOTE(review): this keeps the FIRST genuine reviews in file order rather
# than a random sample -- confirm that this ordering bias is acceptable.
genuine_reviews = list(islice(
  (review for review in review_set.reviews if not review.label),
  counter_fake,
))
counter_genuine = len(genuine_reviews)

concatted_reviews = fake_reviews + genuine_reviews
print("fake:", len(fake_reviews))
print("real:", len(genuine_reviews))
print("all:", len(concatted_reviews))

fake: 80466
real: 80466
all: 160932


In [None]:
from exp2_feature_extraction import find_words
from sklearn.utils import shuffle
# Shuffle the balanced set, then pair every review with the result of
# find_words on its text so later cells can reuse it.
# NOTE(review): shuffle is called without a random_state, so the order
# differs between runs.
all_reviews = [
  (review, find_words(review.review_content))
  for review in shuffle(concatted_reviews)
]

In [None]:
from exp2_feature_extraction import structural_features
# One structural feature vector per entry of all_reviews.
# NOTE(review): each entry passed here is the whole (review, words) pair,
# whereas the other feature cells pass a single element of the pair --
# confirm that structural_features expects the pair.
features_structural = list(map(structural_features, all_reviews))

In [None]:
############################
##### DUPLICATION CELL #####
############################

import nltk

def pos_features(words, pos_tagger):
  """Return the relative frequency of each tracked part-of-speech tag.

  Args:
    words: Tokens, passed unchanged to ``pos_tagger.pos_tag``.
    pos_tagger: Object exposing ``pos_tag(words)`` whose result is an
      iterable of items with the tag at index 1.

  Returns:
    A list with one entry per tracked tag, in the fixed order of
    ``tracked_tags`` below (36 entries). Each entry is that tag's count
    divided by the number of tokens carrying any tracked tag. When no
    token carries a tracked tag, the list holds 36 integer zeros.
  """
  # Single source of truth for both the set of counted tags and the
  # order of the output vector.
  tracked_tags = (
    "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS",
    "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "CC",
  )
  counts = dict.fromkeys(tracked_tags, 0)
  for tagged_token in pos_tagger.pos_tag(words):
    label = tagged_token[1]
    # Tags outside the tracked set are ignored entirely.
    if label in counts:
      counts[label] += 1
  matched = sum(counts.values())
  if matched == 0:
    # Avoid dividing by zero when nothing was counted.
    return [0] * len(tracked_tags)
  return [counts[tag_name] / matched for tag_name in tracked_tags]

In [None]:
# Tag-frequency vector for every review; the nltk module itself serves as
# the tagger because pos_features only calls its pos_tag function.
features_pos = [pos_features(words, nltk) for _review, words in all_reviews]

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from exp2_feature_extraction import sentiment_features

# A single analyzer instance is shared by every call below.
sentiment_analyzer = SentimentIntensityAnalyzer()

# Sentiment features are computed from the second element of each
# all_reviews entry. (The closing bracket was missing here, which made the
# cell a SyntaxError.)
features_sentiment = [sentiment_features(x[1], sentiment_analyzer) for x in all_reviews]

In [None]:
############################
##### DUPLICATION CELL #####
############################

import gensim

def get_topic_features_maker(reviews, *, num_topics=10, passes=2, random_state=None):
  """Train an LDA topic model on ``reviews`` and return a featurizer.

  Relies on ``preprocess_words`` and ``topic_features`` being available in
  the notebook namespace (they are not defined or imported in this cell).

  Args:
    reviews: Iterable of entries whose element ``[1]`` holds the words of
      one review.
    num_topics: Number of LDA topics to fit; also passed to
      ``topic_features`` for every review.
    passes: Number of training passes given to the LDA model.
    random_state: Forwarded to gensim's ``LdaModel``. The default ``None``
      keeps gensim's own seeding; pass an int for repeatable topics.

  Returns:
    A function that takes the words of one review and returns
    ``topic_features(topics, num_topics)`` for the topics the trained
    model assigns to that review.
  """
  preprocessed_words = [preprocess_words(x[1]) for x in reviews]

  dictionary = gensim.corpora.Dictionary(preprocessed_words)
  # Prune the vocabulary: drop tokens in fewer than 15 documents or in more
  # than half of them, and keep at most 100000 tokens.
  dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
  bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_words]
  lda_model = gensim.models.ldamodel.LdaModel(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    random_state=random_state,
  )

  # Print the three top words of each topic as a quick sanity check.
  for index, topic in lda_model.show_topics(formatted=False, num_words=3):
    print('{}: {}'.format(index, [w[0] for w in topic]))

  def make_topic_features(review_words):
    """Return the topic feature vector for one review's words."""
    topics = lda_model.get_document_topics(dictionary.doc2bow(preprocess_words(review_words)))
    return topic_features(topics, num_topics)
  return make_topic_features

# Build the topic model once from all reviews; the returned callable is
# then applied to each review's words.
topic_features_maker = get_topic_features_maker(all_reviews)

In [None]:
# Topic feature vector for every review, computed from the words stored as
# the second element of each all_reviews entry.
features_topic = [topic_features_maker(words) for _review, words in all_reviews]

In [None]:
############################
##### DUPLICATION CELL #####
############################

from exp2_feature_extraction import find_capitalised_word_ratio
from exp2_feature_extraction import max_date_occurrences
import statistics
import functools

def reviewer_features(review, reviews_by_reviewer):
  """Compute features describing the author of ``review``.

  Args:
    review: Review whose ``user_id`` selects the author's reviews.
    reviews_by_reviewer: Mapping indexed by user id that yields the
      collection of reviews written by that user.

  Returns:
    A tuple ``(max_reviews_in_day, average_review_length, ratings_stdev)``:
    the value of ``max_date_occurrences`` over the author's reviews
    (presumably the largest number of reviews sharing one date -- confirm
    against its definition), the mean length of ``review_content``, and
    the sample standard deviation of the ratings (0 when the author has
    exactly one review).

  Raises:
    KeyError: If ``review.user_id`` is missing from a dict-like
      ``reviews_by_reviewer``.
  """
  authored = reviews_by_reviewer[review.user_id]
  busiest_day_count = max_date_occurrences(authored)
  review_count = len(authored)
  mean_length = sum(len(item.review_content) for item in authored) / review_count
  if review_count == 1:
    # statistics.stdev needs at least two data points.
    rating_spread = 0
  else:
    rating_spread = statistics.stdev([item.rating for item in authored])
  return (busiest_day_count, mean_length, rating_spread)

In [None]:
from exp2_feature_extraction import reviews_by_reviewer
# Group the review objects (first element of each all_reviews entry) via
# reviews_by_reviewer, then compute the reviewer features for every review.
reviews_reviewer_map = reviews_by_reviewer([review for review, _words in all_reviews])
features_reviewer = [
  reviewer_features(review, reviews_reviewer_map)
  for review, _words in all_reviews
]