In [17]:
# %load helpers.py
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.sql.functions import current_date, expr, datediff, to_date
from pyspark.sql.functions import length, regexp_replace

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import re


def get_kv_pairs(row, exclusions=[]):
    # get the text from the row entry
    text = str(row.review_body).lower()
    # create blacklist of words
    blacklist = set(stopwords.words('english'))
    # add explicit words
    [blacklist.add(i) for i in exclusions]
    # extract all words
    words = re.findall(r'([^\w+])', text)
    # for each word, send back a count of 1
    # send a list of lists
    return [[w, 1] for w in words if w not in blacklist]


def get_word_counts(texts, exclusions=[]):
    mapped_rdd = texts.rdd.flatMap(lambda row: get_kv_pairs(row, exclusions))
    counts_rdd = mapped_rdd.reduceByKey(lambda a, b: a + b).sortByKey(True, 1)
    return counts_rdd.collect()


def convert_str_to_int(df, col='verified_purchase', type_='int'):
    return df.select((df[col] == 'Y').cast(type_))


def get_review_age(df):
    return df.select(datediff(current_date(), to_date(df['review_date'])))


def prepare_features(df):
    df = df.withColumn('exclam', length('review_body') - length(regexp_replace('review_body', '\!', '')))
    df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(['star_rating', 'helfulness', 'age', 'review_length', 'label']).na.fill(0)
    return select_cols


def split_data(df, rate=.9):
    training = df.sampleBy("label", fractions={0: rate, 1: rate}, seed=12)
    return training, df.subtract(training)


def get_auc_roc(classifier, training, test):
    model = classifier.fit(training)
    out = model.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda x: (float(x[0]), float(x[1])))
    metrics = MulticlassMetrics(out)
#     print("Model: {1}. Area under ROC: {0:2f}".format(metrics.areaUnderROC, clf.__class__))
    return model, out, metrics


def get_vectorized_features(df, cols=['star_rating']):
    va = VectorAssembler().setInputCols(cols).setOutputCol(
        'features')
    return va.transform(df)

In [2]:
# %load environment.py
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

DATA_FILE = '../../amazon_reviews_us_Camera_v1_00.tsv.gz'
APP_NAME = 'Prediction'
FEATURES = ['star_rating', 'review_body', 'helpful_votes', 'total_votes', 'verified_purchase', 'review_date']
SAMPLE_SIZE = 10000

review_schema = StructType(
    [StructField('marketplace', StringType(), True),
     StructField('customer_id', StringType(), True),
     StructField('review_id', StringType(), True),
     StructField('product_id', StringType(), True),
     StructField('product_parent', StringType(), True),
     StructField('product_title', StringType(), True),
     StructField('product_category', StringType(), True),
     StructField('star_rating', IntegerType(), True),
     StructField('helpful_votes', IntegerType(), True),
     StructField('total_votes', IntegerType(), True),
     StructField('vine', StringType(), True),
     StructField('verified_purchase', StringType(), True),
     StructField('review_headline', StringType(), True),
     StructField('review_body', StringType(), True),
     StructField('review_date', StringType(), True)])

spark = (ps.sql.SparkSession.builder
         .master("local[1]")
         .appName(APP_NAME)
         .getOrCreate()
         )
sc = spark.sparkContext

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .schema(review_schema) \
    .load(DATA_FILE)

review_all = df.select(FEATURES)
review_sample = df.select(FEATURES).limit(SAMPLE_SIZE).cache()


In [3]:
classifiers = [LogisticRegression(), NaiveBayes(), DecisionTreeClassifier(), RandomForestClassifier(),
                   GBTClassifier()]
results = []

# 10000 sample dataset

In [4]:
select_cols = prepare_features(review_sample)
features = get_vectorized_features(select_cols, cols=['star_rating', 'helfulness', 'age', 'review_length'])
training = features.sampleBy("label", fractions={0: 0.92, 1: 0.08}, seed=12)
training.groupBy("label").count().orderBy("label").show()
test = features.subtract(training)

+-----+-----+
|label|count|
+-----+-----+
|    0|  739|
|    1|  731|
+-----+-----+



In [18]:
for clf in classifiers:
    model, out, metrics = get_auc_roc(clf, training, test)
    results.append([model, out, metrics])

In [23]:
for m,o,metrics in results[-5:]:
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Weighted recall = 0.689262147570486
Weighted precision = 0.9792100730429228
Weighted F(1) Score = 0.8034419158790111
Weighted F(0.5) Score = 0.9003234429759306
Weighted false positive rate = 0.3564840961755211
Weighted recall = 0.7790441911617676
Weighted precision = 0.9791545542977921
Weighted F(1) Score = 0.8633698529601728
Weighted F(0.5) Score = 0.9292102146390998
Weighted false positive rate = 0.4115412216105321
Weighted recall = 0.5708858228354329
Weighted precision = 0.9803966893858123
Weighted F(1) Score = 0.7136342816579638
Weighted F(0.5) Score = 0.8527631351071943
Weighted false positive rate = 0.24550122973638028
Weighted recall = 0.5390921815636872
Weighted precision = 0.9807625520840806
Weighted F(1) Score = 0.6870836834289293
Weighted F(0.5) Score = 0.8374479538179295
Weighted false positive rate = 0.21778673786927583
Weighted recall = 0.5520895820835833
Weighted precision = 0.9805409767026941
Weighted F(1) Score = 0.6980855737531481
Weighted F(0.5) Score = 0.84383874876

In [32]:
m,o,metrics = results[1]
ys=list(zip(*o.collect()))

In [34]:
import scikitplot as skplt
import matplotlib.pyplot as plt

y_true = ys[0]# ground truth labels
y_probas = ys[1]# predicted probabilities generated by sklearn classifier
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()



IndexError: too many indices for array

# entire dataset

In [11]:
select_cols = prepare_features(review_all)
features = get_vectorized_features(select_cols, cols=['star_rating', 'helfulness', 'age', 'review_length'])

In [12]:
features.groupBy("label").count().orderBy("label").show()

+-----+-------+
|label|  count|
+-----+-------+
|    0| 307573|
|    1|1494401|
+-----+-------+



In [22]:
training = features.sampleBy("label", fractions={0: 0.80, 1: 0.165}, seed=24)
training.groupBy("label").count().orderBy("label").show()
test = features.subtract(training)

+-----+------+
|label| count|
+-----+------+
|    0|246383|
|    1|246651|
+-----+------+



In [23]:
for clf in classifiers:
    model, out, metrics = get_auc_roc(clf, training, test)
    results.append([model, out, metrics])

Model: <class 'pyspark.ml.classification.LogisticRegression'>. Area under ROC: 0.683608
Model: <class 'pyspark.ml.classification.NaiveBayes'>. Area under ROC: 0.555252
Model: <class 'pyspark.ml.classification.DecisionTreeClassifier'>. Area under ROC: 0.704910
Model: <class 'pyspark.ml.classification.RandomForestClassifier'>. Area under ROC: 0.700429
Model: <class 'pyspark.ml.classification.GBTClassifier'>. Area under ROC: 0.710810


In [37]:
df.columns

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']