In [7]:
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import length, current_date, expr, datediff, regexp_replace, months_between, to_date

# --- Notebook configuration ---------------------------------------------
# Path (relative to this notebook) of the gzipped, tab-separated review file.
DATA_FILE = '../../amazon_reviews_us_Camera_v1_00.tsv.gz'

# Application name handed to the Spark session builder.
APP_NAME = 'NLP'

# Columns projected out of the full review table for analysis.
FEATURES = [
    'star_rating',
    'review_body',
    'helpful_votes',
    'total_votes',
    'verified_purchase',
    'review_date',
]

# Maximum number of rows kept in the cached sample frame.
SAMPLE_SIZE = 1000

# Explicit schema for the review file: every column is nullable, the three
# rating/vote columns are integers and all other columns are read as strings.
review_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('marketplace', StringType),
        ('customer_id', StringType),
        ('review_id', StringType),
        ('product_id', StringType),
        ('product_parent', StringType),
        ('product_title', StringType),
        ('product_category', StringType),
        ('star_rating', IntegerType),
        ('helpful_votes', IntegerType),
        ('total_votes', IntegerType),
        ('vine', StringType),
        ('verified_purchase', StringType),
        ('review_headline', StringType),
        ('review_body', StringType),
        ('review_date', StringType),
    )
])

# Create (or reuse) the Spark session for this notebook on the "local[1]" master.
spark = ps.sql.SparkSession.builder.master("local[1]").appName(APP_NAME).getOrCreate()

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

# Read the tab-separated file (first line is a header) with the explicit schema.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", "\t")
    .schema(review_schema)
    .load(DATA_FILE)
)

# Register the full table so it can be queried through Spark SQL.
df.createOrReplaceTempView("eda_sql_view")

# Projection of the analysis columns, plus a small cached sample of that projection.
review_all = df.select(FEATURES)
review_sample = review_all.limit(SAMPLE_SIZE).cache()


In [8]:
# %load helpers.py
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql.functions import current_date, expr, datediff, to_date
from pyspark.sql.functions import length, regexp_replace

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import re


def get_kv_pairs(row, exclusions=(), pat=r'([^\w+])'):
    """Turn one review row into ``[token, 1]`` pairs for a word count.

    Args:
        row: Object exposing a ``review_body`` attribute.
        exclusions: Extra tokens to drop in addition to NLTK's English
            stop words.
        pat: Regular expression passed to ``re.findall``; every match is one
            token.  The default matches single characters that are neither
            word characters nor ``+``; pass e.g. ``'([a-z]+)'`` to get words.

    Returns:
        A list of ``[token, 1]`` lists, one per match that is not
        blacklisted.  A row whose ``review_body`` is ``None`` yields ``[]``.
    """
    body = row.review_body
    if body is None:
        # A missing body must not be stringified into the token "none".
        return []
    text = str(body).lower()
    # Stop words plus caller-supplied exclusions, built without mutating inputs.
    blacklist = set(stopwords.words('english')).union(exclusions)
    # Emit a count of 1 per surviving token; the caller sums them by key.
    return [[token, 1] for token in re.findall(pat, text) if token not in blacklist]


def get_word_counts(texts, exclusions=(), pat=r'([^\w+])'):
    """Count token occurrences over every row of ``texts``.

    Args:
        texts: Object exposing an ``rdd`` of rows; each row is passed to
            ``get_kv_pairs``.
        exclusions: Extra tokens to drop, forwarded to ``get_kv_pairs``.
        pat: Token regular expression, forwarded to ``get_kv_pairs``.

    Returns:
        The collected ``(token, count)`` pairs sorted by ascending count.
    """
    # One [token, 1] pair per occurrence across all rows.
    token_pairs = texts.rdd.flatMap(lambda row: get_kv_pairs(row, exclusions, pat))
    # Sum the ones per token.
    totals = token_pairs.reduceByKey(lambda left, right: left + right)
    # Ascending by count; callers reverse the list to see the top tokens first.
    return totals.sortBy(lambda pair: pair[1]).collect()


def convert_str_to_int(df, col='verified_purchase', type_='int'):
    """Select a single column that flags where ``df[col]`` equals ``'Y'``.

    The equality test is cast to ``type_`` before being selected.
    """
    is_yes = df[col] == 'Y'
    flag = is_yes.cast(type_)
    return df.select(flag)


def get_review_age(df):
    """Select the day difference between the current date and ``review_date``."""
    today = current_date()
    reviewed_on = to_date(df['review_date'])
    age_in_days = datediff(today, reviewed_on)
    return df.select(age_in_days)


def prepare_features(df):
    """Derive the modelling columns from the raw review columns.

    Adds these columns and returns only them (plus ``star_rating``), with
    nulls replaced by 0:

    * ``exclam`` - number of ``!`` characters in ``review_body``.
    * ``age`` - days between the current date and ``review_date``.
    * ``review_length`` - character length of ``review_body``.
    * ``helfulness`` - ``helpful_votes / total_votes`` (the spelling is kept
      because the column name is part of the output schema).
    * ``label`` - 1 when ``verified_purchase`` is ``'Y'``, else 0.

    Args:
        df: DataFrame with ``review_body``, ``review_date``,
            ``helpful_votes``, ``total_votes``, ``verified_purchase`` and
            ``star_rating`` columns.

    Returns:
        DataFrame with columns ``star_rating``, ``helfulness``, ``age``,
        ``review_length``, ``exclam``, ``label``.
    """
    # Exclamation count = characters lost when every '!' is removed.
    # The raw string keeps the backslash without an invalid escape in source.
    df = df.withColumn('exclam', length('review_body') - length(regexp_replace('review_body', r'\!', '')))
    df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    # 'exclam' is included so the feature computed above is actually returned;
    # 'label' stays the last column.
    feature_cols = ['star_rating', 'helfulness', 'age', 'review_length', 'exclam', 'label']
    select_cols = df.select(feature_cols).na.fill(0)
    return select_cols


def split_data(df, rate=.9):
    """Randomly partition ``df`` into a training and a test DataFrame.

    Every input row lands in exactly one of the two outputs.  The split is
    done with ``DataFrame.randomSplit`` rather than sampling and then calling
    ``subtract``: ``subtract`` is a set difference, so it would de-duplicate
    the test rows and drop any test row that is identical to a training row.

    Args:
        df: DataFrame to split.
        rate: Approximate fraction of rows assigned to the training set;
            the remainder goes to the test set.

    Returns:
        A ``(training, test)`` tuple of DataFrames.
    """
    # Fixed seed (same value as before) keeps the split reproducible.
    training, test = df.randomSplit([rate, 1.0 - rate], seed=12)
    return training, test


def get_auc_roc(classifier, training, test):
    """Fit ``classifier`` on ``training`` and print its area under ROC on ``test``.

    Args:
        classifier: Estimator exposing ``fit``; the fitted model must expose
            ``transform`` and produce ``prediction`` and ``label`` columns.
        training: Data passed to ``classifier.fit``.
        test: Data passed to the fitted model's ``transform``.

    Returns:
        ``(model, out, metrics)``: the fitted model, the RDD of
        ``(prediction, label)`` float pairs, and the
        ``BinaryClassificationMetrics`` built from it.
    """
    model = classifier.fit(training)
    # Pair each prediction with its label as floats for the metrics class.
    # NOTE(review): this feeds the hard "prediction" column, not a
    # probability/score column - confirm that is the intended ROC input.
    out = model.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda x: (float(x[0]), float(x[1])))
    metrics = BinaryClassificationMetrics(out)
    # Report the estimator that was passed in, not a notebook-level global.
    print("Model: {1}. Area under ROC: {0:2f}".format(metrics.areaUnderROC, classifier.__class__))
    return model, out, metrics


def get_vectorized_features(df, cols=['star_rating']):
    """Assemble the columns named in ``cols`` into a single ``features`` column.

    Returns the result of applying a ``VectorAssembler`` to ``df``.
    """
    assembler = VectorAssembler()
    assembler = assembler.setInputCols(cols)
    assembler = assembler.setOutputCol('features')
    return assembler.transform(df)

In [13]:
# Transpose a hard-coded list of (word, count) pairs into two tuples:
# one holding all the words and one holding all the counts.
# NOTE(review): the literals look pasted from the word-count cell's output
# rather than derived from `wc` - confirm, and prefer slicing `wc` directly.
list(zip(*[('camera', 1639956),
 ('great', 680334),
 ('lens', 603607),
 ('use', 560981),
 ('one', 559507),
 ('good', 546468),
 ('would', 432486),
 ('like', 400845),
 ('well', 388333),
 ('get', 383928),
 ('quality', 383196),
 ('battery', 372988),
 ('price', 308681),
 ('works', 297720),
 ('pictures', 295809),
 ('time', 294442),
 ('product', 289350),
 ('also', 267936),
 ('canon', 260621),
 ('really', 259896),
 ('light', 253978),
 ('easy', 252998),
 ('video', 244043),
 ('much', 238650),
 ('even', 231237),
 ('work', 231198),
 ('bought', 230853),
 ('used', 224695),
 ('little', 219582),
 ('case', 218638),
 ('take', 211386),
 ('better', 210599),
 ('mm', 204374),
 ('cameras', 200888),
 ('bag', 199314),
 ('batteries', 196835),
 ('using', 194472),
 ('buy', 193688),
 ('need', 192535),
 ('still', 190584),
 ('nice', 190259),
 ('flash', 187135),
 ('love', 177339),
 ('small', 169254),
 ('got', 168523),
 ('back', 167576),
 ('want', 164729),
 ('could', 164134),
 ('nikon', 162731),
 ('first', 162261),
 ('picture', 158769)]))

[('camera',
  'great',
  'lens',
  'use',
  'one',
  'good',
  'would',
  'like',
  'well',
  'get',
  'quality',
  'battery',
  'price',
  'works',
  'pictures',
  'time',
  'product',
  'also',
  'canon',
  'really',
  'light',
  'easy',
  'video',
  'much',
  'even',
  'work',
  'bought',
  'used',
  'little',
  'case',
  'take',
  'better',
  'mm',
  'cameras',
  'bag',
  'batteries',
  'using',
  'buy',
  'need',
  'still',
  'nice',
  'flash',
  'love',
  'small',
  'got',
  'back',
  'want',
  'could',
  'nikon',
  'first',
  'picture'),
 (1639956,
  680334,
  603607,
  560981,
  559507,
  546468,
  432486,
  400845,
  388333,
  383928,
  383196,
  372988,
  308681,
  297720,
  295809,
  294442,
  289350,
  267936,
  260621,
  259896,
  253978,
  252998,
  244043,
  238650,
  231237,
  231198,
  230853,
  224695,
  219582,
  218638,
  211386,
  210599,
  204374,
  200888,
  199314,
  196835,
  194472,
  193688,
  192535,
  190584,
  190259,
  187135,
  177339,
  169254,
  168523

In [11]:
# Count runs of lowercase letters across every review body, dropping the
# token 'br', then display the counts with the most frequent token first.
texts = review_all.select('review_body')
wc = get_word_counts(texts, exclusions=['br'], pat='([a-z]+)')
list(reversed(wc))

[('camera', 1639956),
 ('great', 680334),
 ('lens', 603607),
 ('use', 560981),
 ('one', 559507),
 ('good', 546468),
 ('would', 432486),
 ('like', 400845),
 ('well', 388333),
 ('get', 383928),
 ('quality', 383196),
 ('battery', 372988),
 ('price', 308681),
 ('works', 297720),
 ('pictures', 295809),
 ('time', 294442),
 ('product', 289350),
 ('also', 267936),
 ('canon', 260621),
 ('really', 259896),
 ('light', 253978),
 ('easy', 252998),
 ('video', 244043),
 ('much', 238650),
 ('even', 231237),
 ('work', 231198),
 ('bought', 230853),
 ('used', 224695),
 ('little', 219582),
 ('case', 218638),
 ('take', 211386),
 ('better', 210599),
 ('mm', 204374),
 ('cameras', 200888),
 ('bag', 199314),
 ('batteries', 196835),
 ('using', 194472),
 ('buy', 193688),
 ('need', 192535),
 ('still', 190584),
 ('nice', 190259),
 ('flash', 187135),
 ('love', 177339),
 ('small', 169254),
 ('got', 168523),
 ('back', 167576),
 ('want', 164729),
 ('could', 164134),
 ('nikon', 162731),
 ('first', 162261),
 ('picture',

In [6]:
# Same count using get_word_counts' default pattern, displayed with the most
# frequent token first.
texts = review_all.select('review_body')
wc = get_word_counts(texts)
list(reversed(wc))

[(' ', 141087569),
 ('.', 9576990),
 (',', 4700929),
 ("'", 2197025),
 ('/', 2048870),
 ('>', 1727926),
 ('<', 1722520),
 ('-', 1496910),
 ('!', 840695),
 (')', 676473),
 ('\\', 639746),
 ('(', 624178),
 ('"', 319309),
 (':', 296622),
 (';', 256400),
 ('&', 243994),
 ('#', 164939),
 ('$', 159892),
 ('?', 117602),
 ('*', 105715),
 (']', 77115),
 ('[', 76766),
 ('=', 42634),
 ('%', 34207),
 ('’', 15690),
 ('~', 10967),
 ('@', 5351),
 ('\xa0', 4293),
 ('`', 3555),
 ('´', 3511),
 ('\x1a', 3411),
 ('…', 2739),
 ('”', 2715),
 ('“', 2220),
 ('–', 1786),
 ('•', 1511),
 ('°', 1167),
 ('®', 938),
 ('^', 932),
 ('|', 873),
 ('—', 861),
 ('👍', 655),
 ('►', 621),
 ('}', 563),
 ('{', 526),
 ('♦', 365),
 ('‘', 321),
 ('★', 305),
 ('¡', 282),
 ('\u200b', 274),
 ('·', 273),
 ('️', 261),
 ('£', 236),
 ('×', 213),
 ('✔', 175),
 ('◆', 174),
 ('！', 168),
 ('，', 167),
 ('¦', 152),
 ('⭐', 140),
 ('€', 128),
 ('✅', 126),
 ('😀', 120),
 ('♥', 119),
 ('™', 113),
 ('😊', 112),
 ('✓', 110),
 ('😃', 108),
 ('░', 108)