In [1]:
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import length, current_date, expr, datediff, regexp_replace, months_between, to_date

DATA_FILE = '../../amazon_reviews_us_Camera_v1_00.tsv.gz'
APP_NAME = 'NLP'
FEATURES = ['star_rating', 'review_body', 'helpful_votes', 'total_votes', 'verified_purchase', 'review_date']
SAMPLE_SIZE = 10000

review_schema = StructType(
    [StructField('marketplace', StringType(), True),
     StructField('customer_id', StringType(), True),
     StructField('review_id', StringType(), True),
     StructField('product_id', StringType(), True),
     StructField('product_parent', StringType(), True),
     StructField('product_title', StringType(), True),
     StructField('product_category', StringType(), True),
     StructField('star_rating', IntegerType(), True),
     StructField('helpful_votes', IntegerType(), True),
     StructField('total_votes', IntegerType(), True),
     StructField('vine', StringType(), True),
     StructField('verified_purchase', StringType(), True),
     StructField('review_headline', StringType(), True),
     StructField('review_body', StringType(), True),
     StructField('review_date', StringType(), True)])

spark = (ps.sql.SparkSession.builder
         .master("local[1]")
         .appName(APP_NAME)
         .getOrCreate()
         )
sc = spark.sparkContext

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("mode", "FAILFAST") \
    .option("sep", "\t") \
    .schema(review_schema) \
    .load(DATA_FILE)
df.createOrReplaceTempView("eda_sql_view")

review_all = df.select(FEATURES)
review_sample = df.select(FEATURES).limit(SAMPLE_SIZE).cache()


In [18]:
# %load helpers.py
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql.functions import current_date, expr, datediff, to_date
from pyspark.sql.functions import length, regexp_replace

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import re


def get_kv_pairs(row, exclusions=[]):
    # get the text from the row entry
    text = str(row.review_body).lower()
    # create blacklist of words
    blacklist = set(stopwords.words('english'))
    # add explicit words
    [blacklist.add(i) for i in exclusions]
    # extract all words
    words = re.findall(r'([^\w+])', text)
    # for each word, send back a count of 1
    # send a list of lists
    return [[w, 1] for w in words if w not in blacklist]


def get_word_counts(texts, exclusions=[]):
    mapped_rdd = texts.rdd.flatMap(lambda row: get_kv_pairs(row, exclusions))
    counts_rdd = mapped_rdd.reduceByKey(lambda a, b: a + b).sortBy(lambda a: a[1])
    return counts_rdd.collect()


def convert_str_to_int(df, col='verified_purchase', type_='int'):
    return df.select((df[col] == 'Y').cast(type_))


def get_review_age(df):
    return df.select(datediff(current_date(), to_date(df['review_date'])))


def prepare_features(df):
    df = df.withColumn('exclam', length('review_body') - length(regexp_replace('review_body', '\!', '')))
    df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(['star_rating', 'helfulness', 'age', 'review_length', 'label']).na.fill(0)
    return select_cols


def split_data(df, rate=.9):
    training = df.sampleBy("label", fractions={0: rate, 1: rate}, seed=12)
    return training, df.subtract(training)


def get_auc_roc(classifier, training, test):
    model = classifier.fit(training)
    out = model.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda x: (float(x[0]), float(x[1])))
    metrics = BinaryClassificationMetrics(out)
    print("Model: {1}. Area under ROC: {0:2f}".format(metrics.areaUnderROC, clf.__class__))
    return model, out, metrics


def get_vectorized_features(df, cols=['star_rating']):
    va = VectorAssembler().setInputCols(cols).setOutputCol(
        'features')
    return va.transform(df)

In [19]:
texts=review_sample.select('review_body')
wc=get_word_counts(texts)
wc

[('😀', 2),
 ('®', 2),
 ('^', 2),
 ('✏', 3),
 ('📍', 3),
 ('⭐', 3),
 ('\x1a', 3),
 ('♡', 4),
 ('}', 5),
 ('❖', 5),
 ('{', 6),
 ('°', 6),
 ('️', 8),
 ('👍', 8),
 ('—', 8),
 ('`', 9),
 ('…', 10),
 ('░', 10),
 ('´', 13),
 ('@', 14),
 ('•', 18),
 ('~', 19),
 ('–', 36),
 ('“', 46),
 ('”', 74),
 ('"', 75),
 ('%', 115),
 ('=', 117),
 ('\\', 164),
 ('[', 187),
 (']', 187),
 ('’', 290),
 ('*', 386),
 ('?', 387),
 ('$', 419),
 (':', 1070),
 ('#', 1407),
 ('(', 1631),
 (';', 1637),
 ('&', 1658),
 (')', 1736),
 ('-', 3644),
 ('!', 4067),
 ('<', 5597),
 ('>', 5597),
 ('/', 6779),
 ("'", 6973),
 (',', 14070),
 ('.', 32871),
 (' ', 457860)]

In [22]:
wc[::-1]

[(' ', 457860),
 ('.', 32871),
 (',', 14070),
 ("'", 6973),
 ('/', 6779),
 ('>', 5597),
 ('<', 5597),
 ('!', 4067),
 ('-', 3644),
 (')', 1736),
 ('&', 1658),
 (';', 1637),
 ('(', 1631),
 ('#', 1407),
 (':', 1070),
 ('$', 419),
 ('?', 387),
 ('*', 386),
 ('’', 290),
 (']', 187),
 ('[', 187),
 ('\\', 164),
 ('=', 117),
 ('%', 115),
 ('"', 75),
 ('”', 74),
 ('“', 46),
 ('–', 36),
 ('~', 19),
 ('•', 18),
 ('@', 14),
 ('´', 13),
 ('░', 10),
 ('…', 10),
 ('`', 9),
 ('—', 8),
 ('👍', 8),
 ('️', 8),
 ('°', 6),
 ('{', 6),
 ('❖', 5),
 ('}', 5),
 ('♡', 4),
 ('\x1a', 3),
 ('⭐', 3),
 ('📍', 3),
 ('✏', 3),
 ('^', 2),
 ('®', 2),
 ('😀', 2),
 ('📷', 2),
 ('›', 2),
 ('🌟', 2),
 ('\xa0', 2),
 ('‘', 2),
 ('😊', 2),
 ('😄', 1),
 ('😕', 1),
 ('☺', 1),
 ('🏽', 1),
 ('🏾', 1),
 ('👏', 1),
 ('💘', 1),
 ('😍', 1),
 ('❤', 1),
 ('\u200b', 1),
 ('🎁', 1),
 ('😣', 1),
 ('😁', 1),
 ('🏻', 1),
 ('👌', 1),
 ('👎', 1),
 ('∞', 1),
 ('♥', 1),
 ('，', 1)]