# Setup

First obtain dependencies

In [1]:
# Pyspark SQL
from pyspark.sql import SQLContext
# `sc` is not defined anywhere in this notebook; presumably the PySpark kernel
# injects the active SparkContext under that name -- TODO confirm for your setup.
sqlContext = SQLContext(sc)

In [2]:
# pyspark
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover, QuantileDiscretizer

In [3]:
# Install a pip package (langdetect) in the current Jupyter kernel
import sys
!{sys.executable} -m pip install langdetect

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
%load_ext autoreload
%autoreload 2

Fill in the following data

In [5]:
# Login information
# username = # AWS Username
# password = # AWS Password
# region = "us-east-1" # Change if different from your AWS region


# Dataset location
# s3 = #s3a address

Connect to the AWS resources

In [6]:
# Ask the AWS SDK to use Signature Version 4 request signing.
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# `username`, `password` and `region` must have been filled in by the
# "Login information" cell before this cell is run.
hadoop_conf = sc._jsc.hadoopConfiguration()

# The credentials are written under two spellings of the S3A property names:
# the ones this notebook originally used, and `fs.s3a.access.key` /
# `fs.s3a.secret.key`, which are the names listed in the Hadoop S3A documentation.
# Setting both is harmless and keeps the notebook working whichever spelling the
# installed Hadoop version reads.
for access_key_property in ("fs.s3a.awsAccessKeyId", "fs.s3a.access.key"):
    hadoop_conf.set(access_key_property, username)
for secret_key_property in ("fs.s3a.awsSecretAccessKey", "fs.s3a.secret.key"):
    hadoop_conf.set(secret_key_property, password)

# Region-specific S3 endpoint.
hadoop_conf.set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [7]:
# Read the CSV at the `s3` location, treating its first line as column names.
df = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .load(s3)
)

# Project

In [8]:
# Work on plain tuples: the preprocessing functions below address fields by
# position (index 1 = description, index 10 = genres).
rdd = df.rdd.map(tuple)

In [9]:
# small = sc.parallelize(rdd.take(10))

## Preprocessing

Removes non-English data from the dataset

In [10]:
from langdetect import detect

def remove_nonenglish(row):
    '''
    Filter predicate: tells whether a row's description is written in English.

    Input: row - indexable record whose element 1 is the description text
    Output: True if langdetect classifies the description as 'en';
            False for any other language, for a missing or empty description,
            or when detection fails for any reason
    '''
    try:
        text = row[1]
        if not text:
            # None / empty descriptions cannot be classified, so they are dropped.
            return False
        # NOTE(review): langdetect documents its results as non-deterministic
        # unless DetectorFactory.seed is set -- consider seeding for reproducibility.
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: a row that cannot be classified counts as
        # non-English. Catching Exception (rather than a bare `except:`) still
        # lets KeyboardInterrupt / SystemExit propagate.
        return False

def replace_punc_with_space(desc):
    '''
    Insert a space at every lowercase-to-uppercase boundary in a string.

    Despite its name, this function does not touch punctuation: it only adds a
    space between a lowercase character and an uppercase character that
    directly follows it (e.g. 'helloWorld' -> 'hello World').

    Input: desc - description text
    Output: the text with the extra spaces inserted; an empty string is
            returned unchanged
    '''
    if len(desc) == 0:
        # Nothing to split; also avoids indexing desc[-1] on an empty string.
        return desc

    pieces = []
    # Walk over adjacent character pairs (current, next).
    for current, following in zip(desc, desc[1:]):
        pieces.append(current)
        if current.islower() and following.isupper():
            pieces.append(' ')

    # The last character never starts a pair, so append it explicitly.
    pieces.append(desc[-1])
    return ''.join(pieces)

def remove_punc(row):
    '''
    Normalise the description (element 1) of a row.

    The description is first passed through replace_punc_with_space, then
    lower-cased; the characters . , ? ! ( ) / ; : are each replaced by a space
    and apostrophes are removed.

    Input: row - indexable record whose element 1 is the description text
    Output: tuple with the same fields as row, description replaced by the
            normalised text
    '''
    to_space = ('.', ',', '?', '!', '(', ')', '/', ';', ':')

    lowered = replace_punc_with_space(row[1]).lower()

    cleaned = []
    for ch in lowered:
        if ch in to_space:
            cleaned.append(" ")
        elif ch != '\'':
            cleaned.append(ch)
        # apostrophes are dropped entirely

    fields = list(row)
    fields[1] = "".join(cleaned)
    return tuple(fields)


In [11]:
# small = small.filter(remove_nonenglish).map(remove_punc)

Genres converted to array

In [12]:
def genre_to_array(row):
    '''
    Turn the '|'-separated genres string (element 10) of a row into a list.

    Input: row - indexable record whose element 10 is the genres string or None
    Output: tuple with the same fields as row, element 10 replaced by the list
            of genres (an empty list when the genres value is None)
    '''
    raw_genres = row[10]
    parsed = raw_genres.split('|') if raw_genres is not None else []

    # Rebuild the record, swapping in the parsed list at position 10.
    return tuple(parsed if position == 10 else value
                 for position, value in enumerate(row))

Applies the above processes to rdd

In [13]:
# Keep English rows only, normalise their descriptions, then split genres.
# NOTE: this rebinds `rdd`, so re-running this cell without first re-running
# the cell that creates `rdd` applies the transforms to already-processed rows.
rdd = (
    rdd
    .filter(remove_nonenglish)
    .map(remove_punc)
    .map(genre_to_array)
)

In [14]:
# Sanity check: show the first three preprocessed rows.
print(rdd.take(3))

[('Suzanne Collins', 'winning will make you famous  losing means certain death the nation of panem  formed from a post-apocalyptic north america  is a country that consists of a wealthy capitol region surrounded by 12 poorer districts  early in its history  a rebellion led by a 13th district against the capitol resulted in its destruction and the creation of an annual televised event known as the hunger games  in punishment  and as a reminder of the power and grace of the capitol  each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games  the tributes are chosen during the annual reaping and are forced to fight to the death  leaving only one survivor to claim victory when 16-year-old katnisss young sister  prim  is selected as district 12s female representative  katniss volunteers to take her place  she and her male counterpart peeta  are pitted against bigger  stronger representatives  some of whom have trained for

Convert data to dataframe with header names and cast datatypes

In [15]:
# Names for the twelve positional fields of each tuple, in order.
column_names = ['author', 'description', 'edition', 'format', 'isbn13', 'pages',
                'rating', 'rating_count', 'review_count', 'title', 'genres', 'image_url']

# Columns dropped right after naming, leaving author, description, rating,
# title and genres.
unused_columns = ["edition", "format", "pages", "isbn13",
                  "review_count", "image_url", "rating_count"]

books_df = rdd.toDF(column_names).drop(*unused_columns)

# Cast the rating to a fixed-precision decimal
# (presumably it is read from the CSV as a string -- verify).
books_df = books_df.withColumn("rating", books_df["rating"].cast("decimal(3,2)"))
books_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
# Discard rows whose description or rating is null.
books_df = books_df.dropna(subset=('description', 'rating'))

### Bucket Ratings

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.QuantileDiscretizer

Current method is with **QuantileDiscretizer**, which buckets by frequency (equal number in each bucket).

Adjust **numBuckets** to change the bucket sizes for the ratings and get different outcomes. The lower the number of buckets, the more book ratings fall into each bucket.

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer

An alternative is to use **Bucketizer**, which buckets by fixed value ranges (split points) rather than by frequency. Adjust the split points to change the outcome.

Sample code:

```
from pyspark.ml.feature import Bucketizer

split = [0,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3,3.25,3.5,3.75,4,4.25,4.5,4.75,5]
bucketizer = Bucketizer(splits=split, inputCol='rating', outputCol='label')
books_df = bucketizer.transform(books_df)
```

In [17]:
# Bucket the ratings into (up to) 10 quantile-based classes stored in `label`.
discretizer = QuantileDiscretizer(inputCol='rating', outputCol='label', numBuckets=10)
rating_bucketizer = discretizer.fit(books_df)
books_df = rating_bucketizer.transform(books_df)

### TF/IDF

Sources to learn more:

* https://spark.apache.org/docs/latest/ml-features#tf-idf
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
* https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf#scikit-learn-settings

In [18]:
# Confirm that the `label` column produced by the discretizer is present.
books_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)



In [19]:
# Stage 1: split each description into tokens on non-word characters.
tokenizer = RegexTokenizer(inputCol="description", outputCol="descToken", pattern="\\W")
# Stage 2: drop stop words from the token list.
swremover = StopWordsRemover(inputCol="descToken", outputCol="desc")

books_df = swremover.transform(tokenizer.transform(books_df))

# Compare the raw text, its tokens, and the tokens left after stop-word removal.
books_df.select("description", "descToken", "desc").show(3)

+--------------------+--------------------+--------------------+
|         description|           descToken|                desc|
+--------------------+--------------------+--------------------+
|winning will make...|[winning, will, m...|[winning, make, f...|
|there is a door a...|[there, is, a, do...|[door, end, silen...|
|the unforgettable...|[the, unforgettab...|[unforgettable, n...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF

**numFeatures should be adjusted to get better outcomes**

> Since a simple modulo is used to transform the hash function to a column index, it is advisable to use a power of two as the numFeatures parameter; otherwise the features will not be mapped evenly to the columns.

In [20]:
# Hash each filtered token list (`desc`) into a term-frequency vector with
# numFeatures entries; 32 is a power of two, as the HashingTF note above advises.
# NOTE(review): the saved output further down shows 30-dimensional vectors, so it
# appears to predate numFeatures=32 -- re-run the notebook to refresh it.
hashingTF = HashingTF(inputCol="desc", outputCol="raw_features", numFeatures=32)
featurized_data = hashingTF.transform(books_df)

In [21]:
# Fit inverse-document-frequency weights on the term-frequency vectors, then
# rescale them into the final `features` column.
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

In [22]:
# Confirm that the token, raw_features and features columns were added.
rescaled_data.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)
 |-- descToken: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)



In [23]:
# Preview the columns relevant to modelling: rating, its bucket label, and the
# TF-IDF feature vector.
rescaled_data.select("title", "rating", "label", "features").show(10)

+--------------------+------+-----+--------------------+
|               title|rating|label|            features|
+--------------------+------+-----+--------------------+
|    The Hunger Games|  4.33|  8.0|(30,[0,1,2,3,4,5,...|
|Harry Potter and ...|  4.48|  9.0|(30,[0,1,2,3,4,5,...|
|To Kill a Mocking...|  4.27|  8.0|(30,[0,1,3,4,5,6,...|
|            Twilight|  3.58|  0.0|(30,[0,3,5,7,9,10...|
|      The Book Thief|  4.36|  8.0|(30,[0,1,2,3,4,5,...|
|The Chronicles of...|  4.25|  7.0|(30,[0,2,3,4,5,6,...|
|  Gone with the Wind|  4.29|  8.0|(30,[0,1,2,3,5,6,...|
|The Fault in Our ...|  4.24|  7.0|(30,[0,1,3,4,5,6,...|
|   Wuthering Heights|  3.84|  2.0|(30,[0,1,2,3,4,6,...|
|   The Da Vinci Code|  3.81|  2.0|(30,[0,1,2,3,4,5,...|
+--------------------+------+-----+--------------------+
only showing top 10 rows



### Training and Testing datasets

In [24]:
# 70/30 train/test split. A fixed seed makes the split repeatable for identical
# input data, so accuracy figures can be compared across runs.
(trainingData, testData) = rescaled_data.randomSplit([0.7, 0.3], seed=42)

In [25]:
# Shared evaluator: accuracy of the `prediction` column against `label`.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

### Naive Bayes Classification

In [26]:
from pyspark.ml.classification import NaiveBayes

In [None]:
# Fit a multinomial Naive Bayes model (smoothing=1.0) on the TF-IDF features
# of the training split.
nb = NaiveBayes(labelCol="label", featuresCol="features", modelType="multinomial",  smoothing=1.0)
nbModel = nb.fit(trainingData)

In [None]:
# Score the held-out test split and preview predicted vs. actual labels.
# NOTE(review): no accuracy is computed for these predictions in the visible
# cells, unlike the decision tree below -- confirm whether that is intended.
nbPredictions = nbModel.transform(testData)
nbPredictions.select("prediction", "label", "features").show(10)

### Decision Tree Classifier

In [None]:
# Train a decision tree on the training split, score the test split, and
# preview predicted vs. actual labels.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dtModel = dt.fit(trainingData)
dtPredictions = dtModel.transform(testData)
dtPredictions.select("prediction", "label", "features").show(10)

In [None]:
# Accuracy of the decision tree on the test split, using the shared evaluator.
dtAccuracy = evaluator.evaluate(dtPredictions)
print(dtAccuracy)