# Setup

First obtain dependencies

In [2]:
# Pyspark SQL
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [3]:
# scikit
import sklearn
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB

In [4]:
# pyspark
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql.types import DoubleType

In [5]:
# Install a pip package (langdetect) in the current Jupyter kernel
import sys
!{sys.executable} -m pip install langdetect

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
%load_ext autoreload
%autoreload 2

Fill in the following data

In [7]:
# Login information
# username = # AWS Username
# password = # AWS Password
# region = "us-east-1" # Change if different from your AWS region


# Dataset location
# s3 = #s3a address

Connect to the AWS resources

In [8]:
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", username)
sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", password)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [9]:
df = sqlContext.read.format("csv").option("header", "true").load(s3)

# Project

In [11]:
rdd = df.rdd.map(tuple)

In [12]:
small = sc.parallelize(rdd.take(10))

## Preprocessing

Removes non-English data from the dataset

In [13]:
from langdetect import detect

def remove_nonenglish(row):
    '''
    Removes records that have invalid descriptions from the dataframe
    Input: dataframe
    Output: Cleaned up dataframe
    '''
    try:
        lang=detect(row[1])
        if (lang == 'en'): 
            return True
        else: 
            return False
    except:
        return False

def replace_punc_with_space(desc):
    
    upd_desc=''
    
    for i in range(len(desc)-1):
        upd_desc+=desc[i]
        if desc[i].islower() and desc[i+1].isupper():
            upd_desc+=' '
    
    upd_desc+=desc[-1]
    return upd_desc 

def remove_punc(row):
    desc = row[1]
    
    desc=replace_punc_with_space(desc)    
    desc=desc.lower() 
    desc = "".join([" " if char in ['.', ',', '?', '!', '(', ')', '/', ';', ':'] else char for char in desc])
    desc = "".join(["" if char in ['\''] else char for char in desc])
    
    lst = list(row)
    lst[1] = desc
    tup = tuple(lst)

    return tup


In [14]:
small = small.filter(remove_nonenglish).map(remove_punc)

Genres, description, and authors converted to arrays

In [15]:
def genre_to_array(row):
    genres = row[10]
    glist = []
    if(genres is not None): glist = genres.split('|')
    
    lst = list(row)
    lst[10] = glist
    tup = tuple(lst)

    return tup

In [16]:
def authors_to_array(row):
    authors = row[0]
    authorList = []
    if(authors is not None): authorList = authors.split('|')
    
    lst = list(row)
    lst[0] = authorList
    tup = tuple(lst)

    return tup

In [17]:
def description_to_array(row):
    description = row[1]
    descriptionList = []
    if(description is not None): descriptionList = description.split(' ')
    
    lst = list(row)
    lst[1] = descriptionList
    tup = tuple(lst)

    return tup

Applies the above processes to rdd

In [18]:
rdd_filtered = rdd.filter(remove_nonenglish).map(remove_punc).map(genre_to_array).map(authors_to_array).map(description_to_array)

In [19]:
print(rdd_filtered.take(3))

[(['Suzanne Collins'], ['winning', 'will', 'make', 'you', 'famous', '', 'losing', 'means', 'certain', 'death', 'the', 'nation', 'of', 'panem', '', 'formed', 'from', 'a', 'post-apocalyptic', 'north', 'america', '', 'is', 'a', 'country', 'that', 'consists', 'of', 'a', 'wealthy', 'capitol', 'region', 'surrounded', 'by', '12', 'poorer', 'districts', '', 'early', 'in', 'its', 'history', '', 'a', 'rebellion', 'led', 'by', 'a', '13th', 'district', 'against', 'the', 'capitol', 'resulted', 'in', 'its', 'destruction', 'and', 'the', 'creation', 'of', 'an', 'annual', 'televised', 'event', 'known', 'as', 'the', 'hunger', 'games', '', 'in', 'punishment', '', 'and', 'as', 'a', 'reminder', 'of', 'the', 'power', 'and', 'grace', 'of', 'the', 'capitol', '', 'each', 'district', 'must', 'yield', 'one', 'boy', 'and', 'one', 'girl', 'between', 'the', 'ages', 'of', '12', 'and', '18', 'through', 'a', 'lottery', 'system', 'to', 'participate', 'in', 'the', 'games', '', 'the', 'tributes', 'are', 'chosen', 'during

Convert data to dataframe with header names and cast datatypes

In [20]:
books_df = rdd_filtered.toDF(['author', 'description', 'edition', 'format', 'isbn13', 'pages', 'rating', 'rating_count', 'review_count', 'title', 'genres', 'image_url']) \
    .drop("edition") \
    .drop("format") \
    .drop("pages") \
    .drop("isbn13") \
    .drop("review_count") \
    .drop("image_url")

books_df = books_df.withColumn("rating", books_df["rating"].cast("decimal(3,2)")) \
    .withColumn("rating_count", books_df["rating_count"].cast("long"))

books_df.printSchema()

root
 |-- author: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- rating_count: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [21]:
books_df.show(n=5)

+--------------------+--------------------+------+------------+--------------------+--------------------+
|              author|         description|rating|rating_count|               title|              genres|
+--------------------+--------------------+------+------------+--------------------+--------------------+
|   [Suzanne Collins]|[winning, will, m...|  4.33|     5519135|    The Hunger Games|[Young Adult, Fic...|
|[J.K. Rowling, Ma...|[there, is, a, do...|  4.48|     2041594|Harry Potter and ...|[Fantasy, Young A...|
|        [Harper Lee]|[the, unforgettab...|  4.27|     3745197|To Kill a Mocking...|[Classics, Fictio...|
|   [Stephenie Meyer]|[about, three, th...|  3.58|     4281268|            Twilight|[Young Adult, Fan...|
|      [Markus Zusak]|[trying, to, make...|  4.36|     1485632|      The Book Thief|[Historical, Hist...|
+--------------------+--------------------+------+------------+--------------------+--------------------+
only showing top 5 rows



### TF/IDF

Sources to learn more:

* https://spark.apache.org/docs/latest/ml-features#tf-idf
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
* https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf#scikit-learn-settings

In [22]:
htf = HashingTF(inputCol="description", outputCol="tf")
tf = htf.transform(books_df)
tf.show(n=3)

+--------------------+--------------------+------+------------+--------------------+--------------------+--------------------+
|              author|         description|rating|rating_count|               title|              genres|                  tf|
+--------------------+--------------------+------+------------+--------------------+--------------------+--------------------+
|   [Suzanne Collins]|[winning, will, m...|  4.33|     5519135|    The Hunger Games|[Young Adult, Fic...|(262144,[1882,271...|
|[J.K. Rowling, Ma...|[there, is, a, do...|  4.48|     2041594|Harry Potter and ...|[Fantasy, Young A...|(262144,[426,9639...|
|        [Harper Lee]|[the, unforgettab...|  4.27|     3745197|To Kill a Mocking...|[Classics, Fictio...|(262144,[9639,101...|
+--------------------+--------------------+------+------------+--------------------+--------------------+--------------------+
only showing top 3 rows



Note: this takes a long time to complete. My internet may be bad right now or AWS may have a slowdown, but this took 5-10 minut

In [None]:
idf = IDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(n=3)

### Training and Testing datsets

In [24]:
train, test = tfidf.randomSplit([0.7, 0.3])

In [None]:
train_description = train.select('description').collect()
train_rating = train.select('rating').collect()

test_description = test.select('description').collect()
test_rating = test.select('rating').collect()

In [None]:
train_genre = train.select('genres').collect()
test_genre = test.select('genres').collect()

### Bayes Classification

In [3]:
model = MultinomialNB().fit(train_description, train_rating)
score = model.score(test_description, test_rating)
score