# Setup

First obtain dependencies

In [1]:
# Pyspark SQL
# `sc` is not created anywhere in this notebook; it is presumably the
# SparkContext pre-defined by the PySpark kernel/shell -- TODO confirm.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [2]:
# scikit
import sklearn
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB

In [3]:
# pyspark
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover

In [5]:
# Install a pip package (langdetect) in the current Jupyter kernel.
# Invoking pip through sys.executable targets the interpreter this kernel is
# running on rather than whichever `pip` happens to be first on PATH.
# NOTE(review): the package version is not pinned -- consider pinning it.
import sys
!{sys.executable} -m pip install langdetect

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
# Enable the autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Fill in the following data

In [5]:
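# Login information -- uncomment and fill in before running the cells below.
# Do not commit real credentials with the notebook.
# username = # AWS Username (passed below as the S3A access key ID)
# password = # AWS Password (passed below as the S3A secret access key)
# region = "us-east-1" # Change if different from your AWS region


# Dataset location
# s3 = # s3a:// address of the CSV dataset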


Connect to the AWS resources

In [6]:
# Configure the S3A connector on the running SparkContext: turn on V4 request
# signing, pass the credentials, and point at the region-specific endpoint.
# Requires `username`, `password` and `region` from the cell above.
# NOTE(review): current S3A documentation names the credential properties
# fs.s3a.access.key / fs.s3a.secret.key -- confirm the awsAccessKeyId /
# awsSecretAccessKey spellings are honoured by the Hadoop version in use.
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", username)
sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", password)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [7]:
# Load the dataset as CSV from the configured S3 location, using the first
# line of the file as the column header.
df = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .load(s3)
)

# Project

In [8]:
# Drop down to the RDD API and turn every Row into a plain tuple so the
# preprocessing functions can address fields by position.
rdd = df.rdd.map(tuple)

In [9]:
# small = sc.parallelize(rdd.take(10))

## Preprocessing

Removes non-English data from the dataset

In [10]:
from langdetect import detect

def remove_nonenglish(row):
    '''
    Filter predicate: keep only records whose description is detected as English.

    Input: row - an indexable record whose element 1 is the description text
    Output: True when detect() reports 'en' for the description; False for any
            other language or when detection raises for any reason
    '''
    try:
        return detect(row[1]) == 'en'
    except Exception:
        # Best-effort filter: a record whose language cannot be determined is
        # dropped rather than failing the whole job. Catching Exception (not a
        # bare except) still lets KeyboardInterrupt / SystemExit propagate.
        return False

def replace_punc_with_space(desc):
    '''
    Insert a space wherever a lowercase character is immediately followed by
    an uppercase one (e.g. 'victoryWhen' -> 'victory When').

    Input: desc - description text
    Output: the text with a space added at every lower->upper boundary;
            an empty or None description yields ''
    '''
    if not desc:
        # The final-character append below would raise IndexError on ''.
        return ''

    pieces = []
    # Walk adjacent character pairs; the last character has no successor and
    # is appended separately after the loop.
    for current, following in zip(desc, desc[1:]):
        pieces.append(current)
        if current.islower() and following.isupper():
            pieces.append(' ')
    pieces.append(desc[-1])

    # Join once instead of growing a string with += inside the loop.
    return ''.join(pieces)

def remove_punc(row):
    '''
    Normalise the description (element 1) of a record.

    The description is passed through replace_punc_with_space, lower-cased,
    has each of . , ? ! ( ) / ; : replaced by a space, and has apostrophes
    removed.

    Input: row - an indexable record whose element 1 is the description text
    Output: a tuple with the same fields as row, element 1 holding the
            normalised description
    '''
    to_space = '.,?!()/;:'
    # One translation pass: listed punctuation -> space, apostrophe -> deleted.
    table = str.maketrans(to_space, ' ' * len(to_space), '\'')

    cleaned = replace_punc_with_space(row[1]).lower().translate(table)

    fields = list(row)
    fields[1] = cleaned
    return tuple(fields)


In [13]:
# small = small.filter(remove_nonenglish).map(remove_punc)

Genres converted to array

In [11]:
def genre_to_array(row):
    '''
    Turn the '|'-separated genre string (element 10) of a record into a list.

    Input: row - an indexable record whose element 10 is the genre string or None
    Output: a tuple with the same fields as row, element 10 holding the list of
            genres (an empty list when the genre field is None)
    '''
    raw_genres = row[10]

    fields = list(row)
    fields[10] = [] if raw_genres is None else raw_genres.split('|')
    return tuple(fields)

Applies the above processes to rdd

In [13]:
# Build the cleaned RDD straight from `df` rather than from `rdd` itself, so
# that re-running this cell does not apply the cleaning steps a second time
# (genre_to_array would then call .split() on an already-split list).
rdd = (
    df.rdd.map(tuple)
    .filter(remove_nonenglish)
    .map(remove_punc)
    .map(genre_to_array)
)

In [14]:
# Peek at the first three cleaned records to sanity-check the preprocessing.
print(rdd.take(3))

[('Suzanne Collins', 'winning will make you famous  losing means certain death the nation of panem  formed from a post-apocalyptic north america  is a country that consists of a wealthy capitol region surrounded by 12 poorer districts  early in its history  a rebellion led by a 13th district against the capitol resulted in its destruction and the creation of an annual televised event known as the hunger games  in punishment  and as a reminder of the power and grace of the capitol  each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games  the tributes are chosen during the annual reaping and are forced to fight to the death  leaving only one survivor to claim victory when 16-year-old katnisss young sister  prim  is selected as district 12s female representative  katniss volunteers to take her place  she and her male counterpart peeta  are pitted against bigger  stronger representatives  some of whom have trained for

Convert data to dataframe with header names and cast datatypes

In [15]:
# Name the positional tuple fields, then discard the columns that are not
# used later on (this includes rating_count).
books_df = (
    rdd.toDF([
        'author', 'description', 'edition', 'format', 'isbn13', 'pages',
        'rating', 'rating_count', 'review_count', 'title', 'genres', 'image_url',
    ])
    .drop("edition", "format", "pages", "isbn13", "review_count", "image_url", "rating_count")
)

# Store the rating as a fixed-point decimal.
books_df = books_df.withColumn("rating", books_df["rating"].cast("decimal(3,2)"))

books_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- rating_count: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
# Preview the first five rows of the trimmed, typed DataFrame.
books_df.show(n=5)

+--------------------+--------------------+------+------------+--------------------+--------------------+
|              author|         description|rating|rating_count|               title|              genres|
+--------------------+--------------------+------+------------+--------------------+--------------------+
|     Suzanne Collins|winning will make...|  4.33|     5519135|    The Hunger Games|[Young Adult, Fic...|
|J.K. Rowling|Mary...|there is a door a...|  4.48|     2041594|Harry Potter and ...|[Fantasy, Young A...|
|          Harper Lee|the unforgettable...|  4.27|     3745197|To Kill a Mocking...|[Classics, Fictio...|
|     Stephenie Meyer|about three thing...|  3.58|     4281268|            Twilight|[Young Adult, Fan...|
|        Markus Zusak|trying to make se...|  4.36|     1485632|      The Book Thief|[Historical, Hist...|
+--------------------+--------------------+------+------------+--------------------+--------------------+
only showing top 5 rows



### TF/IDF

Sources to learn more:

* https://spark.apache.org/docs/latest/ml-features#tf-idf
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
* https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf#scikit-learn-settings

In [17]:
# Split each description on non-word characters into a list of tokens.
books_df = RegexTokenizer(
    inputCol="description",
    outputCol="desc_words",
    pattern="\\W",
).transform(books_df)

# Remove stop words from the token lists.
books_df = StopWordsRemover(
    inputCol="desc_words",
    outputCol="desc_filtered",
).transform(books_df)

books_df.select("description", "desc_words", "desc_filtered").show(3)

+--------------------+--------------------+--------------------+
|         description|          desc_words|       desc_filtered|
+--------------------+--------------------+--------------------+
|winning will make...|[winning, will, m...|[winning, make, f...|
|there is a door a...|[there, is, a, do...|[door, end, silen...|
|the unforgettable...|[the, unforgettab...|[unforgettable, n...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [18]:
# Hash the filtered tokens into term-frequency vectors.
books_df = HashingTF(
    inputCol="desc_filtered",
    outputCol="tf",
).transform(books_df)

books_df.select("title", "desc_filtered", "tf").show(n=3)

+--------------------+--------------------+--------------------+
|               title|       desc_filtered|                  tf|
+--------------------+--------------------+--------------------+
|    The Hunger Games|[winning, make, f...|(262144,[1882,271...|
|Harry Potter and ...|[door, end, silen...|(262144,[9916,110...|
|To Kill a Mocking...|[unforgettable, n...|(262144,[10102,13...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



Note: this step takes some time to complete.

In [20]:
# Fit the IDF model on the term-frequency vectors, then apply it to the same
# DataFrame to add the rescaled "idf" column.
books_df = (
    IDF(inputCol="tf", outputCol="idf")
    .fit(books_df)
    .transform(books_df)
)

books_df.select("title", "desc_filtered", "tf", "idf").show(n=3)

+--------------------+--------------------+--------------------+--------------------+
|               title|       desc_filtered|                  tf|                 idf|
+--------------------+--------------------+--------------------+--------------------+
|    The Hunger Games|[winning, make, f...|(262144,[1882,271...|(262144,[1882,271...|
|Harry Potter and ...|[door, end, silen...|(262144,[9916,110...|(262144,[9916,110...|
|To Kill a Mocking...|[unforgettable, n...|(262144,[10102,13...|(262144,[10102,13...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



### Training and Testing datasets

In [26]:
# 70/30 train/test split. A fixed seed makes the split reproducible across
# kernel restarts; change the value to draw a different split.
train, test = books_df.randomSplit([0.7, 0.3], seed=42)

# Both splits are read several times below, so keep them cached.
train.cache()
test.cache()

DataFrame[author: string, description: string, rating: decimal(3,2), rating_count: bigint, title: string, genres: array<string>, desc_words: array<string>, desc_filtered: array<string>, tf: vector, idf: vector]

In [31]:
# Collect the description column of each split as a list of Row objects.
# NOTE(review): the recorded run of this cell lost its connection to the JVM;
# collecting every description at once may be exhausting memory -- confirm.
train_description = train.select('description').collect()
test_description = test.select('description').collect()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:43163)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.2.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:43163)

In [None]:
# Collect the rating column of each split as a list of Row objects.
train_rating = train.select('rating').collect()
test_rating = test.select('rating').collect()

In [None]:
# Collect the genres column of each split as a list of Row objects.
# NOTE(review): these lists are not used by any later cell in this notebook.
train_genre = train.select('genres').collect()
test_genre = test.select('genres').collect()

### Bayes Classification

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def _texts_and_labels(description_rows, rating_rows):
    """Pair each collected description with a whole-star class label.

    Both arguments are the lists of single-field Rows returned by collect().
    Rows whose rating is null are skipped.
    """
    texts, labels = [], []
    # NOTE(review): descriptions and ratings were collected by separate
    # actions and are paired here by position -- confirm the row order is
    # stable, or collect both columns with a single select().
    for description_row, rating_row in zip(description_rows, rating_rows):
        rating = rating_row[0]
        if rating is None:
            continue
        texts.append(description_row[0])
        # MultinomialNB is a classifier and needs discrete labels, so the
        # decimal rating is rounded to the nearest whole star.
        # TODO confirm this bucketing is the intended target.
        labels.append(int(round(float(rating))))
    return texts, labels


train_texts, train_labels = _texts_and_labels(train_description, train_rating)
test_texts, test_labels = _texts_and_labels(test_description, test_rating)

# MultinomialNB needs non-negative numeric features, so turn the text into
# bag-of-words counts; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer()
train_desc = vectorizer.fit_transform(train_texts)
test_desc = vectorizer.transform(test_texts)

model = MultinomialNB().fit(train_desc, train_labels)
score = model.score(test_desc, test_labels)
score