# Setup

First obtain dependencies

In [1]:
# Pyspark SQL
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [2]:
# scikit
import sklearn
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Install a pip package (langdetect) in the current Jupyter kernel
import sys
!{sys.executable} -m pip install langdetect
!{sys.executable} -m pip install nltk

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
# %load_ext autoreload
# %autoreload 2
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Fill in the following data

In [6]:
# Login information
# username = # AWS Username
# password = # AWS Password
# region = "us-east-1" # Change if different from your AWS region



# Dataset location
# s3 = #S3 dataset URL

Connect to the AWS resources

In [7]:
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", username)
sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", password)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [8]:
df = sqlContext.read.format("csv").option("header", "true").load(s3)

Test to confirm the dataset was obtained

# Project

In [10]:
rdd = df.rdd.map(tuple)

## Preprocessing

In [9]:
from langdetect import detect

def remove_nonenglish(row):
    # Returns True if the tuple's description is written in English, false otherwise    
    
    try:
        lang=detect(row[1])
        if (lang == 'en'): 
            return True
        else: 
            return False
    except:
        return False

def replace_punc_with_space(desc):
    # Returns an updated description string with punctuation directly between two letters replaced with a space
    
    new_desc=''
    
    for i in range(len(desc)-1):
        new_desc+=desc[i]
        if desc[i].islower() and desc[i+1].isupper():
            new_desc+=' '
    
    new_desc+=desc[-1]
    
    return new_desc 

In [11]:
import string

def clean_words(row):
    # Returns tuple with description cleaned 
    # Removes punctuation, tokenizes words, stems them for comparision, filters out stop words
    
    desc = row[1] 
    
    desc = replace_punc_with_space(desc) #Some words in descriptions are not separated by a space, but with punctuation
    desc = desc.lower() #make all lowercase for easy comparing
    
    # split into words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(desc)
    
    # remove punctuation from each word
    punc = str.maketrans('', '', string.punctuation)
    no_punc = [word.translate(punc) for word in words]
    
    # remove remaining tokens that are not alphabetic
    words_alpha = [word for word in no_punc if word.isalpha()]
    
    
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words_alpha if not w in stop_words]    
    
    # stem the words
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    
    lst = list(row)
    lst[1] = words
    tup = tuple(lst)

    return tup
    

In [13]:
def genre_to_array(row):
    # Returns tuple with genres turned into an array
    
    genres = row[10]
    glist = []
    if(genres is not None): glist = genres.split('|')
    
    lst = list(row)
    lst[10] = glist
    tup = tuple(lst)

    return tup

In [14]:
rdd_filtered = rdd.filter(remove_nonenglish).map(clean_words).map(genre_to_array)

In [15]:
rdd_filtered.take(3)

[('Suzanne Collins',
  ['winning',
   'make',
   'famous',
   'losing',
   'means',
   'certain',
   'deaththe',
   'nation',
   'panem',
   'formed',
   'postapocalyptic',
   'north',
   'america',
   'country',
   'consists',
   'wealthy',
   'capitol',
   'region',
   'surrounded',
   'poorer',
   'districts',
   'early',
   'history',
   'rebellion',
   'led',
   'district',
   'capitol',
   'resulted',
   'destruction',
   'creation',
   'annual',
   'televised',
   'event',
   'known',
   'hunger',
   'games',
   'punishment',
   'reminder',
   'power',
   'grace',
   'capitol',
   'district',
   'must',
   'yield',
   'one',
   'boy',
   'one',
   'girl',
   'ages',
   'lottery',
   'system',
   'participate',
   'games',
   'tributes',
   'chosen',
   'annual',
   'reaping',
   'forced',
   'fight',
   'death',
   'leaving',
   'one',
   'survivor',
   'claim',
   'victorywhen',
   'katniss',
   'young',
   'sister',
   'prim',
   'selected',
   'district',
   'female',
   'rep

### TF/IDF

One thing that will need to happen is to use regex to transform the genres into a dataframe

In [68]:
header_df = df.schema.names
header_df

# TO DO
# SEE ERROR BELOW

['book_authors',
 'book_desc',
 'book_edition',
 'book_format',
 'book_isbn',
 'book_pages',
 'book_rating',
 'book_rating_count',
 'book_review_count',
 'book_title',
 'genres',
 'image_url']

### Training and Testing datsets

Currently using ~10% of the dataset (can be adjusted).

In [66]:
# train, test = df.randomSplit([0.1, 0.9])

# train_desc = train.select('book_desc').collect()
# train_genre = train.select('genres').collect()

# test_desc = test.select('book_desc').collect()
# test_genre = test.select('genres').collect()

### Bayes Classification

In [3]:
# model = MultinomialNB().fit(train_desc, train_genre)
# score = model.score(test_desc, test_genre)
# score