# Setup

First obtain dependencies

In [2]:
# Pyspark SQL
from pyspark.sql import SQLContext
# Wrap the existing SparkContext in a SQLContext so DataFrames can be read below.
# NOTE(review): `sc` is not created in any visible cell; presumably it is the
# SparkContext provided by the PySpark kernel -- confirm.
sqlContext = SQLContext(sc)

In [3]:
# scikit
import sklearn
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Install a pip package (langdetect) in the current Jupyter kernel
import sys
!{sys.executable} -m pip install langdetect

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [5]:
%load_ext autoreload
%autoreload 2

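Fill in the following values (uncomment each line and set it). The cells below read `username`, `password`, `region`, and `s3` (the path of the dataset), so all four must be defined before continuing.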

In [6]:
# Login information
# Uncomment and fill in each value before running the cells below.
# NOTE(review): avoid saving real credentials in the notebook; consider reading
# them from environment variables or getpass instead.
# username = # AWS Username (passed as the S3 access key ID in the connection cell)
# password = # AWS Password (passed as the S3 secret access key in the connection cell)
# region = "us-east-1" # Change if different from your AWS region
# s3 = # Path of the CSV dataset on S3 (read by the load cell; not assigned in any visible cell)



Connect to the AWS resources

In [7]:
# Point the Hadoop S3A connector at the bucket's region and hand it the
# credentials defined in the login cell (`username`, `password`, `region`).
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# `_jsc` is PySpark's handle on the underlying Java SparkContext; the Hadoop
# configuration is only reachable through it.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Property names documented for the S3A connector.
hadoop_conf.set("fs.s3a.access.key", username)
hadoop_conf.set("fs.s3a.secret.key", password)

# Names used by the original version of this cell. They follow the older
# s3/s3n naming pattern; kept so any setup relying on them is unaffected.
hadoop_conf.set("fs.s3a.awsAccessKeyId", username)
hadoop_conf.set("fs.s3a.awsSecretAccessKey", password)

hadoop_conf.set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [8]:
# Load the CSV found at the path in `s3`, with the header option enabled.
# NOTE(review): `s3` is not assigned in any visible cell -- define it first.
df = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .load(s3)
)

Test to confirm the dataset was obtained

In [9]:
# Fetch the first three records as a sanity check.
# Despite the name, the result is a plain Python list of Row objects
# (see the output below), not a DataFrame.
smalldf = df.take(3)
smalldf

[Row(book_authors='Suzanne Collins', book_desc="Winning will make you famous. Losing means certain death.The nation of Panem, formed from a post-apocalyptic North America, is a country that consists of a wealthy Capitol region surrounded by 12 poorer districts. Early in its history, a rebellion led by a 13th district against the Capitol resulted in its destruction and the creation of an annual televised event known as the Hunger Games. In punishment, and as a reminder of the power and grace of the Capitol, each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games. The 'tributes' are chosen during the annual Reaping and are forced to fight to the death, leaving only one survivor to claim victory.When 16-year-old Katniss's young sister, Prim, is selected as District 12's female representative, Katniss volunteers to take her place. She and her male counterpart Peeta, are pitted against bigger, stronger representatives,

# Project

In [10]:
# Convert every Row to a plain tuple so the preprocessing functions can address
# fields by position (index 1 is book_desc and index 10 is genres, per the
# column listing shown later in this notebook).
rdd = df.rdd.map(tuple)

In [11]:
# Ten-record sample for trying out the preprocessing steps: take(10) returns the
# records as a local list, and parallelize turns that list back into an RDD.
small = sc.parallelize(rdd.take(10))

## Preprocessing

In [12]:
from langdetect import detect

def remove_nonenglish(row):
    '''
    Predicate for RDD.filter: keep only records whose description is English.

    Input: row - indexable record whose element 1 is the book description
    Output: True when langdetect classifies the description as 'en';
            False for any other language, for a missing, blank or
            non-string description, or when detection fails.

    Note: the langdetect documentation describes detection as
    non-deterministic unless DetectorFactory.seed is set, so borderline
    descriptions may be classified differently between runs.
    '''
    try:
        desc = row[1]
        # Nothing to classify: skip the detector for missing or blank text.
        if not isinstance(desc, str) or not desc.strip():
            return False
        return detect(desc) == 'en'
    except Exception:
        # Best-effort filter: a record that cannot be classified is dropped.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False

def replace_punc_with_space(desc):
    '''
    Insert a space wherever a lowercase character is immediately followed by
    an uppercase one (e.g. 'helloWorld' -> 'hello World').

    Despite the name, no punctuation is touched here; the function only
    separates words that were glued together without whitespace.

    Input: desc - description text (str)
    Output: the text with spaces inserted at lower->upper boundaries;
            an empty string when desc is empty or None.
    '''
    # Guard the empty case: indexing desc[-1] below would raise IndexError.
    if not desc:
        return ''

    pieces = []
    # Walk adjacent character pairs; the final character is appended afterwards.
    for current, following in zip(desc, desc[1:]):
        pieces.append(current)
        if current.islower() and following.isupper():
            pieces.append(' ')
    pieces.append(desc[-1])

    # A single join avoids the quadratic cost of repeated string concatenation.
    return ''.join(pieces)

def remove_punc(row):
    '''
    Normalise the description (element 1) of a record.

    The description is first passed through replace_punc_with_space, then
    lowercased; the characters . , ? ! ( ) / ; : are each replaced by a
    space and apostrophes are removed.

    Input: row - indexable record whose element 1 is the description text
    Output: a new tuple equal to row except for the cleaned element 1
    '''
    lowered = replace_punc_with_space(row[1]).lower()

    # One translation table covers both substitutions: the listed marks map
    # to a space, the apostrophe maps to None (i.e. it is deleted).
    table = {ord(mark): ' ' for mark in ('.', ',', '?', '!', '(', ')', '/', ';', ':')}
    table[ord('\'')] = None
    cleaned = lowered.translate(table)

    fields = list(row)
    fields[1] = cleaned
    return tuple(fields)


    
    

In [13]:
# Keep only the English-language records of the sample, then normalise
# their descriptions.
small = (
    small
    .filter(remove_nonenglish)
    .map(remove_punc)
)


In [14]:
def genre_to_array(row):
    '''
    Turn the pipe-delimited genres field (element 10) into a list.

    Input: row - indexable record with at least 11 elements
    Output: a new tuple equal to row except that element 10 is the list
            obtained by splitting on '|', or an empty list when the
            original value is None.
    '''
    raw_genres = row[10]
    genre_list = [] if raw_genres is None else raw_genres.split('|')

    # Rebuild the record, swapping in the parsed list at position 10.
    return tuple(
        genre_list if position == 10 else value
        for position, value in enumerate(row)
    )

In [15]:
# Replace the pipe-delimited genres string (index 10) of each record with a list.
# NOTE(review): this cell rebinds `small` from itself, so running it a second
# time would pass lists to genre_to_array, whose .split('|') call would then fail.
small = small.map(genre_to_array)

In [16]:
# Bring the processed sample back as a local list to inspect the result; the
# sample starts from at most 10 records, so collecting everything is cheap.
small.collect()

[('Suzanne Collins',
  'winning will make you famous  losing means certain death the nation of panem  formed from a post-apocalyptic north america  is a country that consists of a wealthy capitol region surrounded by 12 poorer districts  early in its history  a rebellion led by a 13th district against the capitol resulted in its destruction and the creation of an annual televised event known as the hunger games  in punishment  and as a reminder of the power and grace of the capitol  each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games  the tributes are chosen during the annual reaping and are forced to fight to the death  leaving only one survivor to claim victory when 16-year-old katnisss young sister  prim  is selected as district 12s female representative  katniss volunteers to take her place  she and her male counterpart peeta  are pitted against bigger  stronger representatives  some of whom have trained f

In [14]:
# RDD.collect()

In [17]:
# Apply the same three preprocessing steps tried on the sample to the full RDD:
# drop non-English records, normalise descriptions, split genres into lists.
rdd_filtered = (
    rdd
    .filter(remove_nonenglish)
    .map(remove_punc)
    .map(genre_to_array)
)

In [18]:
# Spot-check the first three preprocessed records rather than collecting the
# whole dataset.
rdd_filtered.take(3)

[('Suzanne Collins',
  'winning will make you famous  losing means certain death the nation of panem  formed from a post-apocalyptic north america  is a country that consists of a wealthy capitol region surrounded by 12 poorer districts  early in its history  a rebellion led by a 13th district against the capitol resulted in its destruction and the creation of an annual televised event known as the hunger games  in punishment  and as a reminder of the power and grace of the capitol  each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games  the tributes are chosen during the annual reaping and are forced to fight to the death  leaving only one survivor to claim victory when 16-year-old katnisss young sister  prim  is selected as district 12s female representative  katniss volunteers to take her place  she and her male counterpart peeta  are pitted against bigger  stronger representatives  some of whom have trained f

### TF/IDF

**TODO:** use a regex to transform the genres into a DataFrame.

In [68]:
# Column names of the loaded DataFrame in positional order. Despite the name,
# this is a plain list of strings (see the output below), not a DataFrame.
header_df = df.schema.names
header_df

# TO DO
# SEE ERROR BELOW
# NOTE(review): the recorded output below is the column list, not an error --
# the note above may be stale; confirm what still needs doing here.

['book_authors',
 'book_desc',
 'book_edition',
 'book_format',
 'book_isbn',
 'book_pages',
 'book_rating',
 'book_rating_count',
 'book_review_count',
 'book_title',
 'genres',
 'image_url']

### Training and Testing datasets

Currently using ~10% of the dataset for training and the remaining ~90% for testing (the split weights can be adjusted).

In [66]:
# train, test = df.randomSplit([0.1, 0.9])

# train_desc = train.select('book_desc').collect()
# train_genre = train.select('genres').collect()

# test_desc = test.select('book_desc').collect()
# test_genre = test.select('genres').collect()

### Bayes Classification

In [3]:
# model = MultinomialNB().fit(train_desc, train_genre)
# score = model.score(test_desc, test_genre)
# score