In [0]:
# Pyspark SQL
from pyspark.sql import SQLContext
# Wrap the SparkContext in a SQLContext; it is used further down to read the CSV
# source into a DataFrame.
# NOTE(review): `sc` is never defined in this notebook — presumably the PySpark
# kernel injects it; confirm before running on a plain Python kernel.
sqlContext = SQLContext(sc)

In [0]:
# scikit
import sklearn
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB

# Spark ML: feature extraction, classifier and evaluator used by the modelling cells
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF

In [0]:
# Install the third-party packages this notebook needs (langdetect, nltk) into the
# current Jupyter kernel. Invoking pip through `sys.executable` targets the
# interpreter the kernel is running rather than whichever `pip` is first on PATH.
import sys
!{sys.executable} -m pip install langdetect
!{sys.executable} -m pip install nltk

In [0]:
# %load_ext autoreload
# %autoreload 2
import nltk
# Download the stop-word corpus that clean_words reads via stopwords.words('english').
nltk.download('stopwords')

In [0]:
# Download the 'punkt' tokenizer data needed by nltk's word_tokenize, which
# clean_words uses to split descriptions into tokens.
nltk.download('punkt')

In [0]:
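# Login information: uncomment and fill in before running the S3 configuration cell.
# Do not commit real credentials with the notebook.
# username = # AWS access key ID (not a console login name)
# password = # AWS secret access key
# region = "us-east-1" # Change if different from your AWS region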

In [0]:
# Configure the S3A filesystem connector: enable V4 request signing in the AWS SDK,
# then supply credentials and the regional endpoint through the Hadoop configuration.
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf = sc._jsc.hadoopConfiguration()
# S3A reads its credentials from fs.s3a.access.key / fs.s3a.secret.key. The
# "awsAccessKeyId" / "awsSecretAccessKey" suffixes belong to the legacy s3/s3n
# connectors and are not recognised under the fs.s3a prefix.
hadoop_conf.set("fs.s3a.access.key", username)
hadoop_conf.set("fs.s3a.secret.key", password)
hadoop_conf.set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [0]:
# Load the CSV source into a DataFrame, treating the first row as column headers.
# NOTE(review): `s3` is not defined in any visible cell — presumably the S3 URI of
# the source CSV; it must be assigned before this cell runs.
df = sqlContext.read.format("csv").option("header", "true").load(s3)

In [0]:
# Convert every Row to a plain tuple; the cleaning functions below address fields
# by position (index 1 = description, index 10 = genres).
rdd = df.rdd.map(tuple)

In [0]:
from langdetect import detect

def remove_nonenglish(row):
    """Filter predicate: keep only rows whose description is detected as English.

    Args:
        row: Indexable record whose element 1 is the description text.

    Returns:
        True if ``detect`` classifies ``row[1]`` as ``'en'``; False for any other
        language or when detection raises (e.g. the description is missing or
        has no detectable text).
    """
    try:
        return detect(row[1]) == 'en'
    except Exception:
        # Best-effort filter: rows that cannot be classified are dropped instead of
        # failing the whole job. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt/SystemExit propagate so a run can still be cancelled.
        return False

def replace_punc_with_space(desc):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    Despite the name, no punctuation is replaced: the function only separates words
    that were run together, e.g. ``'helloWorld'`` becomes ``'hello World'``. All
    original characters are kept in order.

    Args:
        desc: Description string. May be empty or None.

    Returns:
        The description with the extra spaces inserted; ``''`` when ``desc`` is
        empty or None.
    """
    if not desc:
        # Guard: the final-character append below would raise on an empty string.
        return ''

    pieces = []
    # Walk adjacent character pairs; the last character has no successor and is
    # appended after the loop.
    for current, following in zip(desc, desc[1:]):
        pieces.append(current)
        if current.islower() and following.isupper():
            pieces.append(' ')
    pieces.append(desc[-1])

    # Join once instead of growing a string character by character.
    return ''.join(pieces)

In [0]:
import string

def clean_words(row):
    """Return a copy of ``row`` whose description is replaced by a list of cleaned word stems.

    Processing of the description (element 1), in order: split run-together words
    at lowercase→uppercase joins, lowercase, tokenize, strip punctuation, keep only
    alphabetic tokens, remove English stop words, Porter-stem what remains.

    Args:
        row: Tuple-like record whose element 1 is the description string.

    Returns:
        A tuple equal to ``row`` except that element 1 is the list of stemmed tokens.
    """
    # The nltk imports stay local to the function, as in the original cell.
    # NOTE(review): presumably so they resolve on the Spark executors — confirm.
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import word_tokenize

    # Some words in descriptions are run together without a space; separate them
    # before lowercasing removes the case boundary used to find them.
    desc = replace_punc_with_space(row[1])
    desc = desc.lower()  # make all lowercase for easy comparing

    # split into words
    tokens = word_tokenize(desc)

    # remove punctuation from each token, then drop anything that is not alphabetic
    punc = str.maketrans('', '', string.punctuation)
    no_punc = (token.translate(punc) for token in tokens)
    words_alpha = [token for token in no_punc if token.isalpha()]

    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words_alpha if w not in stop_words]

    # stem the words
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]

    cleaned = list(row)
    # Store the stems: previously the stemmed list was computed and then discarded,
    # so the unstemmed words were stored instead.
    cleaned[1] = stemmed
    return tuple(cleaned)
    

In [0]:
def genre_to_array(row):
    """Return a copy of ``row`` with the genres field replaced by a binary label.

    Element 10 is a '|'-separated genre string (or None). The replacement is:
      * 0.0  if one of the genres is exactly "Nonfiction" (checked first),
      * 1.0  otherwise, if one of the genres is exactly "Fiction",
      * None if neither tag is present or the field is None.

    All other elements are carried over unchanged; the result is always a tuple.
    """
    raw_genres = row[10]
    tags = raw_genres.split('|') if raw_genres is not None else []

    if "Nonfiction" in tags:
        label = 0.0
    elif "Fiction" in tags:
        label = 1.0
    else:
        label = None

    # Rebuild the record around the new label at position 10.
    return tuple(row[:10]) + (label,) + tuple(row[11:])

In [0]:
def remove_null_genre(row):
    """Filter predicate: True when the genre field (element 10) is not None."""
    return row[10] is not None

In [0]:
# Preprocessing pipeline: keep English descriptions, clean their text, then turn
# the genres field into a label.
rdd_filtered = (
    rdd
    .filter(remove_nonenglish)
    .map(clean_words)
    .map(genre_to_array)
)

In [0]:
# Drop the rows for which genre_to_array produced no label (element 10 is None).
rdd_filtered = rdd_filtered.filter(remove_null_genre)

In [0]:
# replace with toDF with the smaller source data
# Work on a small sample: take the first 100 filtered rows to the driver and
# redistribute them as a new RDD.
# TODO (original note, presumably): once a smaller source dataset is used, build the
# DataFrame from rdd_filtered directly instead of sampling — confirm intent.
rdd_sample = sc.parallelize(rdd_filtered.take(100))

In [0]:
# Sanity check: number of rows in the sample (at most 100, since it comes from take(100)).
rdd_sample.count()

In [0]:
# Name the twelve positional tuple fields, then discard the columns that are not
# used for modelling.
booksdf = (
    rdd_sample
    .toDF(['author', 'description', 'edition', 'format',
           'isbn13', 'pages', 'rating', 'ratingCount',
           'review_count', 'title', 'genres', 'image_url'])
    .drop("edition")
    .drop("format")
    .drop("pages")
    .drop("isbn13")
    .drop("review_count")
    .drop("image_url")
)

# "genre" is a copy of the "genres" label column; rows missing either are removed.
booksdf = booksdf.withColumn("genre", booksdf["genres"])
booksdf = booksdf.dropna(subset=("genres", 'genre'))
booksdf.printSchema()

In [0]:
# Hash each description's token list into a fixed-length term-frequency vector
# stored in "rawFeatures"; numFeatures=32 is the vector length.
hashingTF = HashingTF(inputCol="description", outputCol="rawFeatures", numFeatures=32)
featurizedData = hashingTF.transform(booksdf)

In [0]:
# Inspect the first 10 token lists alongside their hashed term-frequency vectors.
featurizedData.select('description', 'rawFeatures').show(10)


In [0]:
# Fit inverse-document-frequency weights on the term-frequency vectors, then apply
# them to produce the "features" column used by the classifier.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [0]:
# Inspect the first 10 rows: title, label ("genres") and TF-IDF feature vector.
rescaledData.select("title", "genres", "features").show(10)

In [0]:
# Split into roughly 70% training / 30% test rows. The fixed seed makes the split —
# and therefore the reported accuracy — repeatable for the same input data;
# without it every run evaluates on a different random subset.
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Accuracy evaluator: compares the "prediction" column against the "genre" label
# column (a copy of "genres" added when booksdf is built).
evaluator = MulticlassClassificationEvaluator(labelCol="genre", predictionCol="prediction", metricName="accuracy")

In [0]:
dtPredictions = dtModel.transform(testData)
dtPredictions.select("prediction", "genres", "features").show(10)

In [0]:
dtAccuracy = evaluator.evaluate(dtPredictions)
print(dtAccuracy)

In [0]:
dt = DecisionTreeClassifier(labelCol="genres", featuresCol="features", cacheNodeIds = True, checkpointInterval = 10, impurity = 'entropy')
dtModel = dt.fit(trainingData)