In [0]:
# Pyspark SQL
from pyspark.sql import SQLContext
# Wrap the SparkContext `sc` in a SQLContext, used further down to read the CSV.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably it is
# injected by the PySpark kernel/shell -- confirm before running on a fresh kernel.
sqlContext = SQLContext(sc)
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf

In [0]:
# pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, HashingTF, IDF, RegexTokenizer, StopWordsRemover, Bucketizer, QuantileDiscretizer
from pyspark.ml.linalg import Vectors

In [0]:
# Install the third-party packages used below (langdetect and nltk) into the
# interpreter backing the current Jupyter kernel: `sys.executable` is the path of
# the running Python, so pip targets that environment rather than whichever
# `pip` happens to be first on PATH.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
import sys
!{sys.executable} -m pip install langdetect
!{sys.executable} -m pip install nltk

[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 8.1.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [0]:
# %load_ext autoreload
# %autoreload 2
import nltk
# Fetch the NLTK stop-word corpus; clean_words() reads it through
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Fetch the 'punkt' tokenizer data -- presumably what word_tokenize() in
# clean_words() relies on; confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:


# Configure the Hadoop S3A filesystem used by Spark: region endpoint + credentials.
# NOTE(review): `username` and `password` are not defined in any cell shown here;
# they must already exist in the kernel. Keep the real values out of the notebook
# (environment variables / getpass) so they are never saved with it.
region = "us-east-2" 
# JVM system property asking the AWS SDK to use Signature Version 4.
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
# NOTE(review): current Hadoop S3A documentation names these options
# fs.s3a.access.key / fs.s3a.secret.key -- confirm the property names below are
# honoured by the Hadoop version this cluster runs.
sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", username)
sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", password)
# Region-specific endpoint; evaluates to "s3.us-east-2.amazonaws.com" here.
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")



In [0]:
# Read the CSV at `s3` into a DataFrame, using the first line as column names.
# NOTE(review): `s3` (the input path/URI) is not defined in any cell shown here.
# No schema / inferSchema option is set -- presumably every column is loaded as a
# string; confirm if numeric columns are needed later.
df = sqlContext.read.format("csv").option("header", "true").load(s3)

In [0]:
# Convert every DataFrame Row into a plain tuple so the functions below can index
# fields by position (they use row[1] as the description and row[10] as the genres).
rdd = df.rdd.map(tuple)

In [0]:
from langdetect import detect

def remove_nonenglish(row):
    """Filter predicate: keep only rows whose description is detected as English.

    Despite its name the function removes nothing itself; it is meant to be
    passed to ``RDD.filter``.

    Args:
        row: Indexable record whose element at index 1 is the description text.

    Returns:
        True if ``detect`` reports ``'en'`` for the description. False for any
        other language, and False when detection fails (for example on empty or
        missing text).
    """
    try:
        return detect(row[1]) == 'en'
    except Exception:
        # Best-effort filter: a row whose language cannot be determined is dropped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently turned into False.
        return False

def replace_punc_with_space(desc):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    This separates words that were glued together without whitespace, e.g.
    ``"the endThe start"`` becomes ``"the end The start"``. Note that, despite
    the function's name, punctuation characters are neither inspected nor
    replaced here; only lowercase-to-uppercase boundaries are split.

    Args:
        desc: The description string. An empty string or None is accepted.

    Returns:
        The description with a single space added at every lowercase/uppercase
        boundary. Empty or None input yields an empty string.
    """
    # Guard the empty case: indexing desc[-1] below would raise IndexError.
    if not desc:
        return ''

    # Collect pieces and join once instead of repeated string concatenation.
    pieces = []
    for current, following in zip(desc, desc[1:]):
        pieces.append(current)
        if current.islower() and following.isupper():
            pieces.append(' ')

    # zip() stops one short of the end, so the final character is added here.
    pieces.append(desc[-1])

    return ''.join(pieces)

In [0]:
import string

def clean_words(row, stem=False):
    """Return a copy of ``row`` whose description (index 1) is a list of cleaned tokens.

    Processing steps, in order:
      1. split glued words at lowercase/uppercase boundaries
         (``replace_punc_with_space``) and lowercase the text;
      2. tokenize with NLTK's ``word_tokenize``;
      3. strip ``string.punctuation`` characters from every token;
      4. drop tokens that are not purely alphabetic;
      5. drop English stop words;
      6. optionally reduce each remaining token to its Porter stem.

    Args:
        row: Indexable record whose element at index 1 is the description string.
        stem: When True, the returned tokens are Porter-stemmed. Defaults to
            False, which matches what this function has always returned: the
            previous version computed the stems for every row and then
            discarded them, so that wasted work is now skipped unless asked for.

    Returns:
        A tuple equal to ``row`` except that index 1 holds the token list.
    """
    # NLTK imports are kept function-local, as in the original cell.
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    # Some words in descriptions are not separated by a space; split them first,
    # then lowercase everything for easy comparison.
    desc = replace_punc_with_space(row[1])
    desc = desc.lower()

    # split into words
    tokens = word_tokenize(desc)

    # remove punctuation from each word
    punc = str.maketrans('', '', string.punctuation)
    no_punc = [token.translate(punc) for token in tokens]

    # remove remaining tokens that are not alphabetic (this also drops tokens
    # that became empty once their punctuation was stripped)
    words_alpha = [token for token in no_punc if token.isalpha()]

    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words_alpha if w not in stop_words]

    # stem the words, only when requested
    if stem:
        from nltk.stem.porter import PorterStemmer
        porter = PorterStemmer()
        words = [porter.stem(word) for word in words]

    # Rebuild the record with the cleaned token list in place of the raw text.
    lst = list(row)
    lst[1] = words
    return tuple(lst)
    

In [0]:
def genre_to_array(row):
    """Return a copy of ``row`` whose genres field (index 10) is a numeric class label.

    The genres field is a ``'|'``-separated string of tags. It is replaced by:
      * ``0.0`` when the tag "Nonfiction" is present (checked first),
      * ``1.0`` when the tag "Fiction" is present,
      * ``None`` when neither tag is present or the field itself is None.

    Despite the function's name, the result at index 10 is a single label, not an array.
    """
    raw_genres = row[10]
    tags = raw_genres.split('|') if raw_genres is not None else []

    if "Nonfiction" in tags:
        label = 0.0
    elif "Fiction" in tags:
        label = 1.0
    else:
        label = None

    # Same fields as the input, with position 10 swapped for the label.
    return tuple(label if position == 10 else value
                 for position, value in enumerate(row))

In [0]:
def remove_null_genre(row):
    """Filter predicate: True when the genre label at index 10 is set, False when it is None."""
    return row[10] is not None

In [0]:
# Keep rows detected as English, replace the description (index 1) with its
# cleaned token list, then replace the genres string (index 10) with a
# 0.0 / 1.0 / None label.
rdd_filtered = rdd.filter(remove_nonenglish).map(clean_words).map(genre_to_array)

In [0]:
# Drop rows whose label is None, i.e. rows tagged neither "Nonfiction" nor "Fiction".
# Note: this rebinds rdd_filtered, so re-running only this cell filters the
# already-filtered RDD again (harmless here, since the predicate is idempotent).
rdd_filtered = rdd_filtered.filter(remove_null_genre)

In [0]:
# Bring the first 100 surviving rows to the driver and redistribute them as a
# small RDD. take(100) returns the first rows it encounters, not a random sample.
rdd_sample = sc.parallelize(rdd_filtered.take(100))

In [0]:
# Sanity check: number of rows in the sample (shown as the cell output).
rdd_sample.count()

100

In [0]:
def dataset():
    """Return the notebook-level ``rdd_sample`` RDD (resolved as a global at call time)."""
    return rdd_sample

+--------------------+--------------------+
|         description|         rawFeatures|
+--------------------+--------------------+
|[alaska, unforgiv...|(32,[0,1,2,3,4,5,...|
|[house, helios, g...|(32,[0,1,2,3,4,5,...|
|[anna, fox, lives...|(32,[1,2,3,5,6,7,...|
|[killed, motherth...|(32,[0,1,2,4,5,6,...|
|[newlyweds, celes...|(32,[1,2,4,5,6,7,...|
|[read, book, make...|(32,[0,1,2,3,4,5,...|
|[knew, date, deat...|(32,[1,2,3,4,5,6,...|
|[unspeakable, cri...|(32,[0,1,2,3,5,6,...|
|[heartwarming, re...|(32,[0,1,2,3,4,5,...|
|[years, rumors, h...|(32,[1,2,3,4,5,8,...|
+--------------------+--------------------+
only showing top 10 rows



+--------------------+------+--------------------+
|               title|genres|            features|
+--------------------+------+--------------------+
|     The Great Alone|   1.0|(32,[0,1,2,3,4,5,...|
|               Circe|   1.0|(32,[0,1,2,3,4,5,...|
|The Woman in the ...|   1.0|(32,[1,2,3,5,6,7,...|
|Children of Blood...|   1.0|(32,[0,1,2,4,5,6,...|
|An American Marriage|   1.0|(32,[1,2,4,5,6,7,...|
| The Wife Between Us|   1.0|(32,[0,1,2,3,4,5,...|
|    The Immortalists|   1.0|(32,[1,2,3,4,5,6,...|
|        The Outsider|   1.0|(32,[0,1,2,3,5,6,...|
|   The Kiss Quotient|   1.0|(32,[0,1,2,3,4,5,...|
|Where the Crawdad...|   1.0|(32,[1,2,3,4,5,8,...|
+--------------------+------+--------------------+
only showing top 10 rows



+----------+------+--------------------+
|prediction|genres|            features|
+----------+------+--------------------+
|       1.0|   1.0|(32,[1,2,3,5,6,7,...|
|       0.0|   1.0|(32,[0,1,2,4,5,6,...|
|       1.0|   1.0|(32,[0,1,2,3,4,5,...|
|       1.0|   1.0|(32,[1,2,3,5,6,7,...|
|       1.0|   1.0|(32,[0,1,2,3,5,6,...|
|       1.0|   1.0|(32,[0,1,2,3,4,5,...|
|       1.0|   1.0|(32,[0,2,3,4,5,6,...|
|       1.0|   1.0|(32,[0,1,2,3,4,5,...|
|       1.0|   1.0|(32,[0,2,4,5,6,7,...|
|       1.0|   1.0|(32,[0,1,3,5,6,7,...|
+----------+------+--------------------+
only showing top 10 rows



0.7317073170731707
