# Setup

First obtain dependencies

In [1]:
# Pyspark SQL
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf

In [2]:
# pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, HashingTF, IDF, RegexTokenizer, StopWordsRemover, StringIndexer, IndexToString, Bucketizer, QuantileDiscretizer
from pyspark.ml.linalg import Vectors

In [3]:
# %load_ext autoreload
# %autoreload 2
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.stem.porter import PorterStemmer

In [5]:
# Install a pip package (langdetect) in the current Jupyter kernel
import sys
!{sys.executable} -m pip install langdetect

[33mYou are using pip version 8.1.2, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
%load_ext autoreload
%autoreload 2

Fill in the following data

In [7]:
# Login information
# username = # AWS Username
# password = # AWS Password
# region = "us-east-1" # Change if different from your AWS region


# Dataset location
# s3 = #s3a address



Connect to the AWS resources

In [8]:
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", username)
sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", password)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + region + ".amazonaws.com")

In [9]:
df = sqlContext.read.format("csv").option("header", "true").load(s3)

In [10]:
#test, df = df.randomSplit([0.8, 0.2])

# Project

In [11]:
rdd = df.rdd.map(tuple)
# small = sc.parallelize(rdd.take(10))

## Preprocessing

Removes non-English data from the dataset

In [12]:
from langdetect import detect

def remove_nonenglish(row):
    # Returns True if the tuple's description is written in English, false otherwise    
    
    try:
        lang=detect(row[1])
        if (lang == 'en'): 
            return True
        else: 
            return False
    except:
        return False

def replace_punc_with_space(desc):
    # Returns an updated description string with punctuation directly between two letters replaced with a space
    
    new_desc=''
    
    for i in range(len(desc)-1):
        new_desc+=desc[i]
        if desc[i].islower() and desc[i+1].isupper():
            new_desc+=' '
    
    new_desc+=desc[-1]
    
    return new_desc 

Genres converted to array

In [13]:
def genre_to_array(row):
    genres = row[10]
    glist = []
    if(genres is not None): glist = genres.split('|')
    if "Nonfiction" in glist: glist = 0.0
    elif "Fiction" in glist: glist = 1.0
    else: glist = None
    
    lst = list(row)
    lst[10] = glist
    tup = tuple(lst)

    return tup

In [14]:
import string

def clean_words(row):
    # Returns tuple with description cleaned 
    # Removes punctuation, tokenizes words, stems them for comparision, filters out stop words
    
    desc = row[1] 
    
    desc = replace_punc_with_space(desc) #Some words in descriptions are not separated by a space, but with punctuation
    desc = desc.lower() #make all lowercase for easy comparing
    
    # split into words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(desc)
    
    # remove punctuation from each word
    punc = str.maketrans('', '', string.punctuation)
    no_punc = [word.translate(punc) for word in words]
    
    # remove remaining tokens that are not alphabetic
    words_alpha = [word for word in no_punc if word.isalpha()]
    
    
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words_alpha if not w in stop_words]    
    
    # stem the words
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    
    lst = list(row)
    lst[1] = words
    tup = tuple(lst)

    return tup

In [15]:
def remove_null_genre(row):
    if row[10] is None: return False
    else: return True

Applies the above processes to rdd

In [16]:
rdd = rdd.filter(remove_nonenglish).map(clean_words).map(genre_to_array)
rdd = rdd.filter(remove_null_genre)

In [17]:
print(rdd.take(1))

[('Kristin Hannah', ['alaska', 'unforgiving', 'untamedfor', 'family', 'crisis', 'ultimate', 'test', 'survivalernt', 'allbright', 'former', 'pow', 'comes', 'home', 'vietnam', 'war', 'changed', 'volatile', 'man', 'loses', 'yet', 'another', 'job', 'makes', 'impulsive', 'decision', 'move', 'family', 'north', 'alaska', 'live', 'grid', 'last', 'true', 'frontierthirteenyearold', 'leni', 'girl', 'coming', 'age', 'tumultuous', 'time', 'caught', 'riptide', 'passionate', 'stormy', 'relationship', 'dares', 'hope', 'new', 'land', 'lead', 'better', 'future', 'family', 'desperate', 'place', 'belong', 'mother', 'cora', 'anything', 'go', 'anywhere', 'man', 'loves', 'even', 'means', 'following', 'unknown', 'first', 'alaska', 'seems', 'answer', 'prayers', 'wild', 'remote', 'corner', 'state', 'find', 'fiercely', 'independent', 'community', 'strong', 'men', 'even', 'stronger', 'women', 'long', 'sunlit', 'days', 'generosity', 'locals', 'make', 'lack', 'preparation', 'dwindling', 'resourcesbut', 'winter', 'a

Convert data to dataframe with header names and cast datatypes

In [18]:
booksdf = rdd.toDF(['author', 'description', 'edition', 'format', 
                    'isbn13', 'pages', 'rating', 'ratingCount', 
                    'review_count', 'title', 'genres', 'image_url']) \
    .drop("edition") \
    .drop("format") \
    .drop("pages") \
    .drop("isbn13") \
    .drop("review_count") \
    .drop("image_url")

booksdf = booksdf.withColumn("rating", booksdf["rating"].cast("decimal(3,2)")) \
                .withColumn("ratingCount", booksdf["ratingCount"].cast("long"))
booksdf.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- ratingCount: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: double (nullable = true)



Drop all elements with a null description or rating

In [19]:
booksdf = booksdf.dropna(subset=('description', 'rating', 'genres'))
booksdf[("title", "rating", "genres")].show(10)

+--------------------+------+------+
|               title|rating|genres|
+--------------------+------+------+
|     The Great Alone|  4.33|   1.0|
|               Circe|  4.34|   1.0|
|The Woman in the ...|  3.97|   1.0|
|Children of Blood...|  4.25|   1.0|
|An American Marriage|  4.01|   1.0|
| The Wife Between Us|  3.85|   1.0|
|    The Immortalists|  3.73|   1.0|
|        The Outsider|  4.07|   1.0|
|   The Kiss Quotient|  4.00|   1.0|
|Where the Crawdad...|  4.52|   1.0|
+--------------------+------+------+
only showing top 10 rows



In [20]:
booksdf.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- ratingCount: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: double (nullable = true)



In [21]:
#tokenizer = RegexTokenizer(inputCol="description", outputCol="descToken", pattern="\\W")
#booksdf = tokenizer.transform(booksdf)

#swremover = StopWordsRemover(inputCol="descToken", outputCol="desc")
#booksdf = swremover.transform(booksdf)

booksdf[("description","rating","ratingCount")].show(10)

+--------------------+------+-----------+
|         description|rating|ratingCount|
+--------------------+------+-----------+
|[alaska, unforgiv...|  4.33|     146505|
|[house, helios, g...|  4.34|      61357|
|[anna, fox, lives...|  3.97|     175678|
|[killed, motherth...|  4.25|      56789|
|[newlyweds, celes...|  4.01|      91515|
|[read, book, make...|  3.85|      89835|
|[knew, date, deat...|  3.73|      61815|
|[unspeakable, cri...|  4.07|      65894|
|[heartwarming, re...|  4.00|      33760|
|[years, rumors, h...|  4.52|      40414|
+--------------------+------+-----------+
only showing top 10 rows



### Bucket

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.QuantileDiscretizer

One method is with **QuantileDiscretizer**, which buckets by frequency (equal number in each bucket).

Adjust **numBuckets** to change the bucket sizes for the ratings and get different outcomes. The lower number of buckets, the more book ratings per bucket.

Sample code:

```
discretizer = QuantileDiscretizer(numBuckets=16, inputCol='rating', outputCol='label')
booksdf = discretizer.fit(booksdf).transform(booksdf)
```

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer

Alternative is to use **Bucketizer**, which buckets by length (or range). Adjust the numbers in the range to fix the outcome.

Sample code:

```
splitWhole = [0,1,2,3,4,5]
splitQuarter = [0,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,3,3.25,3.5,3.75,4,4.25,4.5,4.75,5]
splitHalf = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5]
bucketizer = Bucketizer(splits=split, inputCol='rating', outputCol='label')
booksdf = bucketizer.transform(booksdf)
```

In [22]:
split = [0,1,2,3,4,5]
bucketizer = Bucketizer(splits=split, inputCol='rating', outputCol='label')
booksdf = bucketizer.transform(booksdf)

In [23]:
booksdf.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- ratingCount: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: double (nullable = true)
 |-- label: double (nullable = true)



#### Get weights for ratings

Convert the number of ratings into weight for the ratings

In [26]:
assembler = VectorAssembler(inputCols=["ratingCount"], outputCol="countVec")
scaler = MinMaxScaler(inputCol='countVec', outputCol='weightVec')

In [27]:
pipeline = Pipeline(stages=[assembler, scaler])
booksdf = pipeline.fit(booksdf).transform(booksdf)

In [28]:
def ith_(v, i):
    try:
        return float(v[i])
    except ValueError:
        return None

ith = udf(ith_, DoubleType())
booksdf = booksdf.withColumn("weight", ith("weightVec", lit(0)))

In [29]:
booksdf[("title", "rating", "label", "weight")].show(10)

+--------------------+------+-----+-------------------+
|               title|rating|label|             weight|
+--------------------+------+-----+-------------------+
|     The Great Alone|  4.33|  4.0|  0.833929161021484|
|               Circe|  4.34|  4.0|0.34921384900891467|
|The Woman in the ...|  3.97|  3.0|                1.0|
|Children of Blood...|  4.25|  4.0|0.32320995525599716|
|An American Marriage|  4.01|  4.0| 0.5208919198934341|
| The Wife Between Us|  3.85|  3.0| 0.5113283162364942|
|    The Immortalists|  3.73|  3.0| 0.3518210695296756|
|        The Outsider|  4.07|  4.0|0.37504127150387667|
|   The Kiss Quotient|  4.00|  4.0| 0.1921145810800041|
|Where the Crawdad...|  4.52|  4.0|0.22999328270695524|
+--------------------+------+-----+-------------------+
only showing top 10 rows



#### Indexer

In [16]:
indexer = StringIndexer(inputCol="genre", outputCol="genreLabel")

In [17]:
booksdf = indexer.fit(booksdf).transform(booksdf)

In [18]:
booksdf = booksdf.dropna(subset=('genre', 'genreLabel'))
booksdf[("title", "genres", "genreLabel")].show(10)

+--------------------+--------------------+----------+
|               title|              genres|genreLabel|
+--------------------+--------------------+----------+
|     The Great Alone|[Fiction, Histori...|       1.0|
|               Circe|[Fantasy, Fiction...|       2.0|
|    The Cruel Prince|[Fantasy, Young A...|       2.0|
|The Woman in the ...|[Mystery, Thrille...|       3.0|
|Children of Blood...|[Fantasy, Young A...|       2.0|
|An American Marriage|[Fiction, Contemp...|       1.0|
| The Wife Between Us|[Thriller, Fictio...|       9.0|
|         Thunderhead|[Young Adult, Sci...|       5.0|
|A Court of Frost ...|[Fantasy, New Adu...|       2.0|
|    The Immortalists|[Fiction, Fantasy...|       1.0|
+--------------------+--------------------+----------+
only showing top 10 rows



### TF/IDF

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF

**numFeatures should be adjusted to get better outcomes**

> Since a simple modulo is used to transform the hash function to a column index, it is advisable to **use a power of two** as the numFeatures parameter; otherwise the features will not be mapped evenly to the columns.

In [24]:
hashingTF = HashingTF(inputCol="description", outputCol="rawFeatures", numFeatures=32)
featurizedData = hashingTF.transform(booksdf)

In [25]:
featurizedData.select('description', 'rawFeatures').show(10)

+--------------------+--------------------+
|         description|         rawFeatures|
+--------------------+--------------------+
|[alaska, unforgiv...|(32,[0,1,2,3,4,5,...|
|[house, helios, g...|(32,[0,1,2,3,4,5,...|
|[anna, fox, lives...|(32,[1,2,3,5,6,7,...|
|[killed, motherth...|(32,[0,1,2,4,5,6,...|
|[newlyweds, celes...|(32,[1,2,4,5,6,7,...|
|[read, book, make...|(32,[0,1,2,3,4,5,...|
|[knew, date, deat...|(32,[1,2,3,4,5,6,...|
|[unspeakable, cri...|(32,[0,1,2,3,5,6,...|
|[heartwarming, re...|(32,[0,1,2,3,4,5,...|
|[years, rumors, h...|(32,[1,2,3,4,5,8,...|
+--------------------+--------------------+
only showing top 10 rows



In [26]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [27]:
rescaledData.select("rating", "label", "description", "features").show(10)

+------+-----+--------------------+--------------------+
|rating|label|         description|            features|
+------+-----+--------------------+--------------------+
|  4.33|  4.0|[alaska, unforgiv...|(32,[0,1,2,3,4,5,...|
|  4.34|  4.0|[house, helios, g...|(32,[0,1,2,3,4,5,...|
|  3.97|  3.0|[anna, fox, lives...|(32,[1,2,3,5,6,7,...|
|  4.25|  4.0|[killed, motherth...|(32,[0,1,2,4,5,6,...|
|  4.01|  4.0|[newlyweds, celes...|(32,[1,2,4,5,6,7,...|
|  3.85|  3.0|[read, book, make...|(32,[0,1,2,3,4,5,...|
|  3.73|  3.0|[knew, date, deat...|(32,[1,2,3,4,5,6,...|
|  4.07|  4.0|[unspeakable, cri...|(32,[0,1,2,3,5,6,...|
|  4.00|  4.0|[heartwarming, re...|(32,[0,1,2,3,4,5,...|
|  4.52|  4.0|[years, rumors, h...|(32,[1,2,3,4,5,8,...|
+------+-----+--------------------+--------------------+
only showing top 10 rows



### Training and Testing datsets

In [28]:
(trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])

In [29]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

### Descision Tree Classification

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier

Modifying parameters

In [30]:
trainingData.select('title', 'description', 'features', 'rating', 'label').show(10)

+--------------------+--------------------+--------------------+------+-----+
|               title|         description|            features|rating|label|
+--------------------+--------------------+--------------------+------+-----+
|      Dear Mrs. Bird|[listening, lengt...|(32,[0,2,3,4,5,6,...|  3.81|  3.0|
|Ask Me About My U...|[woman, experienc...|(32,[0,1,2,3,5,6,...|  3.95|  3.0|
|The Enchanted Gar...|[sixth, birthday,...|(32,[0,1,3,4,5,6,...|  4.34|  4.0|
|         Dogs of War|[name, rex, good,...|(32,[0,2,3,4,5,6,...|  4.33|  4.0|
|     Sky in the Deep|[part, wonder, wo...|(32,[0,1,2,4,5,6,...|  4.05|  4.0|
|  The Perfect Mother|[vanity, fair, ca...|(32,[0,1,2,3,4,5,...|  3.55|  3.0|
|Daughters of the ...|[celebrated, regi...|(32,[0,1,2,3,4,5,...|  4.19|  4.0|
|            The Wife|[scandal, secretw...|(32,[0,1,2,3,4,5,...|  3.93|  3.0|
|The Sparsholt Affair|[internationally,...|(32,[0,1,2,3,4,5,...|  3.59|  3.0|
|      The Book Ninja|[sometimes, love,...|(32,[0,1,2,3,5,6,...|

In [31]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", cacheNodeIds = True, checkpointInterval = 10, impurity = 'gini')
dtModel = dt.fit(trainingData)

In [32]:
dtPredictions = dtModel.transform(testData)
dtPredictions.select("prediction", "rating","label","description", "features").show(10)

+----------+------+-----+--------------------+--------------------+
|prediction|rating|label|         description|            features|
+----------+------+-----+--------------------+--------------------+
|       3.0|  4.51|  4.0|[author, new, yor...|(32,[0,3,5,7,8,9,...|
|       3.0|  3.97|  3.0|[anna, fox, lives...|(32,[1,2,3,5,6,7,...|
|       3.0|  3.99|  3.0|[flavia, enjoying...|(32,[1,2,3,5,6,7,...|
|       4.0|  3.98|  3.0|[bestselling, aut...|(32,[0,1,2,3,4,5,...|
|       3.0|  3.77|  3.0|[stunning, new, p...|(32,[0,1,2,3,5,6,...|
|       4.0|  4.13|  4.0|[acclaimed, autho...|(32,[0,1,2,3,4,5,...|
|       4.0|  3.90|  3.0|[new, york, times...|(32,[0,1,2,3,4,5,...|
|       4.0|  3.99|  3.0|[blazingly, origi...|(32,[0,1,2,4,5,6,...|
|       4.0|  3.91|  3.0|[smart, funny, ne...|(32,[0,1,2,3,4,5,...|
|       3.0|  3.40|  3.0|[betts, meets, ai...|(32,[1,2,3,4,5,7,...|
+----------+------+-----+--------------------+--------------------+
only showing top 10 rows



In [33]:
dtAccuracy = evaluator.evaluate(dtPredictions)
print(dtAccuracy)

0.5490196078431373


In [35]:
dtPredictions.select("prediction", "label", "features").show(10)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       3.0|  4.0|(32,[0,3,5,7,8,9,...|
|       3.0|  3.0|(32,[1,2,3,5,6,7,...|
|       3.0|  3.0|(32,[1,2,3,5,6,7,...|
|       4.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[0,1,2,3,5,6,...|
|       4.0|  4.0|(32,[0,1,2,3,4,5,...|
|       4.0|  3.0|(32,[0,1,2,3,4,5,...|
|       4.0|  3.0|(32,[0,1,2,4,5,6,...|
|       4.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[1,2,3,4,5,7,...|
+----------+-----+--------------------+
only showing top 10 rows



### Logistic Regression with Cross Validation

In [30]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label", \
                        featuresCol="features", \
                        maxIter=20, \
                        regParam=0.3, \
                        elasticNetParam=0)
lrModel = lr.fit(trainingData)

In [31]:
lrPredictions = lrModel.transform(testData)
lrPredictions.select("prediction", "label", "features").show(10)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       3.0|  4.0|(32,[0,3,5,7,8,9,...|
|       3.0|  3.0|(32,[1,2,3,5,6,7,...|
|       4.0|  3.0|(32,[0,1,2,3,5,6,...|
|       4.0|  4.0|(32,[0,2,3,4,5,6,...|
|       3.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[0,1,2,3,4,5,...|
|       4.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[0,1,2,3,4,5,...|
|       3.0|  3.0|(32,[0,1,2,3,4,5,...|
+----------+-----+--------------------+
only showing top 10 rows



In [None]:
lrAccuracy = evaluator.evaluate(lrPredictions)
print(lrAccuracy)

0.5714285714285714


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(trainingData)

In [None]:
cvPredictions = cvModel.transform(testData)
cvPredictions.select("prediction", "label", "features").show(10)

In [None]:
cvAccuracy = evaluator.evaluate(cvPredictions)
print(cvAccuracy)

### Random Forest Classification

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier

In [None]:
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees=100, \
                            maxDepth = 4, \
                            maxBins = 32)
rfModel = rf.fit(trainingData)

In [None]:
rfPredictions = rfModel.transform(testData)
rfPredictions.select("prediction", "label", "features").show(10)

In [None]:
rfAccuracy = evaluator.evaluate(rfPredictions)
print(rfAccuracy)

### Naive Bayes Multinomial Classification

https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.classification.NaiveBayes

In [36]:
trainingData.printSchema()

root
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- rating: decimal(3,2) (nullable = true)
 |-- ratingCount: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)
 |-- countVec: vector (nullable = true)
 |-- weightVec: vector (nullable = true)
 |-- weight: double (nullable = true)
 |-- descToken: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)



In [33]:
nb = NaiveBayes(labelCol="label", featuresCol="features", modelType="multinomial")
nbModel = nb.fit(trainingData)

In [34]:
nbPredictions = nbModel.transform(testData)
nbPredictions.select("prediction", "label", "features").show(10)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       2.0|  4.0|(32,[0,1,2,3,4,5,...|
|       2.0|  3.0|(32,[0,1,2,3,4,5,...|
|       2.0|  3.0|(32,[0,1,2,3,4,5,...|
|       2.0|  4.0|(32,[0,1,2,3,4,5,...|
|       2.0|  3.0|(32,[0,2,3,4,5,6,...|
|       2.0|  4.0|(32,[0,1,2,3,5,6,...|
|       2.0|  3.0|(32,[0,1,2,3,5,6,...|
|       2.0|  3.0|(32,[0,1,2,3,4,5,...|
|       2.0|  4.0|(32,[0,1,2,3,4,5,...|
|       2.0|  3.0|(32,[0,1,2,4,5,6,...|
+----------+-----+--------------------+
only showing top 10 rows



In [35]:
nbAccuracy = evaluator.evaluate(nbPredictions)
print(nbAccuracy)

0.0


In [None]:
nb = NaiveBayes(labelCol="label", featuresCol="features", modelType="multinomial", weightCol="weight")
nbModel = nb.fit(trainingData)

In [None]:
nbPredictions = nbModel.transform(testData)
nbPredictions.select("prediction", "label", "features").show(10)

In [None]:
nbAccuracy = evaluator.evaluate(nbPredictions)
print(nbAccuracy)