In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("news_classification").getOrCreate()
spark

In [2]:
from pyspark.ml.feature import * #CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import * #col, udf,regexp_replace,isnull
from pyspark.sql.types import * #StringType,IntegerType
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# For pipeline development
from pyspark.ml import Pipeline 

In [3]:
df = spark.read.csv("./data/uci-news-aggregator.csv", inferSchema=True, header=True)
df.limit(2).toPandas()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207


In [4]:
df.count()

422937

In [5]:
news_category = df.select("TITLE","CATEGORY")
news_category.show(2)

+--------------------+--------+
|               TITLE|CATEGORY|
+--------------------+--------+
|Fed official says...|       b|
|Fed's Charles Plo...|       b|
+--------------------+--------+
only showing top 2 rows



In [8]:
from pyspark.sql.functions import *

def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows > 0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_counts.append(temp)
    return(null_columns_counts)

null_columns_calc_list = null_value_calc(news_category)
spark.createDataFrame(null_columns_calc_list, ['Column_Name', 'Null_Values_Count','Null_Value_Percent']).show()

ValueError: can not infer schema from empty dataset

In [9]:
news_category.groupBy("Category").count().orderBy(col("count").desc()).show(truncate=False)

+--------------------+------+
|Category            |count |
+--------------------+------+
|e                   |152127|
|b                   |115935|
|t                   |108237|
|m                   |45616 |
|Us Magazine         |31    |
|Contactmusic.com    |20    |
|GossipCop           |20    |
|Complex.com         |12    |
|CBS News            |12    |
|The Hollywood Gossip|11    |
|HipHopDX            |11    |
|We Got This Covered |10    |
|HeadlinePlanet.com  |10    |
|Gamepur             |8     |
|Consequence of Sound|7     |
|Wetpaint            |7     |
|WorstPreviews.com   |7     |
|TooFab.com          |7     |
|The Escapist        |6     |
|Reality TV World    |5     |
+--------------------+------+
only showing top 20 rows



In [7]:
news_category = news_category.dropna()

In [10]:
news_category.show(10,truncate=False)

+------------------------------------------------------------------------+--------+
|TITLE                                                                   |CATEGORY|
+------------------------------------------------------------------------+--------+
|Fed official says weak data caused by weather, should not slow taper    |b       |
|Fed's Charles Plosser sees high bar for change in pace of tapering      |b       |
|US open: Stocks fall after Fed official hints at accelerated tapering   |b       |
|Fed risks falling 'behind the curve', Charles Plosser says              |b       |
|Fed's Plosser: Nasty Weather Has Curbed Job Growth                      |b       |
|Plosser: Fed May Have to Accelerate Tapering Pace                       |b       |
|Fed's Plosser: Taper pace may be too slow                               |b       |
|Fed's Plosser expects US unemployment to fall to 6.2% by the end of 2014|b       |
|US jobs growth last month hit by weather:Fed President Charles Plosser  |b 

In [15]:
news_cat = news_category.sampleBy("CATEGORY", fractions={'e': 0.1, 'b': 0.1, 't' : 0.1 , 'm' : 0.25}, seed=0)

news_cat.groupBy("Category").count().orderBy(col("count").desc()).show(truncate=False)

+--------+-----+
|Category|count|
+--------+-----+
|e       |15107|
|b       |11584|
|m       |11350|
|t       |10873|
+--------+-----+



In [16]:
news_cat.show(10)

+--------------------+--------+
|               TITLE|CATEGORY|
+--------------------+--------+
|US open: Stocks f...|       b|
|ECB unlikely to e...|       b|
|Noyer Says Strong...|       b|
|Eurozone banks' s...|       b|
|Omega's Cooperman...|       b|
|EBay asks shareho...|       b|
|EBay asks shareho...|       b|
|eBay's John Donah...|       b|
|McDonald's Report...|       b|
|Burn: McDonald's ...|       b|
+--------------------+--------+
only showing top 10 rows



In [17]:
news_cat = news_cat.withColumn("TITLE", regexp_replace(col('TITLE'), '[^A-Za-z ]+', ''))

In [18]:
news_cat = news_cat.withColumn("TITLE", lower(col('TITLE')))

In [19]:
regex_tokenizer = RegexTokenizer(inputCol="TITLE", outputCol="words", pattern="\\W")
raw_words = regex_tokenizer.transform(news_cat)
raw_words.show(2,False)
raw_words.printSchema()

+--------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+
|TITLE                                                               |CATEGORY|words                                                                           |
+--------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+
|us open stocks fall after fed official hints at accelerated tapering|b       |[us, open, stocks, fall, after, fed, official, hints, at, accelerated, tapering]|
|ecb unlikely to end sterilisation of smp purchases  traders         |b       |[ecb, unlikely, to, end, sterilisation, of, smp, purchases, traders]            |
+--------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+
only showing top 2 rows

root
 |--

In [20]:
# from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [15]:
words_df = remover.transform(raw_words)
words_df.show(1,False)

+-------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+---------------------------------------------------------------+
|TITLE                                                              |CATEGORY|words                                                                           |filtered                                                       |
+-------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+---------------------------------------------------------------+
|fed official says weak data caused by weather should not slow taper|b       |[fed, official, says, weak, data, caused, by, weather, should, not, slow, taper]|[fed, official, says, weak, data, caused, weather, slow, taper]|
+-------------------------------------------------------------------+--------+--------------------------

In [16]:
indexer = StringIndexer(inputCol="CATEGORY", outputCol="label")
feature_data = indexer.fit(words_df).transform(words_df)
feature_data.show(5)
feature_data.printSchema()

+--------------------+--------+--------------------+--------------------+-----+
|               TITLE|CATEGORY|               words|            filtered|label|
+--------------------+--------+--------------------+--------------------+-----+
|fed official says...|       b|[fed, official, s...|[fed, official, s...|  1.0|
|feds charles plos...|       b|[feds, charles, p...|[feds, charles, p...|  1.0|
|us open stocks fa...|       b|[us, open, stocks...|[us, open, stocks...|  1.0|
|fed risks falling...|       b|[fed, risks, fall...|[fed, risks, fall...|  1.0|
|feds plosser nast...|       b|[feds, plosser, n...|[feds, plosser, n...|  1.0|
+--------------------+--------+--------------------+--------------------+-----+
only showing top 5 rows

root
 |-- TITLE: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |

In [21]:
regex_tokenizer = RegexTokenizer(inputCol="TITLE", outputCol="words", pattern="\\W")
# raw_words = regex_tokenizer.transform(news_category)

remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(), outputCol="filtered")
# words_df = remover.transform(raw_words)

indexer = StringIndexer(inputCol="CATEGORY", outputCol="label")
# feature_data = indexer.fit(words_df).transform(words_df)

pipeline = Pipeline(stages=[regex_tokenizer,remover,indexer])
data_prep_pl = pipeline.fit(news_cat)


feature_data = data_prep_pl.transform(news_cat)
feature_data.show(1,False)

+--------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+---------------------------------------------------------------------+-----+
|TITLE                                                               |CATEGORY|words                                                                           |filtered                                                             |label|
+--------------------------------------------------------------------+--------+--------------------------------------------------------------------------------+---------------------------------------------------------------------+-----+
|us open stocks fall after fed official hints at accelerated tapering|b       |[us, open, stocks, fall, after, fed, official, hints, at, accelerated, tapering]|[us, open, stocks, fall, fed, official, hints, accelerated, tapering]|1.0  |
+---------------------------------------------------

In [30]:
# Count Vector (count vectorizer and hashingTF are basically the same thing)
# cv = CountVectorizer(inputCol="filtered", outputCol="features")
# model = cv.fit(feature_data)
# countVectorizer_features = model.transform(feature_data)

# Hashing TF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawfeatures", numFeatures=20)
HTFfeaturizedData = hashingTF.transform(feature_data)

# TF-IDF
idf = IDF(inputCol="rawfeatures", outputCol="features")
idfModel = idf.fit(HTFfeaturizedData)
TFIDFfeaturizedData = idfModel.transform(HTFfeaturizedData)
TFIDFfeaturizedData.name = 'TFIDFfeaturizedData'

#rename the HTF features to features to be consistent
HTFfeaturizedData = HTFfeaturizedData.withColumnRenamed("rawfeatures","features")
HTFfeaturizedData.name = 'HTFfeaturizedData' #We will use later for printing

In [23]:
# Word2Vec
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")
model = word2Vec.fit(feature_data)

W2VfeaturizedData = model.transform(feature_data)
# W2VfeaturizedData.show(1,False)

# W2Vec Dataframes typically has negative values so we will correct for that here so that we can use the Naive Bayes classifier
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(W2VfeaturizedData)

# rescale each feature to range [min, max].
scaled_data = scalerModel.transform(W2VfeaturizedData)
scaled_data
W2VfeaturizedData = scaled_data.select('TITLE','CATEGORY','label','scaledFeatures')
W2VfeaturizedData = W2VfeaturizedData.withColumnRenamed('scaledFeatures','features')

W2VfeaturizedData.name = 'W2VfeaturizedData' # We will need this to print later

In [24]:
def ClassTrainEval(classifier,features,classes,train,test):

    def FindMtype(classifier):
        # Intstantiate Model
        M = classifier
        # Learn what it is
        Mtype = type(M).__name__
        
        return Mtype
    
    Mtype = FindMtype(classifier)
    

    def IntanceFitModel(Mtype,classifier,classes,features,train):
        
        if Mtype == "OneVsRest":
            # instantiate the base classifier.
            lr = LogisticRegression()
            # instantiate the One Vs Rest Classifier.
            OVRclassifier = OneVsRest(classifier=lr)
#             fitModel = OVRclassifier.fit(train)
            # Add parameters of your choice here:
            paramGrid = ParamGridBuilder() \
                .addGrid(lr.regParam, [0.1, 0.01]) \
                .build()
            #Cross Validator requires the following parameters:
            crossval = CrossValidator(estimator=OVRclassifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 is best practice
            # Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
        if Mtype == "MultilayerPerceptronClassifier":
            # specify layers for the neural network:
            # input layer of size features, two intermediate of features+1 and same size as features
            # and output of size number of classes
            # Note: crossvalidator cannot be used here
            features_count = len(features[0][0])
            layers = [features_count, features_count+1, features_count, classes]
            MPC_classifier = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
            fitModel = MPC_classifier.fit(train)
            return fitModel
        if Mtype in("LinearSVC","GBTClassifier") and classes != 2: # These classifiers currently only accept binary classification
            print(Mtype," could not be used because PySpark currently only accepts binary classification data for this algorithm")
            return
        if Mtype in("LogisticRegression","NaiveBayes","RandomForestClassifier","GBTClassifier","LinearSVC","DecisionTreeClassifier"):
  
            # Add parameters of your choice here:
            if Mtype in("LogisticRegression"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .addGrid(classifier.maxIter, [10, 15,20])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("NaiveBayes"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.smoothing, [0.0, 0.2, 0.4, 0.6]) \
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("RandomForestClassifier"):
                paramGrid = (ParamGridBuilder() \
                               .addGrid(classifier.maxDepth, [2, 5, 10])
#                                .addGrid(classifier.maxBins, [5, 10, 20])
#                                .addGrid(classifier.numTrees, [5, 20, 50])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("GBTClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
#                              .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .addGrid(classifier.maxIter, [10, 15,50,100])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("LinearSVC"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.maxIter, [10, 15]) \
                             .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .build())
            
            # Add parameters of your choice here:
            if Mtype in("DecisionTreeClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
                             .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .build())
            
            #Cross Validator requires all of the following parameters:
            crossval = CrossValidator(estimator=classifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 + is best practice
            # Fit Model: Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
    
    fitModel = IntanceFitModel(Mtype,classifier,classes,features,train)
    
    # Print feature selection metrics
    if fitModel is not None:
        
        if Mtype in("OneVsRest"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype + '\033[0m')
            # Extract list of binary models
            models = BestModel.models
            for model in models:
                print('\033[1m' + 'Intercept: '+ '\033[0m',model.intercept,'\033[1m' + '\nCoefficients:'+ '\033[0m',model.coefficients)

        if Mtype == "MultilayerPerceptronClassifier":
            print("")
            print('\033[1m' + Mtype," Weights"+ '\033[0m')
            print('\033[1m' + "Model Weights: "+ '\033[0m',fitModel.weights.size)
            print("")

        if Mtype in("DecisionTreeClassifier", "GBTClassifier","RandomForestClassifier"):
            # FEATURE IMPORTANCES
            # Estimate of the importance of each feature.
            # Each feature’s importance is the average of its importance across all trees 
            # in the ensemble The importance vector is normalized to sum to 1. 
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Feature Importances"+ '\033[0m')
            print("(Scores add up to 1)")
            print("Lowest score is the least important")
            print(" ")
            print(BestModel.featureImportances)
            
            if Mtype in("DecisionTreeClassifier"):
                global DT_featureimportances
                DT_featureimportances = BestModel.featureImportances.toArray()
                global DT_BestModel
                DT_BestModel = BestModel
            if Mtype in("GBTClassifier"):
                global GBT_featureimportances
                GBT_featureimportances = BestModel.featureImportances.toArray()
                global GBT_BestModel
                GBT_BestModel = BestModel
            if Mtype in("RandomForestClassifier"):
                global RF_featureimportances
                RF_featureimportances = BestModel.featureImportances.toArray()
                global RF_BestModel
                RF_BestModel = BestModel

        if Mtype in("LogisticRegression"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficient Matrix"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficientMatrix))
            print("Intercept: " + str(BestModel.interceptVector))
            global LR_coefficients
            LR_coefficients = BestModel.coefficientMatrix.toArray()
            global LR_BestModel
            LR_BestModel = BestModel

        if Mtype in("LinearSVC"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficients"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficients))
            global LSVC_coefficients
            LSVC_coefficients = BestModel.coefficients.toArray()
            global LSVC_BestModel
            LSVC_BestModel = BestModel
        
   
    # Set the column names to match the external results dataframe that we will join with later:
    columns = ['Classifier', 'Result']
    
    if Mtype in("LinearSVC","GBTClassifier") and classes != 2:
        Mtype = [Mtype] # make this a list
        score = ["N/A"]
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
    else:
        predictions = fitModel.transform(test)
        MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy") # redictionCol="prediction",
        accuracy = (MC_evaluator.evaluate(predictions))*100
        Mtype = [Mtype] # make this a string
        score = [str(accuracy)] #make this a string and convert to a list
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
        result = result.withColumn('Result',result.Result.substr(0, 5))
        
    return result
    #Also returns the fit model important scores or p values

In [35]:
# from pyspark.ml.classification import *
# from pyspark.ml.evaluation import *
# from pyspark.sql import functions
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Comment out Naive Bayes if your data still contains negative values
classifiers = [
                LogisticRegression()
                ,OneVsRest()
               ,LinearSVC()
               ,NaiveBayes()
               ,RandomForestClassifier()
               ,GBTClassifier()
               ,DecisionTreeClassifier()
               ,MultilayerPerceptronClassifier()
              ] 

featureDF_list = [HTFfeaturizedData,TFIDFfeaturizedData,W2VfeaturizedData]

In [36]:
for featureDF in featureDF_list:
    print('\033[1m' + featureDF.name," Results:"+ '\033[0m')
    train, test = featureDF.randomSplit([0.7, 0.3],seed = 11)
    features = featureDF.select(['features']).collect()
    # Learn how many classes there are in order to specify evaluation type based on binary or multi and turn the df into an object
    class_count = featureDF.select(countDistinct("label")).collect()
    classes = class_count[0][0]

    #set up your results table
    columns = ['Classifier', 'Result']
    vals = [("Place Holder","N/A")]
    results = spark.createDataFrame(vals, columns)

    for classifier in classifiers:
        new_result = ClassTrainEval(classifier,features,classes,train,test)
        results = results.union(new_result)
    results = results.where("Classifier!='Place Holder'")
    print(results.show(10,truncate=False))

[1mHTFfeaturizedData  Results:[0m
 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[-0.11691609,  0.10138117,  0.19504664,  0.10715306,  0.12025501,
               0.08651916,  0.13521009,  0.04111269,  0.13934382,  0.17979542,
               0.16423726,  0.19798202,  0.06275585,  0.00541656,  0.20581733,
              -0.07753202,  0.04680764, -0.04383274, -0.04489838,  0.25238338],
             [ 0.14651825, -0.24506996,  0.02623206, -0.15583047, -0.00321263,
              -0.06969515, -0.06923569,  0.07052051, -0.08566868, -0.06659006,
              -0.07268842,  0.06184688,  0.08103145,  0.02106505, -0.02687312,
              -0.07816375, -0.02432487,  0.19753572,  0.00080057,  0.00317896],
             [ 0.06232509,  0.1554158 , -0.09452352,  0.10882901,  0.01683968,
               0.09936838,  0.0400252 , -0.05018513, -0.03144719, -0.13456191,
              -0.01436535, -0.11693092, -0.27718187, -0.2052

LinearSVC  could not be used because PySpark currently only accepts binary classification data for this algorithm
 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.05985291329584534,0.05984394036801723,0.04667810898722498,0.044461151987755826,0.039472217846159285,0.04219370867920801,0.039074180033995286,0.037616328509485,0.03829690828666527,0.04142603184685556,0.04527608893619426,0.05158377818770258,0.05024555109117701,0.05447980269998234,0.04967125040066798,0.05381987203780251,0.03557993637313015,0.04891268949114852,0.10336174431706699,0.05815379662391603])
GBTClassifier  could not be used because PySpark currently only accepts binary classification data for this algorithm
 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,5,6,11,12,14,17,18,19],[0.18107397559639657,0.13146749306664848,0.11936

In [24]:
feature_data.show(20)

+--------------------+--------+--------------------+--------------------+-----+
|               TITLE|CATEGORY|               words|            filtered|label|
+--------------------+--------+--------------------+--------------------+-----+
|fed official says...|       b|[fed, official, s...|[fed, official, s...|  1.0|
|feds charles plos...|       b|[feds, charles, p...|[feds, charles, p...|  1.0|
|us open stocks fa...|       b|[us, open, stocks...|[us, open, stocks...|  1.0|
|fed risks falling...|       b|[fed, risks, fall...|[fed, risks, fall...|  1.0|
|feds plosser nast...|       b|[feds, plosser, n...|[feds, plosser, n...|  1.0|
|plosser fed may h...|       b|[plosser, fed, ma...|[plosser, fed, ma...|  1.0|
|feds plosser tape...|       b|[feds, plosser, t...|[feds, plosser, t...|  1.0|
|feds plosser expe...|       b|[feds, plosser, e...|[feds, plosser, e...|  1.0|
|us jobs growth la...|       b|[us, jobs, growth...|[us, jobs, growth...|  1.0|
|ecb unlikely to e...|       b|[ecb, unl

In [34]:

cv = CountVectorizer(inputCol="filtered", outputCol="features")
model = cv.fit(feature_data)
countVectorizer_feateures = model.transform(feature_data)

(trainingData, testData) = countVectorizer_feateures.randomSplit([0.7, 0.3],seed = 11)

nb = NaiveBayes(modelType="multinomial",labelCol="label", featuresCol="features")
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)

nb_predictions.select("prediction", "features", 'TITLE', 'CATEGORY').show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g"% (nb_accuracy))
print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))

+----------+--------------------+--------------------+--------+
|prediction|            features|               TITLE|CATEGORY|
+----------+--------------------+--------------------+--------+
|       0.0|(26792,[93,285,17...| acm awards winne...|       e|
|       1.0|(26792,[3,54,76,1...| and  year mortga...|       b|
|       2.0|(26792,[664,4758,...| are finalists fo...|       m|
|       2.0|(26792,[207,494,6...| are now finalist...|       m|
|       0.0|(26792,[16,85,353...| arrests made dur...|       e|
+----------+--------------------+--------------------+--------+
only showing top 5 rows

Accuracy of NaiveBayes is = 0.90724
Test Error of NaiveBayes = 0.0927603 
