In [1]:
from collections import Counter
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import NaiveBayes
from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint
import numpy as np
import pandas as pd

In [2]:
# Load every file matching the glob as lines of text, split each line on
# commas, and keep the first two fields as a positional Row (label, text).
# NOTE(review): because the split is unbounded, any text after a second comma
# is dropped, and a line containing no comma raises IndexError once an action
# runs. Confirm the input files never contain commas inside the text field.
scRdd = (
    sc.textFile('/FileStore/tables/C*')
    .map(lambda line: line.split(','))
    .map(lambda fields: Row(fields[0], fields[1]))
)

In [3]:
# Column names applied to the two positional Row fields produced upstream.
cols_name = ['label', 'text']

# 70/30 train/test split. The seed is fixed so that a fresh
# "Restart & Run All" yields the same partitioning (and therefore a
# comparable accuracy figure) on every run; without it the split is
# re-drawn randomly each time.
train, test = scRdd.randomSplit([0.7, 0.3], seed=42)
trainDf, testDf = train.toDF(cols_name), test.toDF(cols_name)

In [4]:
# Stage 1: tokenize the 'text' column into 'words', then hash the words into
# a 500000-dimensional term-frequency vector stored in 'rawFeatures'.
tokenizer = Tokenizer().setInputCol('text').setOutputCol('words')
hashingTF = HashingTF().setInputCol('words').setOutputCol('rawFeatures').setNumFeatures(500000)
pipeline = Pipeline().setStages([tokenizer, hashingTF])
model = pipeline.fit(trainDf)

# Keep trainDf as the raw (label, text) frame and bind the transformed frame
# to a new name. Overwriting trainDf here made the cell fail on a second run,
# because the pipeline's output columns would already exist on its input.
trainTf = model.transform(trainDf)

# Stage 2: fit IDF weights on the training term frequencies only, then
# rescale them into the 'features' column.
idf = IDF().setInputCol('rawFeatures').setOutputCol('features')
idfModel = idf.fit(trainTf)
rescaleData = idfModel.transform(trainTf)

In [5]:
# Build the RDD of LabeledPoint consumed by the mllib trainer: the label is
# cast to float and the ml-package feature vector is converted to its mllib
# counterpart via MLLibVectors.fromML.
trainData = (
    rescaleData
    .select('label', 'features')
    .rdd
    .map(lambda row: LabeledPoint(float(row['label']), MLLibVectors.fromML(row['features'])))
)

In [6]:
# Train the mllib Naive Bayes classifier; the second argument (1.0) is the
# smoothing parameter. `train` is a classmethod, so no instance is needed.
NBmodel = NaiveBayes.train(trainData, 1.0)

# Apply the transformers fitted on the training data to the test split.
# testDf is left untouched (a new name holds the transformed frame) so this
# cell can be re-run without failing on already-existing output columns.
testTf = model.transform(testDf)

# Same LabeledPoint conversion as for the training data: float label plus
# the ml feature vector converted to an mllib vector.
testData = (
    idfModel.transform(testTf)
    .select('label', 'features')
    .rdd
    .map(lambda x: LabeledPoint(float(x[0]), MLLibVectors.fromML(x[1])))
)

In [7]:
# Pair each test point's predicted label with its true label. The RDD is
# cached because it is counted twice below; without caching each count would
# re-run the whole feature pipeline and prediction over the test split.
predData = testData.map(lambda x: (NBmodel.predict(x.features), x.label)).cache()

# map() is one-to-one, so predData has exactly as many rows as testData.
numTest = predData.count()
numCorrect = predData.filter(lambda pair: pair[0] == pair[1]).count()

# Accuracy = fraction of test points whose prediction equals the label.
# An empty test split yields NaN instead of a ZeroDivisionError.
preDAccracy = 1.0 * numCorrect / numTest if numTest else float('nan')

In [8]:
# Report the test-split accuracy (fraction of predictions equal to the label).
print(preDAccracy)