In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, concat,count, desc, explode, lit, min, max, split, stddev, udf
from pyspark.sql.types import IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, PCA, RegexTokenizer, StandardScaler, StopWordsRemover, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [0]:
# Obtain the Spark session: reuse the active one if it exists, otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [0]:
# Read the Stack Overflow JSON file and mark the DataFrame for caching
# (persist() is lazy and hands back the same DataFrame).
stack_overflow_data = "/FileStore/tables/stack.json"
df2 = spark.read.json(stack_overflow_data).persist()

# Label stage: map the string column "oneTag" to a numeric "label" index.
indexer = StringIndexer(
    inputCol="oneTag",
    outputCol="label",
)

# Text feature stages, in the order they run:
#   1. split "Body" on non-word characters into "words"
#   2. count terms into "TF", keeping a vocabulary of at most 10000 terms
#   3. re-weight the counts by inverse document frequency into "features"
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern="\\W",
)
cv = CountVectorizer(
    inputCol="words",
    outputCol="TF",
    vocabSize=10000,
)
idf = IDF(
    inputCol="TF",
    outputCol="features",
)

# Classifier: logistic regression with no regularisation, capped at 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.0,
    elasticNetParam=0,
)

# Chain the stages; "features" and "label" must both exist before lr runs.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        cv,
        idf,
        indexer,
        lr,
    ]
)

In [0]:
# Fit every stage of the pipeline (tokenizer, TF, IDF, label indexer,
# logistic regression) on the loaded DataFrame.
plrModel = pipeline.fit(df2)

# Run the fitted pipeline over the same rows to attach a "prediction" column.
df3 = plrModel.transform(df2)

# Count the rows where the predicted label index matches the true one.
# This is the cell's last expression, so the notebook displays the count.
# NOTE(review): the rows scored here are the rows the model was fitted on, so
# this is a training-set figure — consider a randomSplit hold-out to measure
# generalisation.
df3.filter(df3.label == df3.prediction).count()

Out[6]: 68955