In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

In [2]:
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [4]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"],
                                 outputCol = "features")

normalizer = Normalizer(inputCol="features", outputCol="features_morm", p=1.0)

In [6]:
from pyspark.ml.classification import LogisticRegression

In [7]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [8]:
from pyspark.ml import Pipeline

In [9]:
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer, lr])

In [12]:
model = pipeline.fit(df_train)

In [13]:
prediction = model.transform(df_train)

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
eval = MulticlassClassificationEvaluator().setMetricName('accuracy').setLabelCol('label').setPredictionCol('prediction')

In [16]:
eval.evaluate(prediction)

0.20671266813162203

In [17]:
model = pipeline.fit(df_test)

In [18]:
prediction = model.transform(df_test)

In [19]:
eval.evaluate(prediction)

0.2061616681971913