In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Start (or reuse) the Hive-enabled Spark session named "lrhp".
session_builder = SparkSession.builder.appName("lrhp").enableHiveSupport()
spark = session_builder.getOrCreate()

# Read the semicolon-delimited CSV: the header row supplies the column names
# and column types are inferred from the data.
csv_reader = spark.read.format("csv").options(sep=";", inferSchema="true", header="true")
df = csv_reader.load('/container-data/winequality-white.csv')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/07 09:36:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:

# Transform the raw columns into [label, [features]] format by packing the
# predictor columns into a single "features" vector column.
from pyspark.ml.feature import VectorAssembler

# Pick predictors by name instead of by position: every column except the
# "quality" target and any "features" column left behind by a previous run of
# this cell. Because `df` is rebound to its own output below, the cell has to
# tolerate being executed more than once.
feature_columns = [c for c in df.columns if c not in ("quality", "features")]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# drop() is a no-op when the column is absent, so this works both on the
# freshly loaded DataFrame and on one that was already assembled.
df = assembler.transform(df.drop("features"))

In [3]:
from pyspark.sql.functions import col,row_number

# Keep only the assembled feature vector and the target column, exposing the
# target under the name "label".
df_format = df.select("features", "quality").withColumnRenamed("quality", "label")
df_format.show()

# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
training, test = df_format.randomSplit(split_weights, seed=2018)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.0,0.27,0.36,20...|    6|
|[6.3,0.3,0.34,1.6...|    6|
|[8.1,0.28,0.4,6.9...|    6|
|[7.2,0.23,0.32,8....|    6|
|[7.2,0.23,0.32,8....|    6|
|[8.1,0.28,0.4,6.9...|    6|
|[6.2,0.32,0.16,7....|    6|
|[7.0,0.27,0.36,20...|    6|
|[6.3,0.3,0.34,1.6...|    6|
|[8.1,0.22,0.43,1....|    6|
|[8.1,0.27,0.41,1....|    5|
|[8.6,0.23,0.4,4.2...|    5|
|[7.9,0.18,0.37,1....|    5|
|[6.6,0.16,0.4,1.5...|    7|
|[8.3,0.42,0.62,19...|    5|
|[6.6,0.17,0.38,1....|    7|
|[6.3,0.48,0.04,1....|    6|
|[6.2,0.66,0.48,1....|    8|
|[7.4,0.34,0.42,1....|    6|
|[6.5,0.31,0.14,7....|    5|
+--------------------+-----+
only showing top 20 rows



In [6]:
# Search for the best hyperparameters with 5-fold cross-validation.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression()
# 3 x 4 x 1 = 12 candidate parameter combinations; elasticNetParam is held
# fixed at 1.0 (written as a float to match the parameter's type).
grid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [1, 10, 50])
    .addGrid(lr.regParam, [0.01, 0.05, 0.75, 1.0])
    .addGrid(lr.elasticNetParam, [1.0])
    .build()
)
evaluator = MulticlassClassificationEvaluator()
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=5,
    # Fix the fold assignment so the selected parameters are reproducible
    # across runs (same seed value as the train/test split).
    seed=2018,
)
cvModel = cv.fit(training)
cvModel.getNumFolds()

21/12/07 09:42:00 WARN CacheManager: Asked to cache already cached data.
21/12/07 09:42:00 WARN CacheManager: Asked to cache already cached data.


5

In [7]:

# Extract the best parameters from the winning model of the cross-validation
# run, then persist the whole CrossValidatorModel to "LrModel".
bestModel = cvModel.bestModel
# Use the model's public parameter getters rather than reaching into the
# private `_java_obj` handle.
# NOTE(review): the getters on fitted models require Spark 3.0+ — confirm the
# cluster version.
print('Best Param (regParam): ', bestModel.getRegParam())
print('Best Param (MaxIter): ', bestModel.getMaxIter())
print('Best Param (elasticNetParam): ', bestModel.getElasticNetParam())
cvModel.write().overwrite().save("LrModel")

Best Param (regParam):  0.01
Best Param (MaxIter):  50
Best Param (elasticNetParam):  1.0


                                                                                

In [None]:
from pyspark.ml.tuning import CrossValidatorModel

# Test the tuned model on the held-out split.
# The persisted CrossValidatorModel gets its own name so that `loadedModel`
# only ever refers to the best underlying model.
loadedCvModel = CrossValidatorModel.load("LrModel")
loadedModel = loadedCvModel.bestModel
predictions = loadedModel.transform(test)
predictions.show(1)
# DataFrame.summary is a method: without the call the variable would hold a
# bound method instead of data. The variable name is kept for compatibility;
# it holds descriptive statistics of the predictions DataFrame, not a model
# training summary.
# NOTE(review): if a model-level summary on the test data was intended,
# confirm and use the model's evaluation API instead.
trainingSummary = predictions.summary()
test.show(1)

In [None]:
# Compute the root mean squared error between label and prediction.
from pyspark.ml.evaluation import RegressionEvaluator

# Only the two columns the evaluator reads are kept.
dataset = predictions.select('label', 'prediction')
evaluator = RegressionEvaluator(predictionCol="prediction")
# The metric is chosen per call through a param override.
rmse_override = {evaluator.metricName: "rmse"}
evaluator.evaluate(dataset, rmse_override)