In [27]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col,row_number

# Start (or reuse) a Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("lrhp")
    .enableHiveSupport()
    .getOrCreate()
)

# Read the CSV: ';' is the field separator, the first row is a header,
# and column types are inferred by Spark.
df = (
    spark.read
    .format("csv")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("header", "true")
    .load('/container-data/winequality-white.csv')
)

# Reshape into [label, [features]] form: every column except the last one
# is packed into a single "features" vector column.
# NOTE(review): this presumes the last column is `quality` (the label) -- confirm.
feature_columns = df.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Keep only the feature vector and the target, exposing the target as "label".
df_format = df.select("features", "quality").withColumnRenamed("quality", "label")
df_format.show()

# 80/20 random split with a fixed seed.
training, test = df_format.randomSplit([0.8, 0.2], seed=2018)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.0,0.27,0.36,20...|    6|
|[6.3,0.3,0.34,1.6...|    6|
|[8.1,0.28,0.4,6.9...|    6|
|[7.2,0.23,0.32,8....|    6|
|[7.2,0.23,0.32,8....|    6|
|[8.1,0.28,0.4,6.9...|    6|
|[6.2,0.32,0.16,7....|    6|
|[7.0,0.27,0.36,20...|    6|
|[6.3,0.3,0.34,1.6...|    6|
|[8.1,0.22,0.43,1....|    6|
|[8.1,0.27,0.41,1....|    5|
|[8.6,0.23,0.4,4.2...|    5|
|[7.9,0.18,0.37,1....|    5|
|[6.6,0.16,0.4,1.5...|    7|
|[8.3,0.42,0.62,19...|    5|
|[6.6,0.17,0.38,1....|    7|
|[6.3,0.48,0.04,1....|    6|
|[6.2,0.66,0.48,1....|    8|
|[7.4,0.34,0.42,1....|    6|
|[6.5,0.31,0.14,7....|    5|
+--------------------+-----+
only showing top 20 rows



In [15]:
from pyspark.ml.tuning import CrossValidatorModel

# Test the tuned LR model on the `test` split.
# Only the best model chosen by cross-validation is kept; `loadedModel`
# is read again by the cell that prints the best parameters.
loadedModel = CrossValidatorModel.load("LrModel").bestModel
predictions = loadedModel.transform(test)

# Show one scored row next to the matching input row as a sanity check.
predictions.show(1)
test.show(1)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[3.8,0.31,0.02,11...|    6|[-9.8504628687002...|[2.63291837375996...|       6.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 1 row

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[3.8,0.31,0.02,11...|    6|
+--------------------+-----+
only showing top 1 row



In [5]:

# Report the hyper-parameters of the selected best model.
# The values are read through the model's underlying JVM object.
_jvm_model = loadedModel._java_obj
print('Best Param (regParam): ', _jvm_model.getRegParam())
print('Best Param (MaxIter): ', _jvm_model.getMaxIter())
print('Best Param (elasticNetParam): ', _jvm_model.getElasticNetParam())


Best Param (regParam):  0.01
Best Param (MaxIter):  50
Best Param (elasticNetParam):  1.0


In [6]:
# Root mean squared error between the `prediction` column and the label.
# The label column name is left at the evaluator's default.
from pyspark.ml.evaluation import RegressionEvaluator

dataset = predictions.select('label', 'prediction')
evaluator = RegressionEvaluator(predictionCol="prediction")

# The metric is chosen per call through the param map; the result is the cell's output.
evaluator.evaluate(dataset, {evaluator.metricName: "rmse"})

0.8149958638742985

In [None]:
# ---------- Random forest: test the tuned model ----------

In [28]:
# Test the tuned random-forest model on the `test` split.
from pyspark.ml.tuning import CrossValidatorModel

# Only the best model chosen by cross-validation is kept; `loadedModel`
# is read again by the cell that prints the number of trees.
loadedModel = CrossValidatorModel.load("RfModel").bestModel
predictions = loadedModel.transform(test)

# Show one scored row next to the matching input row as a sanity check.
predictions.show(1)
test.show(1)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[3.8,0.31,0.02,11...|    6|[0.0,0.0,0.0,0.01...|[0.0,0.0,0.0,0.00...|       6.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 1 row

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[3.8,0.31,0.02,11...|    6|
+--------------------+-----+
only showing top 1 row



In [29]:
# `getNumTrees` is read without call parentheses; the recorded output
# shows that this yields the tree count in this environment.
num_trees = loadedModel.getNumTrees
print('Best Param (numTrees): ', num_trees)


Best Param (numTrees):  5


In [30]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import DoubleType

# Keep only the (prediction, label) columns and cast the label to double
# (the recorded output shows the label going from 6 to 6.0).
_pairs = predictions.select("prediction", "label")
predictions_format = _pairs.withColumn("label", _pairs["label"].cast("double"))
predictions_format.show(1)

+----------+-----+
|prediction|label|
+----------+-----+
|       6.0|  6.0|
+----------+-----+
only showing top 1 row



In [31]:
# Turn each Row into a plain 2-tuple of its first two fields, which are
# (prediction, label) given the column order of `predictions_format`.
rdd = predictions_format.rdd.map(lambda row: (row[0], row[1]))

# Peek at the first five pairs.
for pair in rdd.take(5):
    print(pair)

(6.0, 6.0)
(5.0, 5.0)
(7.0, 7.0)
(7.0, 5.0)
(6.0, 7.0)


In [33]:
# Per-label F-measure (beta = 1.0) for the candidate labels 1 through 10,
# computed from the (prediction, label) pairs in `rdd`.
metrics = MulticlassMetrics(rdd)

for x in range(1, 11):
    # NOTE: the 'Lable' spelling is kept so the output matches the recorded run.
    print('Lable: ', x)
    try:
        print('fMeasure: ',  metrics.fMeasure(float(x),1.0))
    except Exception:
        # Best-effort fallback: report zero when the metric cannot be computed
        # for this label (the recorded output shows this for labels 1, 2 and 10).
        # `Exception` rather than a bare `except` so that KeyboardInterrupt and
        # SystemExit still stop the cell.
        print('fMeasure: 0.00')

Lable:  1
fMeasure: 0.00
Lable:  2
fMeasure: 0.00
Lable:  3
fMeasure:  0.0
Lable:  4
fMeasure:  0.0
Lable:  5
fMeasure:  0.5563909774436091
Lable:  6
fMeasure:  0.6253521126760564
Lable:  7
fMeasure:  0.31578947368421056
Lable:  8
fMeasure:  0.0
Lable:  9
fMeasure:  0.0
Lable:  10
fMeasure: 0.00
