In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (an existing one is reused if
# present, otherwise a new one is created) under the app name 'mylogreg'.
spark = (
    SparkSession.builder
    .appName('mylogreg')
    .getOrCreate()
)

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
# Read the sample dataset with the "libsvm" reader; the resulting DataFrame
# has a `label` column and a sparse `features` vector column.
my_data = (
    spark.read
    .format("libsvm")
    .load("../data/sample_libsvm_data.txt")
)

In [5]:
# Preview the loaded rows; n=20 and truncate=True are the defaults of show(),
# written out so the size of the preview is explicit.
my_data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [6]:
# Unfitted logistic-regression estimator. The column names below are the
# library defaults, spelled out to show which DataFrame columns are consumed.
my_log_reg_model = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [7]:
# Fit the estimator on the complete dataset (no hold-out at this stage).
fitted_logreg = my_log_reg_model.fit(dataset=my_data)

In [8]:
# Keep a handle on the summary object exposed by the fitted model; its
# `predictions` DataFrame is inspected in the next cell.
log_summary = fitted_logreg.summary

In [9]:
# Show which columns the summary's predictions DataFrame carries.
summary_predictions = log_summary.predictions
summary_predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [10]:
# Randomly split the data into roughly 70% training and 30% test rows.
# The fixed seed makes the split repeatable, so a fresh
# "Restart Kernel -> Run All" reproduces the same train/test partition and
# therefore the same evaluation metrics below. Without a seed every run
# draws a different split.
lr_train, lr_test = my_data.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Fresh, unfitted estimator for the train/test workflow; the column names
# are the library defaults, written explicitly.
final_model = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [12]:
# Train on the training partition only, keeping the test rows unseen.
fit_final = final_model.fit(dataset=lr_train)

In [13]:
# Evaluate the trained model on the held-out rows. Despite the variable's
# name, the result is a summary object whose `.predictions` attribute holds
# the scored DataFrame (see the following cells).
prediction_and_labels = fit_final.evaluate(dataset=lr_test)

In [14]:
# Preview the scored test rows (label, features, rawPrediction, probability,
# prediction); n=20 and truncate=True are show()'s defaults.
prediction_and_labels.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[125,126,127...|[21.9776302291733...|[0.99999999971474...|       0.0|
|  0.0|(692,[126,127,128...|[20.8906952449623...|[0.99999999915416...|       0.0|
|  0.0|(692,[126,127,128...|[26.4573201021891...|[0.99999999999676...|       0.0|
|  0.0|(692,[126,127,128...|[30.5092617226301...|[0.99999999999994...|       0.0|
|  0.0|(692,[127,128,129...|[18.3323144408453...|[0.99999998907611...|       0.0|
|  0.0|(692,[128,129,130...|[18.8417578882809...|[0.99999999343660...|       0.0|
|  0.0|(692,[150,151,152...|[24.4966346226944...|[0.99999999997702...|       0.0|
|  0.0|(692,[152,153,154...|[31.7154410369020...|[0.99999999999998...|       0.0|
|  0.0|(692,[154,155,156...|[18.4419445868181...|[0.99999999021039...|       0.0|
|  0.0|(692,[154

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [16]:
# Binary-classification evaluator with its inputs and metric spelled out.
# These are the library defaults, so the result matches a bare constructor.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

In [17]:
# Score the held-out predictions with the evaluator configured above.
test_predictions = prediction_and_labels.predictions
my_final_roc = my_eval.evaluate(dataset=test_predictions)

In [18]:
# Display the metric returned by the evaluator for the held-out predictions.
my_final_roc

1.0