In [32]:
# findspark.init() has to run before the first `pyspark` import (next cell),
# which is why it lives in its own cell at the very top of the notebook.
import findspark
findspark.init()

In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

In [34]:
# Reuse the already-running Spark session if there is one; otherwise start a
# new one registered under the application name 'LinReg'.
spark_session = (
    SparkSession.builder
    .appName('LinReg')
    .getOrCreate()
)

In [35]:
def _parse_line(line):
    """Convert one 'label,feature' text line into a (label, feature vector) pair."""
    fields = line.split(',')
    label = float(fields[0])
    feature = float(fields[1])
    return (label, Vectors.dense(feature))


# Read the raw text file line by line and parse each line into (label, features).
data = (
    spark_session.sparkContext
    .textFile('../../data/regression.txt')
    .map(_parse_line)
)

# The regression estimator further down is built without labelCol/featuresCol
# arguments, so the DataFrame columns are named to match what it looks up.
cols = ['label', 'features']
df = data.toDF(cols)

# Number of parsed rows (also forces the file to actually be read).
df.count()

1000

In [36]:
# Fixed seed so the 80/20 split -- and with it the fitted model and every
# prediction shown further down -- is repeatable across Restart & Run All.
# Without a seed, randomSplit draws a fresh random seed on every call.
SPLIT_SEED = 42

train_test_split = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
training_df, test_df = train_test_split

# Elastic-net regularised linear regression: at most 50 iterations,
# regularisation strength 0.01, and an L1/L2 mixing ratio of 0.8.
lr_model = LinearRegression(maxIter=50, regParam=0.01, elasticNetParam=0.8)

# Fit on the training portion only; test_df is held out for evaluation.
trained_lr_model = lr_model.fit(training_df)

In [37]:
# Score the held-out rows; transform() appends a 'prediction' column. The result
# is cached because it is read several times below.
scored_df = trained_lr_model.transform(test_df)
full_predictions = scored_df.cache()
full_predictions.show(10)

# Pull each single-column projection out as an RDD of plain values.
prediction_rows = full_predictions.select('prediction').rdd
label_rows = full_predictions.select('label').rdd
predictions = prediction_rows.map(lambda row: row['prediction'])
labels = label_rows.map(lambda row: row['label'])

+-----+--------+-------------------+
|label|features|         prediction|
+-----+--------+-------------------+
|-2.58|  [2.57]|-2.5207517878917396|
|-2.54|  [2.39]|-2.3439858623274694|
|-2.07|  [2.04]| -2.000274340396943|
|-1.94|  [1.98]|-1.9413523652088531|
|-1.91|  [1.86]|-1.8235084148326728|
|-1.88|  [1.89]|-1.8529694024267178|
| -1.6|  [1.63]|-1.5976408432783271|
|-1.59|  [1.62]| -1.587820514080312|
|-1.39|  [1.32]|-1.2932106381398614|
|-1.37|  [1.25]| -1.224468333753756|
+-----+--------+-------------------+
only showing top 10 rows



In [38]:
# Build the (prediction, label) pairs straight from the scored DataFrame in one
# pass. RDD.zip only works when both RDDs have exactly the same number of
# partitions and elements per partition, and it evaluates the lineage twice;
# selecting both columns together has neither problem and yields the same
# list of plain (prediction, label) tuples.
prediction_and_label = (
    full_predictions
    .select('prediction', 'label')
    .rdd
    .map(tuple)  # Row -> plain tuple, matching the shape zip() produced
    .collect()
)
prediction_and_label[:10]

[(-2.5207517878917396, -2.58),
 (-2.3439858623274694, -2.54),
 (-2.000274340396943, -2.07),
 (-1.9413523652088531, -1.94),
 (-1.8235084148326728, -1.91),
 (-1.8529694024267178, -1.88),
 (-1.5976408432783271, -1.6),
 (-1.587820514080312, -1.59),
 (-1.2932106381398614, -1.39),
 (-1.224468333753756, -1.37)]

In [39]:
# Stop the Spark session now that the notebook is done with it. To re-execute
# any cell above after this point, re-run the session-creation cell first.
spark_session.stop()