# Linear regression using Spark ML

In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml.regression import LinearRegression

In [12]:
# Vectors provides the vector data structure that Spark ML's linear regression uses for features
from pyspark.ml.linalg import Vectors

# Obtain the SparkSession for this notebook, registered under the
# application name "linear regression".
spark = (
    SparkSession.builder
    .appName("linear regression")
    .getOrCreate()
)


Let's read the RDD from a text file

In [13]:
# Load the raw regression file through the SparkContext as an RDD of text lines.
dataset = spark.sparkContext.textFile(name="./datasets/regression.txt")

In [14]:
def _parse_line(line):
    """Turn one comma-separated text line into a (label, features) pair.

    The first field becomes the float label; the second field becomes a
    one-element dense feature vector.
    """
    fields = line.split(",")
    label = float(fields[0])
    features = Vectors.dense(float(fields[1]))
    return (label, features)


# RDD of (label, DenseVector) tuples, one per input line.
regression_data = dataset.map(_parse_line)

In [129]:
# Convert the RDD of (label, features) tuples into a DataFrame; the list
# supplies the column names.
# NOTE(review): the earlier note here said createDataFrame is only needed for
# Spark SQL commands; RDD.toDF is documented as shorthand for
# spark.createDataFrame, so either should work — confirm.
regression_df = regression_data.toDF(schema=['label','features'])

# Display a small random sample (fraction 0.01, without replacement).
regression_df.sample(withReplacement=False, fraction=0.01).show()

+-----+--------+
|label|features|
+-----+--------+
|-0.84|  [0.83]|
| 0.43| [-0.47]|
| 0.34|  [-0.4]|
| -0.6|  [0.56]|
|-0.78|  [0.95]|
|-1.34|  [1.33]|
| 0.44| [-0.38]|
| -0.6|  [0.35]|
| 0.91| [-0.95]|
|-1.36|  [1.37]|
|-1.39|  [1.44]|
| 0.87| [-1.02]|
| 0.51| [-0.38]|
+-----+--------+



### Split the data into training and testing data

In [142]:
# Randomly split the rows into two DataFrames with equal weights.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs — confirm whether that is intended.
trainTest = regression_df.randomSplit(weights=[0.5, 0.5])

# Number of DataFrames returned by the split.
len(trainTest)

2

In [144]:
# The split produced exactly two DataFrames: the first is used for fitting,
# the second for evaluating the fitted model.
train_data, test_data = trainTest

In [147]:
# Configure the estimator. No labelCol/featuresCol are given, so it relies on
# its default column names (the DataFrame columns are 'label' and 'features').
lir = LinearRegression(
    maxIter=10,           # upper bound on optimisation iterations
    regParam=0.3,         # regularisation strength
    elasticNetParam=0.8,  # elastic-net mixing: 0 is pure L2, 1 is pure L1
)

In [148]:
# Fit the configured linear regression on the training half of the data.
model = lir.fit(dataset=train_data)

In [149]:
# Apply the fitted model to the held-out rows; the result carries a
# 'prediction' column alongside 'label' and 'features'.
full_predictions = model.transform(dataset=test_data)
# Cache the result because several later cells reuse it.
full_predictions = full_predictions.cache()

In [230]:
# Pull a tiny sample of the predictions to the driver for inspection
# (fraction 0.001, without replacement, fixed seed 20).
full_predictions.sample(withReplacement=False, fraction=0.001, seed=20).collect()

[Row(label=-0.18, features=DenseVector([0.1]), prediction=-0.07016462343013741)]

In [239]:
# Extract each column of interest as an RDD of bare values: select the single
# column, drop to the underlying RDD of Rows, then unwrap the one field.
prediction_rows = full_predictions.select('prediction').rdd
prediction = prediction_rows.map(lambda row: row['prediction'])

label_rows = full_predictions.select('label').rdd
labels = label_rows.map(lambda row: row['label'])

In [243]:
# Pair every prediction with its label, giving an RDD of (prediction, label).
# NOTE(review): RDD.zip expects both RDDs to have the same number of partitions
# and of elements per partition; both sides here are derived from
# full_predictions with a plain map, which presumably satisfies that — confirm.
results = prediction.zip(other=labels)

In [247]:
# Bring all (prediction, label) pairs to the driver and print one line per pair.
for predicted, actual in results.collect():
    print("prediction %.10f , for label - %.2f" % (predicted, actual))

prediction -2.7120197082 , for label - -3.74
prediction -1.7276572656 , for label - -2.54
prediction -1.6263258377 , for label - -2.26
prediction -1.4236629819 , for label - -2.09
prediction -1.4743286959 , for label - -2.07
prediction -1.4309009410 , for label - -1.96
prediction -1.4019491045 , for label - -1.94
prediction -1.4309009410 , for label - -1.94
prediction -1.3440454314 , for label - -1.91
prediction -1.4309009410 , for label - -1.87
prediction -1.1992862487 , for label - -1.77
prediction -1.2210001261 , for label - -1.75
prediction -1.1992862487 , for label - -1.74
prediction -1.1848103304 , for label - -1.66
prediction -1.1775723713 , for label - -1.65
prediction -1.3295695131 , for label - -1.64
prediction -1.1775723713 , for label - -1.60
prediction -1.1703344121 , for label - -1.59
prediction -1.1486205347 , for label - -1.42
prediction -0.9531956381 , for label - -1.40
prediction -0.9531956381 , for label - -1.39
prediction -0.9025299241 , for label - -1.37
prediction