In [2]:
from pyspark.sql import SparkSession


In [3]:
# Start (or reuse) the Spark session that every DataFrame in this notebook hangs off.
spark = (
    SparkSession.builder
    .appName("lreg")
    .getOrCreate()
)

In [4]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Read the sample regression data in libsvm format; the resulting DataFrame
# has a `label` column and a `features` vector column.
training = (
    spark.read
    .format("libsvm")
    .load("../../Data/sample_linear_regression_data.txt")
)

In [6]:
# Preview the first rows of the loaded data (label and features columns).
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

# create instance of our model

In [8]:
# Configure the (not yet fitted) estimator; the column names match the
# DataFrame loaded above, and predictions are written to `prediction`.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)


In [9]:
# Fit on the *entire* dataset (no hold-out yet), so any metric read from this
# model is measured on data it has already seen.
lr_model = lr.fit(training)

In [12]:
# Fitted coefficients of the linear regression, one per feature.
lr_model.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [14]:
# Fitted intercept (constant term) of the linear regression.
lr_model.intercept

0.14228558260358093

In [16]:
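# The coefficients and the intercept define the fitted model. Coefficient magnitudes can be read as a rough
# indication of feature importance, but only when the features are on comparable scales.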

In [17]:
# Summary object for the fit above; metrics such as the RMSE are read from it.
training_summary = lr_model.summary

In [18]:
# RMSE taken from the summary of the model fitted on the full `training` DataFrame,
# i.e. measured on data the model was trained on.
training_summary.rootMeanSquaredError

10.16309157133015

# Split 

In [19]:
# Load the same libsvm file again as the starting point for a train/test split.
all_data = (
    spark.read
    .format("libsvm")
    .load("../../Data/sample_linear_regression_data.txt")
)

In [26]:
# Hold out roughly 30% of the rows for testing and train on the remaining ~70%.
# randomSplit is stochastic: without a fixed seed the partition (and every
# metric computed from it) changes on each run, so pin the seed to keep the
# notebook reproducible under Restart & Run All.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [27]:
# Summary statistics (count, mean, stddev, min, max) of the training split.
train_stats = train_data.describe()
train_stats.show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                353|
|   mean|-0.2404994810609121|
| stddev| 10.609507250564665|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [28]:
# Summary statistics (count, mean, stddev, min, max) of the test split.
test_stats = test_data.describe()
test_stats.show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                148|
|   mean|  1.443227140095893|
| stddev|  9.516202165931066|
|    min|-19.402336030214553|
|    max| 27.111027963108548|
+-------+-------------------+



# Correct way: fit on the training split, evaluate on the held-out test split

In [31]:
# Refit the same estimator, this time on the training split only, so that
# test_data stays unseen by the model.
correct_model = lr.fit(train_data)

In [32]:
# Evaluate the fitted model on the held-out split; the result exposes the
# residuals and the RMSE inspected in the following cells.
test_resu = correct_model.evaluate(test_data)

In [34]:
# Per-row residuals of the model on the test split.
residuals_df = test_resu.residuals
residuals_df.show()

+-------------------+
|          residuals|
+-------------------+
|-19.909161708369673|
|-16.536601957597792|
| -16.54001925420975|
|-16.414572173625977|
|-14.502470064808492|
|-16.329797592619073|
|-13.168920211875854|
|-15.116083957020058|
|-11.383138594529914|
|-16.945923684401528|
| -11.98560045132274|
|-13.559196950960525|
|-11.016875788555955|
|-14.659007555378132|
|-13.756618594496977|
|-11.919696983256237|
| -7.281685876740565|
|-7.1663998091734005|
| -8.921436586759455|
| -5.138091825262989|
+-------------------+
only showing top 20 rows



In [36]:
# RMSE on the held-out test split.
test_resu.rootMeanSquaredError

9.796760024855576

In [38]:
# Use the model on unlabeled data (features only), as it would be used on new observations

In [39]:
# Keep only the features column to mimic new data for which no label is known.
unlabeled_Data = test_data.select("features")

In [40]:
# Preview the unlabeled rows (features column only).
unlabeled_Data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [41]:
# Apply the fitted model to the unlabeled rows; the result carries the input
# features plus a `prediction` column.
prediction = correct_model.transform(unlabeled_Data)

In [44]:
# Show the features next to their predicted values.
prediction.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...| 0.5068256781551213|
|(10,[0,1,2,3,4,5,...| -2.631690665365968|
|(10,[0,1,2,3,4,5,...|-0.9541811026735934|
|(10,[0,1,2,3,4,5,...|-0.3045246599791104|
|(10,[0,1,2,3,4,5,...| -1.449042500986081|
|(10,[0,1,2,3,4,5,...| 0.9702527127863947|
|(10,[0,1,2,3,4,5,...|-2.1420603775404348|
|(10,[0,1,2,3,4,5,...|0.05960098247762363|
|(10,[0,1,2,3,4,5,...| -2.483949300628854|
|(10,[0,1,2,3,4,5,...|  3.173482122698658|
|(10,[0,1,2,3,4,5,...|-1.1677351550427897|
|(10,[0,1,2,3,4,5,...|  1.361100386299112|
|(10,[0,1,2,3,4,5,...|-1.1134774237319733|
|(10,[0,1,2,3,4,5,...| 2.8319345589855596|
|(10,[0,1,2,3,4,5,...|  2.324816228956519|
|(10,[0,1,2,3,4,5,...|  1.336385683388107|
|(10,[0,1,2,3,4,5,...|-3.0120281639153594|
|(10,[0,1,2,3,4,5,...| -3.067039777779753|
|(10,[0,1,2,3,4,5,...|-1.2667207884316554|
|(10,[0,1,2,3,4,5,...| -4.903261287317467|
+----------

# Evaluating Linear Regression

## Accuracy and recall are classification metrics: they are not useful for regression problems, where the target is continuous.

### Mean Absolute Error (MAE): $\text{MAE} = \frac{1}{n}\sum_{i=1}^{n} \lvert y_i - \hat{y}_i \rvert$ (where $n$ is the number of observations)

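### Mean Squared Error (MSE): $\text{MSE} = \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2$ (where $n$ is the number of observations)
Penalizes large differences between $y_i$ and $\hat{y}_i$ more heavily than MAE does: for example, an error of 5 contributes $5^2 = 25$ to the MSE sum but only 5 to the MAE sum.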

### Root Mean Squared Error (RMSE): $\text{RMSE} = \sqrt{\text{MSE}}$
The most widely used of these metrics, because it is expressed in the same units as $y$.

### R-squared ($R^2$), also known as the coefficient of determination

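In a basic sense, it measures how much of the variance in the label your model explains; it typically lies in $[0, 1]$, with values closer to 1 indicating a better fit. A related metric is adjusted R-squared, which also accounts for the number of features and can be used to compare models with each other.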