# Create entry points to Spark

In [2]:
!pip install pyspark
import pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 50 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 58.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=986405440c058ef84a17888c2ed8a16b30b01be1222e9e2edfb914ac161de682
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Alternative ways to create the Spark entry points, kept for reference only.
# Every line in this cell is commented out, so nothing here is executed.
#from pyspark import SparkContext
#sc = SparkContext(master = 'local')

#from pyspark.sql import SparkSession
#spark = SparkSession.builder \
#          .appName("Python Spark SQL basic example") \
#          .config("spark.some.config.option", "some-value") \
#          .getOrCreate()

In [3]:
from pyspark.sql import SparkSession

In [5]:
# Build the notebook's SparkSession (reusing an existing one if present),
# registered under the application name 'Advertising'.
spark = (
    SparkSession.builder
    .appName('Advertising')
    .getOrCreate()
)

# Linear regression without cross-validation

## Import data

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The CSV read below uses a path under this mount point, so this cell has to
# run before the data is loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Location of the Advertising dataset on the mounted Google Drive.
advertising_csv = '/content/drive/MyDrive/Colab Notebooks/PySpark-Learning Apache Spark/data/Advertising.csv'

# Treat the first row as column names and let Spark infer the column types.
ad = spark.read.csv(advertising_csv, header=True, inferSchema=True)
ad.show(5)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows



In [7]:
# Number of rows in the dataset
ad.count()

200

In [8]:
# Column names of the dataset
ad.columns

['TV', 'Radio', 'Newspaper', 'Sales']

In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column
ad.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



In [15]:
# Column names together with their inferred data types
ad.printSchema()

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



## Transform data structure

In [13]:
from pyspark.ml.linalg import Vectors

# Spark ML estimators expect one vector column of predictors plus a label column.
# For each row, pack the first three values (TV, Radio, Newspaper) into a dense
# vector and keep the last value (Sales) as the label.
labeled_rows = ad.rdd.map(lambda row: [Vectors.dense(row[0:3]), row[-1]])
ad_df = labeled_rows.toDF(['features', 'label'])

ad_df.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



## Build linear regression model

In [16]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: predictors come from the 'features' column,
# the target from the 'label' column.
lr = LinearRegression(featuresCol='features', labelCol='label')

## Fit the model

In [17]:
# Fit the linear regression on the full dataset (no train/test split in this section)
lr_model = lr.fit(ad_df)

## Prediction

In [19]:
# Score the same DataFrame the model was fitted on; transform() appends a
# 'prediction' column, so any metric computed from `pred` is in-sample.
pred = lr_model.transform(ad_df)
pred.show()

+-----------------+-----+------------------+
|         features|label|        prediction|
+-----------------+-----+------------------+
|[230.1,37.8,69.2]| 22.1| 20.52397440971517|
| [44.5,39.3,45.1]| 10.4|12.337854820894362|
| [17.2,45.9,69.3]|  9.3|12.307670779994238|
|[151.5,41.3,58.5]| 18.5| 17.59782951168913|
|[180.8,10.8,58.4]| 12.9|13.188671856831297|
|  [8.7,48.9,75.0]|  7.2|12.478347634035858|
| [57.5,32.8,23.5]| 11.8|11.729759951563684|
|[120.2,19.6,11.6]| 13.2|12.122953165502278|
|    [8.6,2.1,1.0]|  4.8|3.7273408628615825|
| [199.8,2.6,21.2]| 10.6|12.550848722934683|
|  [66.1,5.8,24.2]|  8.6| 7.032299200558855|
| [214.7,24.0,4.0]| 17.4|17.285129182600265|
| [23.8,35.1,65.9]|  9.2|10.577120733627677|
|   [97.5,7.6,7.2]|  9.7| 8.826300480033197|
|[204.1,32.9,46.0]| 19.0|18.434366383561077|
|[195.4,47.7,52.9]| 22.4|20.819299516495455|
|[67.8,36.6,114.0]| 12.5| 12.82365674369938|
|[281.4,39.6,55.8]| 24.4| 23.22495715879901|
| [69.2,20.5,18.3]| 11.3| 9.951682059118799|
|[147.3,23

## Model evaluation

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the 'prediction' column against the 'label' column.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')

# NOTE: the figure printed below is R^2 (coefficient of determination) expressed
# as a percentage, not a classification-style accuracy.
# setMetricName() mutates `evaluator`, leaving it set to 'r2' after this cell.
r2_percent = round(evaluator.setMetricName('r2').evaluate(pred) * 100, 2)
print("Model Accuracy: ", r2_percent, '%', sep='')

Model Accuracy: 89.72%


In [27]:
# Root-mean-squared error of the in-sample predictions, rounded to 2 decimals.
# setMetricName() mutates `evaluator`, leaving it set to 'rmse' after this cell.
rmse = round(evaluator.setMetricName('rmse').evaluate(pred), 2)
print("RMSE: ", rmse, sep='')

RMSE: 1.67


## Compare results with R
The comparison below shows that the linear regression analyses from pyspark and R obtained very close results.

```{r}
# intercept and coefficients from R
advertise = read.csv('data/Advertising.csv', header = TRUE)
lr_ad = lm(Sales~., data = advertise)
lr_ad$coefficients

 (Intercept)           TV        Radio    Newspaper 
 2.938889369  0.045764645  0.188530017 -0.001037493
 
# intercept and coefficients from pyspark
lr_model.intercept

2.9388893694594134

lr_model.coefficients

DenseVector([0.0458, 0.1885, -0.001])

# R squared from R
summary(lr_ad)$r.squared

0.8972106

# R squared from pyspark
evaluator.evaluate(pred, {evaluator.metricName: "r2"})

0.897210638178952

```

# Linear regression with cross-validation

## Training and test datasets

In [28]:
# Random 80/20 split into training and test sets, using a fixed seed.
split_weights = [0.8, 0.2]
training, test = ad_df.randomSplit(split_weights, seed=123)

## Build cross-validation model

In [29]:
# ===== Build the cross-validation model =====
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Estimator: a fresh linear regression (rebinds `lr` from the earlier section).
lr = LinearRegression(featuresCol='features', labelCol='label')

# Parameter grid: 3 regularisation strengths x 3 elastic-net mixing values
# = 9 candidate parameter combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 0.5, 1])
    .addGrid(lr.elasticNetParam, [0, 0.5, 1])
    .build()
)

# Evaluator: candidates are compared by R^2 (rebinds `evaluator`).
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='r2',
)

# Cross-validator: 4-fold cross-validation over the grid above.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
)

## Fit cross-validation model

In [30]:
# Run the cross-validated parameter search on the training split only
cv_model = cv.fit(training)

## Prediction

In [31]:
# Predictions of the cross-validated model on the data it was trained on
pred_training_cv = cv_model.transform(training)

# Predictions on the held-out test split
pred_test_cv = cv_model.transform(test)

## Evaluation

In [32]:
# performance on training data: R^2
# (setMetricName switches the shared `evaluator` to 'r2' before evaluating)
evaluator.setMetricName('r2').evaluate(pred_training_cv)

0.8952845631627804

In [34]:
# performance on training data: RMSE
# (setMetricName switches the shared `evaluator` to 'rmse' before evaluating)
evaluator.setMetricName('rmse').evaluate(pred_training_cv)

1.6446574438001584

In [33]:
# performance on test data: R^2
# (setMetricName switches the shared `evaluator` to 'r2' before evaluating)
evaluator.setMetricName('r2').evaluate(pred_test_cv)

0.9013819610158471

In [35]:
# performance on test data: RMSE
# (setMetricName switches the shared `evaluator` to 'rmse' before evaluating)
evaluator.setMetricName('rmse').evaluate(pred_test_cv)

1.775112420218023

## Intercept and coefficients

In [36]:
# Intercept and coefficients of the best model found by cross-validation.
best_lr = cv_model.bestModel
print('Intercept: ', best_lr.intercept, "\n",
      'coefficients: ', best_lr.coefficients)

Intercept:  2.9592600706772956 
 coefficients:  [0.04613729524909819,0.19200356629524284,-0.006269704193266747]


## Get parameter values from the best model

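The hyperparameters chosen by cross-validation can be read from the best model, either through its underlying Java object or (in Spark 3.0+) through the model's own getter methods such as `getRegParam()`.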

In [37]:
# Report the hyper-parameters of the winning model.
# Use the model's public getters (available on LinearRegressionModel in
# Spark 3.0+) rather than reaching into the private `_java_obj` handle, which
# is an implementation detail of the Py4J bridge.
best_model = cv_model.bestModel
print('best regParam: ' + str(best_model.getRegParam()) + "\n" +
      'best ElasticNetParam:' + str(best_model.getElasticNetParam()))

best regParam: 0.0
best ElasticNetParam:0.0
