# PySpark for Linear Regression

### 1. Set up the SparkContext and SparkSession

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

## set up spark context
# getOrCreate() reuses a context that is already running, so this cell can be
# re-executed; a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" on the second run.
sc = SparkContext.getOrCreate()
# Legacy SQL entry point, kept for code that still references `sqlContext`.
sqlContext = SQLContext(sc)

## set up SparkSession
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### 2. Load dataset

In [2]:
# Read the advertising data with Spark's built-in CSV reader: the first line
# supplies the column names and the column types are inferred from the values.
df = spark.read.csv("./data/Advertising.csv", header=True, inferSchema=True)

- Check the dataset and its schema

In [3]:
# Preview the first two rows. A bare `df.take(2)` that is not the last
# expression of the cell is evaluated but never displayed, so print explicitly.
df.show(2)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



### 3. Convert the data to dense vector

In [4]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [5]:
def transData(row):
    """Turn one advertising record into a Row with `label` and `features`.

    `Sales` becomes the label; `TV`, `Radio` and `Newspaper` are packed, in
    that order, into a dense feature vector.
    """
    target = row["Sales"]
    predictors = [row[column] for column in ("TV", "Radio", "Newspaper")]
    return Row(label=target, features=Vectors.dense(predictors))

### 4. Transform the dataset into a DataFrame

In [6]:
# Run every record of the underlying RDD through transData, then turn the
# resulting Rows back into a DataFrame and preview it.
labeled_rows = df.rdd.map(transData)
transformed = labeled_rows.toDF()
transformed.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
|  [8.7,48.9,75.0]|  7.2|
| [57.5,32.8,23.5]| 11.8|
|[120.2,19.6,11.6]| 13.2|
|    [8.6,2.1,1.0]|  4.8|
| [199.8,2.6,21.2]| 10.6|
|  [66.1,5.8,24.2]|  8.6|
| [214.7,24.0,4.0]| 17.4|
| [23.8,35.1,65.9]|  9.2|
|   [97.5,7.6,7.2]|  9.7|
|[204.1,32.9,46.0]| 19.0|
|[195.4,47.7,52.9]| 22.4|
|[67.8,36.6,114.0]| 12.5|
|[281.4,39.6,55.8]| 24.4|
| [69.2,20.5,18.3]| 11.3|
|[147.3,23.9,19.1]| 14.6|
+-----------------+-----+
only showing top 20 rows



### 5. Fit model (Ridge Regression and the LASSO)

In [18]:
# Import LinearRegression class
from pyspark.ml.regression import LinearRegression

# Define LinearRegression algorithm (one estimator, fitted twice below)
lr = LinearRegression()

# Fit 2 models, using different regularization parameters.
# modelA: regParam=0.0, i.e. no penalty.
# modelB: regParam=1.0; with the default elasticNetParam of 0.0 this is an L2
#         (ridge) penalty. It has to be fitted here because the prediction,
#         evaluation and plotting cells further down reference modelB and
#         would otherwise stop with a NameError.
modelA = lr.fit(transformed, {lr.regParam: 0.0})
modelB = lr.fit(transformed, {lr.regParam: 1.0})

In [19]:
# Fitted weights of modelA, in feature-vector order (TV, Radio, Newspaper)
modelA.coefficients

DenseVector([0.0458, 0.1885, -0.001])

In [20]:
 # Fitted intercept term of modelA
 modelA.intercept

2.9388893694594134

In [None]:
 # Make predictions with modelA on the same DataFrame it was fitted on;
 # the result carries a "prediction" column that later cells select.
predictionsA = modelA.transform(transformed)
predictionsA.show()

### 6. Evaluation

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the "label" and "prediction" columns.
# Both column names are spelled out for readability; they are also the
# evaluator's defaults, so the result is the same as with metricName alone.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
RMSE = evaluator.evaluate(predictionsA)
print("ModelA: Root Mean Squared Error = " + str(RMSE))

In [None]:
# Predictions of modelB on the training DataFrame.
# Requires modelB to have been fitted in the model-fitting cell of section 5.
predictionsB = modelB.transform(transformed)
predictionsB.show()

In [None]:
# Reuses the evaluator created for ModelA; note that RMSE is overwritten here,
# so ModelA's score is no longer available under that name afterwards.
RMSE = evaluator.evaluate(predictionsB)
print("ModelB: Root Mean Squared Error = " + str(RMSE))

### 7. Visualization

In [None]:
# Import numpy, pandas, and ggplot
import numpy as np
from pandas import *
from ggplot import *
# The star import above does not bind the name `pd`, which pd.concat below
# needs, so import the module under its usual alias as well.
import pandas as pd

# Pull the columns needed for plotting back to the driver as Python lists.
# 'pop' is the first entry of each feature vector, i.e. the TV budget.
pop = transformed.rdd.map(lambda p: p.features[0]).collect()
sales = transformed.rdd.map(lambda p: p.label).collect()
predA = predictionsA.select("prediction").rdd.map(lambda r: r[0]).collect()
predB = predictionsB.select("prediction").rdd.map(lambda r: r[0]).collect()

# One pandas DataFrame per model. The scalar 'type' (0 = modelA, 1 = modelB)
# is broadcast to every row, replacing the throwaway frame that was only
# built to count the rows.
pydf1 = pd.DataFrame({'pop': pop, 'sales': sales, 'pred': predA, 'type': 0})
pydf2 = pd.DataFrame({'pop': pop, 'sales': sales, 'pred': predB, 'type': 1})

frames = [pydf1, pydf2]

# Stack both models' rows and store 'type' with object dtype
result = pd.concat(frames)
result['type'] = result['type'].astype(object)
result

In [None]:
# Scatter plot of each model's predicted sales ('pred') against the TV budget
# ('pop'), with the point colour mapped to the model indicator column 'type'.
# NOTE(review): geom_point is given colors='blue' (plural). The aesthetic is
# normally spelled `color`, and a fixed colour would conflict with the
# color='type' mapping above; confirm what was intended.
ggplot(result, aes(x='pop',y='pred',color='type')) +\
geom_point(colors='blue') 

### 8. More features about the model

- build model

In [22]:
from pyspark.ml.linalg import Vectors

In [23]:
# Reload the advertising data with Spark's built-in CSV reader (header row
# gives the column names, column types are inferred).
df = spark.read.csv("./data/Advertising.csv", header=True, inferSchema=True)

In [26]:
# Same conversion as the transData defined in section 3, repeated for this section.
def transData(row):
    """Map a raw record to Row(label=Sales, features=[TV, Radio, Newspaper])."""
    sales = row["Sales"]
    budgets = [row["TV"], row["Radio"], row["Newspaper"]]
    return Row(label=sales, features=Vectors.dense(budgets))

In [24]:
# Unregularized estimator (regParam=0.0) using the "normal" solver.
# LinearRegression comes from the import in the section 5 fitting cell.
# NOTE(review): maxIter=5 presumably has no effect with the normal-equation
# solver; confirm against the LinearRegression documentation.
lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal")

In [41]:
# Rebuild the (label, features) DataFrame from the freshly loaded df;
# the preview call is left disabled.
transformed = df.rdd.map(transData).toDF() 
#transformed.show()

In [28]:
# Fit the estimator configured above on the full transformed DataFrame
model = lr.fit(transformed)

- coefficients

In [42]:
# Fitted weights, in feature-vector order (TV, Radio, Newspaper)
model.coefficients

DenseVector([0.0458, 0.1885, -0.001])

- intercept

In [50]:
# Fitted intercept term of the model
model.intercept

2.9388893694594134

- save and load model

In [53]:
# NOTE(review): this relative path embeds a user-specific directory layout;
# consider moving it to a configurable location.
temp_path = 'temp/Users/wenqiangfeng/Dropbox/Spark/Code/model'
modelPath = temp_path + "/lr_model"
# model.save() fails when modelPath already exists, so re-running this cell
# would error out. Going through the writer with overwrite() replaces any
# previously saved copy instead.
model.write().overwrite().save(modelPath)

In [54]:
# load() is a class-level reader, so call it on LinearRegressionModel rather
# than through the fitted instance. lr2 is the restored fitted model (not an
# estimator, despite its name).
from pyspark.ml.regression import LinearRegressionModel
lr2 = LinearRegressionModel.load(modelPath)

In [55]:
# Coefficients of the model read back from modelPath, shown for comparison with the saved model's
lr2.coefficients

DenseVector([0.0458, 0.1885, -0.001])