In [1]:
import glob
import os
import sys

# Environment PySpark reads at start-up; these must be set before shell.py
# is executed at the bottom of this cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the pyspark package that ships with this Spark install on the path.
spark_python = os.path.join(spark_home, 'python')
sys.path.insert(0, spark_python)

# The bundled py4j zip carries a version number in its file name that changes
# between Spark releases, so discover it instead of hard-coding one version.
# If nothing matches, nothing is added and any py4j already importable is used.
for py4j_zip in sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# Run PySpark's interactive shell bootstrap in this namespace; the banner it
# prints reports that a SparkSession is available as `spark`.
# The file is closed deterministically, and compiling with the real path keeps
# tracebacks pointing at shell.py.
shell_path = os.path.join(spark_python, 'pyspark', 'shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [2]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from numpy import array
import random

In [3]:
def gen_point(i):
    """Generate one synthetic labeled point for the regression demo.

    The features are ``x1`` drawn uniformly from [-100, 100] and ``x2`` drawn
    uniformly from [-20, 60]. The label is ``10 * x1 + 25 * x2`` plus uniform
    noise in [-2, 2].

    The draws come from a private generator seeded with ``i``, so the same
    index always produces the same point. This matters because the function is
    used inside an RDD ``map``: if the RDD is evaluated more than once, every
    evaluation must yield identical data.

    Args:
        i: Index of the point; used only as the random seed.

    Returns:
        A ``LabeledPoint`` whose label is the noisy linear target and whose
        features are ``[x1, x2]``.
    """
    rng = random.Random(i)
    x1 = rng.uniform(-100, 100)
    x2 = rng.uniform(-20, 60)
    noise = rng.uniform(-2, 2)
    return LabeledPoint(x1 * 10.0 + x2 * 25 + noise, [x1, x2])

# Build the synthetic data set: one generated point per index in 0..99999.
# The RDD is cached because it is reused by many later actions; without the
# cache every action would regenerate all the points from scratch.
points = sc.parallelize(range(100000)).map(gen_point).cache()

In [5]:
# Eyeball ten randomly chosen points, sampled without replacement.
points.takeSample(withReplacement=False, num=10)

[LabeledPoint(462.60989497432655, [9.841325699939645,14.49756272715451]),
 LabeledPoint(1591.6252684975461, [70.6072271747096,35.368375759316095]),
 LabeledPoint(619.536871542954, [48.05237318124338,5.570442841363146]),
 LabeledPoint(752.8459159457966, [-56.84967190279959,52.792385037356055]),
 LabeledPoint(785.8929053498109, [8.462417628149723,28.07903491689118]),
 LabeledPoint(586.6181946741294, [-66.24145027548053,50.03183840828778]),
 LabeledPoint(393.7584443341876, [-93.39975638091013,53.17312353839017]),
 LabeledPoint(994.5415910735985, [-22.13531135718965,48.65223756680683]),
 LabeledPoint(797.056554883269, [13.563035350256186,26.46759700820037]),
 LabeledPoint(947.7188492979727, [-15.384717395225309,44.083728815716356])]

## Do it the easy way

In [68]:
# Fit MLlib's built-in SGD linear regression, including an intercept term,
# on the generated points.
model = LinearRegressionWithSGD.train(
    points,
    iterations=100,
    step=1e-4,
    intercept=True,
)

In [69]:
# Display the fitted model; its repr lists the learned weights and intercept.
model

(weights=[9.986463388658139,20.664389001202885], intercept=1.441603502487762)

## Do it the hard way

In [46]:
# Start from an all-zero weight vector with one entry per feature (x1, x2).
# It is created as floats so the weights keep a floating-point dtype from the
# start and an in-place update can never fail on integer casting.
weights = array([0.0, 0.0])
# Deliberately tiny learning rate for the single hand-computed step below.
step = 1e-9

In [47]:
# Feature vectors only, in the same order as `points`.
x = points.map(lambda labeled_point: labeled_point.features)

In [48]:
# Labels (regression targets) only, in the same order as `points`.
y = points.map(lambda labeled_point: labeled_point.label)

In [49]:
# Peek at the first five feature vectors.
x.take(5)

[DenseVector([36.19, -17.7894]),
 DenseVector([-44.9662, 22.4089]),
 DenseVector([-63.2373, -13.5744]),
 DenseVector([-47.5876, 12.475]),
 DenseVector([-48.3765, 45.154])]

In [50]:
# Peek at the first five labels.
y.take(5)

[-82.65633322065673,
 108.9477495733118,
 -971.8825357907368,
 -162.2852731807697,
 643.3956409790013]

In [51]:
# Linear model without intercept: the prediction for each point is the dot
# product of its feature vector with the current weights.
prediction = x.map(lambda features: features.dot(weights))

In [52]:
# Peek at the first five predictions (all 0.0 while the weights are still zero).
prediction.take(5)

[0.0, 0.0, 0.0, 0.0, 0.0]

In [53]:
# Per-point gradient of the squared error 0.5 * (w.x - y)^2 with respect to
# the weights: (w.x - y) * x.
# It is computed from `points` directly, so every feature vector is paired
# with its own label in a single pass rather than by zipping two separately
# evaluated RDDs.
gradient = points.map(lambda p: (p.features.dot(weights) - p.label) * p.features)

In [54]:
# Peek at the first five per-point gradient vectors.
gradient.take(5)

[DenseVector([2991.3351, -1470.4091]),
 DenseVector([4898.9695, -2441.3955]),
 DenseVector([-61459.2443, -13192.6995]),
 DenseVector([-7722.7623, 2024.5096]),
 DenseVector([31125.2503, -29051.9066])]

In [55]:
# Average the per-point gradients over the whole RDD.
gradient_average = gradient.mean()

In [56]:
# One gradient-descent update: move the weights against the mean gradient,
# scaled by the learning rate `step`.
weights = weights - step * gradient_average

In [57]:
# Display the weights after the single update step.
weights

array([3.33345293e-05, 2.32651231e-05])

In [66]:
# Settings for the full hand-written training run; the iteration count and
# step size match the ones passed to LinearRegressionWithSGD.train earlier.
iterations = 100
# Restart from zero weights, created as floats so the vector keeps a
# floating-point dtype from the first iteration on.
weights = array([0.0, 0.0])
step = 1e-4

In [67]:
# Batch gradient descent: on every iteration, average the squared-error
# gradient (w.x - y) * x over all points and step against it.
for i in range(iterations):
    # Map over `points` directly so each feature vector is paired with its own
    # label in a single pass, instead of evaluating the lineage twice and
    # zipping the two results.
    gradient = points.map(
        lambda p: (p.features.dot(weights) - p.label) * p.features
    ).mean()
    weights = weights - step * gradient
    # Progress trace: iteration index, updated weights, mean gradient used.
    print(i, weights, gradient)

0 [3.34338017 2.33271049] [-33433.801675607014,-23327.10491051339]
1 [5.5692763  4.44762375] [-22258.961330514292,-21149.132563428702]
2 [7.05116234 6.36511986] [-14818.860362922802,-19174.961101355264]
3 [8.03769656 8.10365681] [-9865.342219347567,-17385.36952372951]
4 [8.69443493 9.67995679] [-6567.3837520212855,-15762.999789823285]
5 [ 9.1316039  11.10917276] [-4371.689662058521,-14292.159668400674]
6 [ 9.42259151 12.40503793] [-2909.8761130636267,-12958.651786251827]
7 [ 9.61625886 13.58000032] [-1936.6734723401921,-11749.623822365556]
8 [ 9.74513702 14.64534393] [-1288.7816256095034,-10653.436137910156]
9 [ 9.8308846  15.61129834] [-857.4758005190532,-9659.544058845551]
10 [ 9.88792135 16.4871376 ] [-570.3674956104275,-8758.392674175122]
11 [ 9.92584744 17.28126985] [-379.26088946956276,-7941.322469567927]
12 [ 9.9510542 18.0013183] [-252.0676277457317,-7200.484444176068]
13 [ 9.96779657 18.65419465] [-167.42368279692852,-6528.76359870868]
14 [ 9.9789071  19.24616564] [-111.105256

KeyboardInterrupt: 