In [4]:
import glob
import os
import sys

# Bootstrap an in-kernel PySpark shell: configure the environment, put Spark's
# bundled Python packages on sys.path, then run pyspark/shell.py, whose banner
# reports the SparkSession as `spark` (later cells also rely on `sc`).
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
# Respect a SPARK_HOME that is already exported; otherwise fall back to the
# HDP client location this notebook was written against.
os.environ.setdefault("SPARK_HOME", '/usr/hdp/current/spark2-client')

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
# The py4j version differs between Spark releases, so discover the bundled
# source zip instead of pinning one file name that may not exist.
for py4j_zip in sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# Run the shell script in this namespace. The context manager closes the file,
# and compiling with the real path keeps tracebacks pointing at shell.py.
shell_path = os.path.join(spark_home, 'python', 'pyspark', 'shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [5]:
spark

In [6]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [7]:
from pyspark.ml.linalg import *

In [8]:
v = DenseVector([1, 2, 3, 4])

In [9]:
type(v)

pyspark.ml.linalg.DenseVector

In [10]:
type(v[0])

numpy.float64

In [11]:
v.values

array([1., 2., 3., 4.])

In [12]:
type(v.values)

numpy.ndarray

In [13]:
v.toArray()

array([1., 2., 3., 4.])

In [14]:
type(v.toArray())

numpy.ndarray

## Indexing

In [15]:
v[0]

1.0

In [16]:
v[-1]

4.0

In [17]:
v[2:4]

array([3., 4.])

In [18]:
v[::-1]

array([4., 3., 2., 1.])

## Operations

In [19]:
v - 2

DenseVector([-1.0, 0.0, 1.0, 2.0])

In [20]:
v / 3

DenseVector([0.3333, 0.6667, 1.0, 1.3333])

## L1 norm

In [21]:
v.norm(1)

10.0

In [22]:
(v * -1).norm(1) == v.norm(1)

True

## L2 norm

In [23]:
v.norm(2)

5.477225575051661

In [24]:
v.norm(0)

4.0

In [25]:
u = Vectors.dense([1, 2, 3, 5])

In [26]:
u

DenseVector([1.0, 2.0, 3.0, 5.0])

In [27]:
v - u

DenseVector([0.0, 0.0, 0.0, -1.0])

In [28]:
v.squared_distance(u)

1.0

## Cosine similarity

In [29]:
v.dot(u) / (v.norm(2) * u.norm(2))

0.9939990885479664

## Sparse vectors

In [30]:
# (index, value) pairs for a 4-element vector: index i holds the value i + 1.
ndx_value = tuple((index, index + 1) for index in range(4))

In [31]:
ndx_value

((0, 1), (1, 2), (2, 3), (3, 4))

In [32]:
v = SparseVector(len(ndx_value), ndx_value)

In [33]:
v

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0})

In [34]:
u = Vectors.sparse(4, range(4), [1, 2, 3, 5])

In [35]:
u

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 5.0})

In [36]:
# Multiplying a SparseVector by a scalar raises TypeError. Catch and print it
# so the demonstration is still shown but "Restart & Run All" does not stop here.
try:
    v * 2
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for *: 'SparseVector' and 'int'

In [37]:
# Subtracting two SparseVectors raises TypeError. Catch and print it so the
# demonstration is still shown but "Restart & Run All" does not stop here.
try:
    v - u
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for -: 'SparseVector' and 'SparseVector'

In [38]:
v.squared_distance(u)

1.0

In [39]:
v.dot(u) / (v.norm(2) * u.norm(2))

0.9939990885479664

## All right, all this stuff is fun, but Spark is a distributed framework, right?

In [67]:
# NOTE: the second star import rebinds names that were already imported from
# pyspark.ml.linalg earlier in this notebook (e.g. Vectors, DenseVector,
# SparseVector) to their pyspark.mllib.linalg counterparts, so cells below
# this point use the mllib classes.
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import *

In [114]:
lm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])

In [70]:
lm1

DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], False)

## Note that a Matrix is stored in column-major order!

In [120]:
lm1[0, 0], lm1[0, 1]

(1.0, 4.0)

In [71]:
lm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])

In [72]:
lm2

DenseMatrix(3, 2, [7.0, 8.0, 9.0, 10.0, 11.0, 12.0], False)

In [73]:
# Arguments are (numRows, numCols, colPtrs, rowIndices, values).
# NOTE(review): colPtrs is [0, 1, 2] but three rowIndices/values are supplied;
# in CSC layout the last colPtrs entry is normally the number of stored
# values, so [0, 1, 3] may have been intended -- confirm.
lsm = Matrices.sparse(3, 2, [0, 1, 2], [0, 1, 2], [7, 11, 12])

In [74]:
lsm

SparseMatrix(3, 2, [0, 1, 2], [0, 1, 2], [7.0, 11.0, 12.0], False)

In [75]:
# Adding two local DenseMatrix objects raises TypeError. Catch and print it so
# the demonstration is still shown but "Restart & Run All" does not stop here.
try:
    lm1 + lm2
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for +: 'DenseMatrix' and 'DenseMatrix'

In [76]:
blocks1 = sc.parallelize([((0, 0), lm1), ((1, 0), lm2)])

In [81]:
blocks2 = sc.parallelize([((0, 0), lsm), ((1, 0), lm2)])

In [77]:
blocks1.collect()

[((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0)),
 ((1, 0), DenseMatrix(3, 2, [7.0, 8.0, 9.0, 10.0, 11.0, 12.0], 0))]

In [84]:
blocks2.collect()

[((0, 0), SparseMatrix(3, 2, [0, 1, 2], [0, 1, 2], [7.0, 11.0, 12.0], 0)),
 ((1, 0), DenseMatrix(3, 2, [7.0, 8.0, 9.0, 10.0, 11.0, 12.0], 0))]

In [79]:
dm1 = BlockMatrix(blocks1, 3, 2)

In [86]:
# NOTE(review): dm2 is built from blocks1, exactly like dm1, so the two block
# matrices hold the same data. blocks2 is only collected above and never turned
# into a BlockMatrix -- confirm whether blocks2 was intended here.
dm2 = BlockMatrix(blocks1, 3, 2)

In [121]:
dm1.toLocalMatrix()

DenseMatrix(6, 2, [1.0, 2.0, 3.0, 7.0, 8.0, 9.0, 4.0, 5.0, 6.0, 10.0, 11.0, 12.0], 0)

In [122]:
dm1.add(dm2).toLocalMatrix()

DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0)

In [123]:
dm2.transpose().toLocalMatrix()

DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0)

In [124]:
dm1.multiply(dm2.transpose()).toLocalMatrix()

DenseMatrix(6, 6, [17.0, 22.0, 27.0, 47.0, 52.0, 57.0, 22.0, 29.0, ..., 185.0, 204.0, 57.0, 78.0, 99.0, 183.0, 204.0, 225.0], 0)

In [125]:
indexed_dm1 = dm1.toIndexedRowMatrix()

In [135]:
svd = indexed_dm1.computeSVD(2)

## But you can compute the SVD!

In [137]:
svd.V

DenseMatrix(2, 2, [-0.5633, -0.8263, -0.8263, 0.5633], 0)

In [138]:
svd.s

DenseVector([25.3997, 2.2033])