# Data Types of MLlib

In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [2]:
# Start Spark (skip this cell if a session is already running)
spark = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
sc = spark.sparkContext

## import Library

In [3]:
import numpy as np
from pyspark.mllib.linalg import Vectors

## Dense vector

In [15]:
# Build a dense vector from a plain Python list
x = list(range(1, 6))
dense_x = Vectors.dense(x)
print("dense_x = " + str(dense_x))

dense_x = [1.0,2.0,3.0,4.0,5.0]


In [5]:
# Check which class Vectors.dense() produced
type(dense_x)

pyspark.mllib.linalg.DenseVector

## Sparse vector

Example: the dense vector `[1,0,0,0,3]` is stored sparsely as its size `5` plus its non-zero entries `{0: 1, 4: 3}`.

In [7]:
# Three ways to build the same sparse vector of size 5
# (non-zero entries: index 1 -> 1.0 and index 3 -> 5.5)

# 1) size + {index: value} dict
sparse_x = Vectors.sparse(5, {1: 1.0, 3: 5.5})
print("sparse_x = " + str(sparse_x))

# 2) size + list of (index, value) pairs
sparse_y = Vectors.sparse(5, [(1, 1.0), (3, 5.5)])
print("sparse_y = " + str(sparse_y))

# 3) size + list of indices + list of values
sparse_z = Vectors.sparse(5, [1, 3], [1.0, 5.5])
print("sparse_z = " + str(sparse_z))

sparse_x = (5,[1,3],[1.0,5.5])
sparse_y = (5,[1,3],[1.0,5.5])
sparse_z = (5,[1,3],[1.0,5.5])


## 确认 Sparse vector

In [8]:
# Expand to a full array so the positions that are not stored show up as zeros
sparse_x.toArray()

array([0. , 1. , 0. , 5.5, 0. ])

In [12]:
def print_sparse(x):
    """Print every element of the vector ``x`` on its own line.

    Positions are visited from 0 to ``x.size - 1``. A position whose lookup
    raises ``IndexError`` is printed as ``0.0``.

    Args:
        x: an object exposing ``size`` and integer indexing (``x[i]``),
            e.g. the sparse vectors built in this notebook.
    """
    for position in range(x.size):
        try:
            element = x[position]
        except IndexError:
            # Per the original note: when the trailing positions of a sparse
            # vector are missing values they are omitted from storage, and
            # indexing them raises IndexError; treat them as zero.
            element = 0.0
        print(element)

        
# Print all five positions of sparse_x, including the ones that are not stored
print_sparse(sparse_x)

0.0
1.0
0.0
5.5
0.0


In [13]:
# Index 2 is not among the stored entries (1 and 3), so it reads back as 0.0
sparse_x[2]

0.0

## Vector 方法

In [None]:
# Dot Product
# https://github.com/apache/spark/blob/12206058e8780e202c208b92774df3773eff36ae/python/pyspark/mllib/linalg/__init__.py

In [16]:
# Two 2-element vectors and one 3-element vector for the dot-product examples
a, b, c = [1, 2], [3, 4], [1, 1, 1]
dense_a, dense_b, dense_c = (Vectors.dense(values) for values in (a, b, c))

In [17]:
# Dot product: 1 * 3 + 2 * 4 = 11
dense_a.dot(dense_b)

11.0

In [23]:
# Both operands must have the same dimension: dense_a has 2 elements and
# dense_c has 3, so dot() fails. The error is caught and printed so that
# "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    dense_a.dot(dense_c)
except AssertionError as err:
    print("AssertionError: " + str(err))

AssertionError: dimension mismatch

In [19]:
# Sum of squares: a vector dotted with itself (1*1 + 2*2 = 5)
dense_a.dot(dense_a)

5.0

In [20]:
# Dense and sparse vectors can be combined in one operation
# (the dimensions must match): 2 * 1.0 + 4 * 5.5 = 24
dense_x.dot(sparse_x)

24.0

In [21]:
# Squared Euclidean distance (note: squared, not the distance itself):
# 1^2 + 1^2 + 3^2 + 1.5^2 + 5^2 = 38.25
dense_x.squared_distance(sparse_y)

38.25

## DenseVector wraps a numpy.ndarray

In [24]:
# dense_x is a DenseVector object, not a Spark RDD
type(dense_x)

pyspark.mllib.linalg.DenseVector

In [25]:
# A DenseVector has no reduce(): the failed lookup is reported against
# numpy.ndarray. The error is caught and printed so that
# "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    dense_x.reduce(lambda x, y : x + y)
except AttributeError as err:
    print("AttributeError: " + str(err))

AttributeError: 'numpy.ndarray' object has no attribute 'reduce'

In [26]:
# reduce() is called on an RDD here: the vector's elements are first
# distributed through the SparkContext, then summed pairwise
sc.parallelize(dense_x).reduce(lambda x, y : x + y)

15.0

In [27]:
# A sparse vector likewise has to go through the SparkContext to become an RDD
sc.parallelize(sparse_x).reduce(lambda x, y : x + y)

6.5

In [28]:
# Total of the dense vector's elements via the RDD's sum()
sc.parallelize(dense_x).sum()

15.0

In [29]:
# Total of the sparse vector's elements via the RDD's sum()
sc.parallelize(sparse_x).sum()

6.5

## 将vector 以 Row为单位叠成 data set

In [30]:
# Stack the three sparse vectors as the rows of a small dataset
data = [sparse_x, sparse_y, sparse_z]

In [31]:
# Display the three row vectors collected in `data`
data

[SparseVector(5, {1: 1.0, 3: 5.5}),
 SparseVector(5, {1: 1.0, 3: 5.5}),
 SparseVector(5, {1: 1.0, 3: 5.5})]

In [32]:
# A dataset of row vectors supports column statistics:
# colStats() is given an RDD of the vectors, mean() returns one mean per column
from pyspark.mllib.stat import Statistics
Statistics.colStats(sc.parallelize(data)).mean()

array([0. , 1. , 0. , 5.5, 0. ])

## Labeled Point

In [33]:
# Note: LabeledPoint and Vectors are imported from different modules
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

### LabeledPoint(label, [feature1, feature2, feature3])

In [34]:
# Four labeled examples: each LabeledPoint pairs a label (0.0 or 1.0)
# with a list of three feature values
data_label = [
     LabeledPoint(0.0, [0.0,1.0,1.0]),
     LabeledPoint(1.0, [1.0,1.0,2.0]),
     LabeledPoint(1.0, [2.0,3.0,2.0]),
     LabeledPoint(0.0, [3.0,2.0,5.0])
    ]

In [35]:
# Display the list of labeled points
data_label

[LabeledPoint(0.0, [0.0,1.0,1.0]),
 LabeledPoint(1.0, [1.0,1.0,2.0]),
 LabeledPoint(1.0, [2.0,3.0,2.0]),
 LabeledPoint(0.0, [3.0,2.0,5.0])]

# Read Data From CSV File

In [36]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics


In [37]:
# Load the ratings CSV into a DataFrame; header=True takes the column names
# from the first line of the file.
# NOTE: this rebinds `data`, which earlier held the list of sparse vectors.
# NOTE(review): no schema is given or inferred, so the columns are presumably
# read as strings (ratings display as both "5" and "3.5") - confirm.
data = spark.read.csv("../data/ratings.csv", header= True)

In [38]:
# Preview the DataFrame (the output is limited to the top 20 rows)
data.show()

+------+-------+------+----------+
|userid|movieid|rating|        ts|
+------+-------+------+----------+
|     3|   6539|     5|1133571238|
|     3|   7153|     4|1133571171|
|     3|   7155|   3.5|1164885564|
|     3|   8529|     4|1136075616|
|     3|   8533|   4.5|1136418593|
|     3|   8783|     5|1136075857|
|     3|  27821|   4.5|1136418616|
|     3|  33750|   3.5|1164885688|
|     3|  33750|   3.5|1164887688|
|     3|    344|  null| 844416742|
|     4|     21|     3| 844416980|
|     4|     34|     5| 844416936|
|     4|     39|     3| 844417037|
|     4|    110|     5| 844416866|
|     4|    150|     5| 844416656|
|     4|    153|     5| 844416699|
|     4|    161|     5| 844416835|
|     4|    165|     5| 844416699|
|     4|    208|     3| 844416866|
|     4|    231|     1| 844416742|
+------+-------+------+----------+
only showing top 20 rows



In [45]:
# Random sample without replacement: fraction 0.1 (about 10% of the rows), fixed seed 1
sample_data = data.sample(withReplacement=False, fraction=0.1, seed=1)

In [46]:
# Show the sampled rows
sample_data.show()

+------+-------+------+----------+
|userid|movieid|rating|        ts|
+------+-------+------+----------+
|     3|   7155|   3.5|1164885564|
|     3|  33750|   3.5|1164887688|
|     4|    153|     5| 844416699|
|     4|    349|     3| 844416699|
+------+-------+------+----------+



In [47]:
# Number of rows that ended up in the sample
sample_data.count()

4

In [48]:
# The sample is still a Spark SQL DataFrame
type(sample_data)

pyspark.sql.dataframe.DataFrame

In [50]:
# Turn every Row of the sampled DataFrame into a DenseVector
dense_data = sample_data.rdd.map(lambda row: Vectors.dense(row))

In [51]:
# Fetch up to 5 of the converted rows for inspection
dense_data.take(5)

[DenseVector([3.0, 7155.0, 3.5, 1164885564.0]),
 DenseVector([3.0, 33750.0, 3.5, 1164887688.0]),
 DenseVector([4.0, 153.0, 5.0, 844416699.0]),
 DenseVector([4.0, 349.0, 3.0, 844416699.0])]

In [52]:
# Per-column mean over the sampled rows (columns: userid, movieid, rating, ts)
Statistics.colStats(dense_data).mean()

array([3.50000000e+00, 1.03517500e+04, 3.75000000e+00, 1.00465166e+09])