# Data Types of MLlib

In [None]:
# `from __future__` imports must be the first statement of the cell;
# placed after other statements they are rejected with a SyntaxError.
from __future__ import print_function, division

import glob
import os
import sys

# Make the PySpark sources that ship with the Spark installation importable.
spark_home = os.environ['SPARK_HOME']
sys.path.insert(0, os.path.join(spark_home, 'python'))
# The py4j archive name contains a version number that differs between Spark
# releases, so look it up instead of hard-coding one specific version.
for py4j_zip in glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if there is one; otherwise create a session
# named "test" against the "local[2]" master.
spark = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
# Keep a handle on the session's SparkContext for the RDD examples.
sc = spark.sparkContext

## import Library

In [None]:
#!pip3.6 install numpy

In [None]:
import numpy as np
from pyspark.mllib.linalg import Vectors

## Dense vector

In [None]:
# Build a dense vector from a plain Python list
x = [1,2,3,4,5]
dense_x = Vectors.dense(x)
print("dense_x = " + str(dense_x))

In [None]:
type(dense_x)

## Sparse vector

In [None]:
# Sparse encoding example: [1,0,0,0,3] => size 5, {index: value} = {0:1, 4:3}

In [None]:
# Three ways to build a sparse vector. Each call below describes a vector of
# size 5 holding 1.0 at index 1 and 5.5 at index 3.

# 1) size plus a {index: value} dict
sparse_x = Vectors.sparse(5, {1: 1.0, 3: 5.5})
print("sparse_x = " + str(sparse_x))

# 2) size plus a list of (index, value) pairs
sparse_y = Vectors.sparse(5, [(1, 1.0), (3, 5.5)])
print("sparse_y = " + str(sparse_y))

# 3) size plus a list of indices and a parallel list of values
sparse_z = Vectors.sparse(5, [1, 3], [1.0, 5.5])
print("sparse_z = " + str(sparse_z))

## Sparse vector

In [None]:
sparse_x.toArray()

In [None]:
def print_sparse(x):
    """Print the entries of ``x`` one per line, for positions 0 .. x.size - 1.

    ``x`` must expose a ``size`` attribute and support integer indexing.
    Whenever the lookup at a position raises ``IndexError``, ``0.0`` is
    printed for that position instead of propagating the error.
    """
    for position in range(x.size):
        try:
            entry = x[position]
        except IndexError:
            # No value could be read at this position: show it as zero.
            entry = 0.0
        print(entry)

        
print_sparse(sparse_x)

In [None]:
sparse_x[2]

## methods of Vector 

In [None]:
# Dot Product
# https://github.com/apache/spark/blob/12206058e8780e202c208b92774df3773eff36ae/python/pyspark/mllib/linalg/__init__.py

In [None]:
# Two 2-element vectors and one 3-element vector used by the dot-product cells
a = [1, 2]
b = [3, 4]
c = [1, 1, 1]
dense_a = Vectors.dense(a)
dense_b = Vectors.dense(b)
dense_c = Vectors.dense(c)

In [None]:
# Dot product of [1, 2] and [3, 4]: 1 * 3 + 2 * 4 = 11
dense_a.dot(dense_b)

In [None]:
# dense_a has 2 elements while dense_c has 3, so the sizes do not match.
# NOTE(review): this call is presumably meant to demonstrate the resulting
# dimension-mismatch error - confirm.
dense_a.dot(dense_c)

In [None]:
# Sum of squares: the dot product of a vector with itself (1 * 1 + 2 * 2)
dense_a.dot(dense_a)

In [None]:
# Dense and sparse vectors can be used together in one operation
# (their dimensions must be the same)
dense_x.dot(sparse_x)

In [None]:
# Compute the (squared) distance between a dense and a sparse vector
dense_x.squared_distance(sparse_y)

## DenseVector = numpy.ndarray

In [None]:
type(dense_x)

In [None]:
# NOTE(review): the vector object itself presumably offers no `reduce`
# method, so this call is expected to fail; the following cells build an RDD
# from the vector first - confirm this cell is an intentional failure demo.
dense_x.reduce(lambda x, y : x + y)

In [None]:
# Turn the vector into an RDD through the Spark context first, then reduce
sc.parallelize(dense_x).reduce(lambda x, y : x + y)

In [None]:
# A sparse vector likewise has to be turned into an RDD through the Spark context
sc.parallelize(sparse_x).reduce(lambda x, y : x + y)

In [None]:
sc.parallelize(dense_x).sum()

In [None]:
sc.parallelize(sparse_x).sum()

## Stacking vectors row by row into a data set

In [None]:
data = [sparse_x, sparse_y, sparse_z]

In [None]:
data

In [None]:
# Statistics can be computed over the data set; here, the mean of each column
from pyspark.mllib.stat import Statistics
Statistics.colStats(sc.parallelize(data)).mean()

## LabeledPoint

In [None]:
# Note: LabeledPoint and Vectors are imported from different modules
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

### LabeledPoint(label, [feature1, feature2, feature3])

In [None]:
# Each entry pairs a label with a three-element feature list.
data_label = [
    LabeledPoint(label, features)
    for label, features in (
        (0.0, [0.0, 1.0, 1.0]),
        (1.0, [1.0, 1.0, 2.0]),
        (1.0, [2.0, 3.0, 2.0]),
        (0.0, [3.0, 2.0, 5.0]),
    )
]

In [None]:
data_label

# Read Data From CSV File

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics


In [None]:
!hadoop fs -mkdir /data
!hadoop fs -put ../data/ratings.csv /data/

In [None]:
# Load the ratings CSV into a DataFrame, treating the first line as the header.
# NOTE(review): no schema is supplied, so the columns are presumably read as
# strings - confirm that the later conversion to vectors copes with that.
data = spark.read.csv("/data/ratings.csv", header= True)

In [None]:
data.show()

In [None]:
# Take a random sample of roughly 0.1% of the rows, without replacement,
# using a fixed seed.
sample_data = data.sample(withReplacement=False, fraction=0.001, seed=1)

In [None]:
sample_data.count()

In [None]:
type(sample_data)

In [None]:
# Convert each row of the sample into a dense vector
dense_data = sample_data.rdd.map(lambda x: Vectors.dense(x))

In [None]:
dense_data.take(5)

In [None]:
# Mean of each column over the sampled rows
Statistics.colStats(dense_data).mean()