# 基础分布式算法

### 1. 准备工作

配置和启动 PySpark：

In [2]:
# Let findspark locate the Spark installation before pyspark is imported.
# NOTE(review): the path below is machine-specific; adjust it for your setup.
import findspark
findspark.init("/Users/xinby/Library/Spark")

from pyspark.sql import SparkSession
# Local mode
spark = SparkSession.builder.\
    master("local[*]").\
    appName("PySpark RDD").\
    getOrCreate()
# Keep a handle on the SparkContext; the RDD operations below go through it.
sc = spark.sparkContext
# Restrict Spark logging to ERROR-level messages.
sc.setLogLevel("ERROR")
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7fa4c8f900d0>
<SparkContext master=local[*] appName=PySpark RDD>


利用 NumPy 创建一个矩阵，并写入文件：

In [4]:
import numpy as np
# Widen NumPy's printed line width so rows wrap less in the output.
np.set_printoptions(linewidth=100)

# Fix the seed so the generated matrix is reproducible.
np.random.seed(123)
n = 100  # number of rows
p = 5    # number of columns
# n x p matrix of normal random draws.
mat = np.random.normal(size=(n, p))
# Write the matrix as tab-separated text, one matrix row per line.
np.savetxt("mat_np.txt", mat, fmt="%f", delimiter="\t")

PySpark 读取文件并进行一些简单操作：

In [5]:
# Load the text file as an RDD whose elements are the file's lines.
file = sc.textFile("mat_np.txt")

# Print the number of rows of the matrix
print(file.count())

# Blank line
print()

# Print the first 5 rows
text = file.take(5)
print(*text, sep="\n")

[Stage 0:>                                                          (0 + 2) / 2]

100

-1.085631	0.997345	0.282978	-1.506295	-0.578600
1.651437	-2.426679	-0.428913	1.265936	-0.866740
-0.678886	-0.094709	1.491390	-0.638902	-0.443982
-0.434351	2.205930	2.186786	1.004054	0.386186
0.737369	1.490732	-0.935834	1.175829	-1.253881


                                                                                

In [9]:
# Show the type of the RDD itself and the type of one of its elements.
# `first()` is evaluated only where its result is actually used.
print(type(file))
print(type(file.first()))

<class 'pyspark.rdd.RDD'>
<class 'str'>


23/03/30 20:19:45 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:593)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$1(BlockManagerMasterEndpoint.scala:592)
	at org.apache.spar

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 64377)
Traceback (most recent call last):
  File "/Users/xinby/opt/anaconda3/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/xinby/opt/anaconda3/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/xinby/opt/anaconda3/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/xinby/opt/anaconda3/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/xinby/Library/Spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/Users/xinby/Library/Spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/Users/xinby/Library/Spark/python/pyspark/accumulators.py", line 257

### 2. 进行分区映射（MapPartitions）

In [None]:
# Redistribute the lines into 10 partitions; `file` itself is left unchanged.
file_p10 = file.repartition(10)
# Compare the partition counts before and after repartitioning.
print(file.getNumPartitions())
print(file_p10.getNumPartitions())

In [None]:
# str => np.array
def str_to_vec(line: str) -> np.ndarray:
    """Parse one tab-separated line of numbers into a 1-D float array.

    Args:
        line: A string of numeric fields separated by tab characters.

    Returns:
        A 1-D NumPy array of floats with one entry per field.

    Raises:
        ValueError: If a field cannot be converted by ``float``.
    """
    # Split the line into its tab-separated fields.
    fields = line.split("\t")
    # ``float`` is already a callable; no lambda wrapper is needed.
    return np.fromiter(map(float, fields), dtype=float)

# Iter[str] => Iter[matrix]
def part_to_mat(iterator):
    """Turn one partition of text lines into a single stacked matrix.

    Every line is parsed with ``str_to_vec`` and the resulting row vectors
    are stacked vertically. This is a generator that yields exactly one
    array, which is the shape of function ``mapPartitions`` is given below.

    Args:
        iterator: Iterable of tab-separated text lines.

    Yields:
        A 2-D array with one row per line, or an empty 1-D array when the
        iterator produced no lines.
    """
    # Materialize the partition: one parsed row vector per line.
    rows = [str_to_vec(line) for line in iterator]

    # An empty partition has nothing to stack, so emit an empty array.
    yield np.vstack(rows) if rows else np.array([])

In [None]:
# Convert each partition into one matrix block, then drop the empty arrays
# that part_to_mat yields for partitions without any lines.
dat = file_p10.mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0)
# Number of non-empty matrix blocks.
print(dat.count())

### 3. 矩阵乘法 $Xv$

模拟数据和真实值：

In [None]:
# Fix the seed so the vector v is reproducible.
np.random.seed(123)
# Vector of length p drawn from a uniform distribution.
v = np.random.uniform(size=p)
# Reference result computed locally with NumPy on the full matrix.
res = mat.dot(v)
res

每个 RDD 分区上进行计算：

In [None]:
# Multiply every matrix block by v and collect the results into a list.
res_part = dat.map(lambda x: x.dot(v)).collect()
res_part

拼接分块结果（注意：`repartition` 会对数据进行 shuffle，不保证保留原始行序，因此拼接后各行的顺序可能与上面的 `res` 不一致）：

In [None]:
# Join the per-block result vectors into a single vector.
# NOTE(review): repartition() shuffles rows, so the order here may differ
# from the locally computed `res` — confirm before comparing element-wise.
np.concatenate(res_part)

### 4. 矩阵乘法 $X'X$

真实值：

In [None]:
# Reference X'X computed locally with NumPy on the full matrix.
res = mat.transpose().dot(mat)
res

每个 RDD 分区上进行计算：

In [None]:
# Compute X_i'X_i on every block, then add the blocks' results together.
# Note that this rebinds `res`, replacing the local reference value.
res = dat.map(lambda x: x.transpose().dot(x)).reduce(lambda x, y: x + y)
res

### 5. 矩阵乘法 $X'v$

以 `mat` 的前4列为 `X`，最后一列为 `v`：

In [None]:
# All columns of mat except the last form X.
X = mat[:, :-1]
# The last column of mat is used as v (this rebinds the earlier `v`).
v = mat[:, -1]
# Reference X'v computed locally with NumPy.
res = X.transpose().dot(v)
res

每个 RDD 分区上进行计算：

In [None]:
def Xitv(part):
    """Compute X_i' v_i for one matrix block.

    Args:
        part: A 2-D array; all columns but the last are taken as X_i and
            the last column as v_i.

    Returns:
        The product of the transposed X_i with v_i, a 1-D array with one
        entry per column of X_i.
    """
    # Separate the leading columns from the trailing one in a single step.
    lead_cols, last_col = part[:, :-1], part[:, -1]
    return lead_cols.T.dot(last_col)

# Compute X_i'v_i on every block, then add the blocks' results together.
res = dat.map(Xitv).reduce(lambda x, y: x + y)
res

关闭 Spark 连接：

In [None]:
# Shut down the SparkContext now that the computations are finished.
sc.stop()