# 基础分布式算法

### 1. 准备工作

配置和启动 PySpark：

In [2]:
import findspark
findspark.init("/Users/xinby/Library/Spark")

from pyspark.sql import SparkSession
# 本地模式
spark = SparkSession.builder.\
    master("local[*]").\
    appName("PySpark RDD").\
    getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7fcbf090ed60>
<SparkContext master=local[*] appName=PySpark RDD>


利用 Numpy 创建一个矩阵，并写入文件：

In [3]:
import numpy as np
np.set_printoptions(linewidth=100)

np.random.seed(123)
n = 100
p = 5
mat = np.random.normal(size=(n, p))
np.savetxt("mat_np.txt", mat, fmt="%f", delimiter="\t")

PySpark 读取文件并进行一些简单操作：

In [4]:
file = sc.textFile("mat_np.txt")

# 打印矩阵行数
print(file.count())

# 空行
print()

# 打印前5行
text = file.take(5)
print(*text, sep="\n")

[Stage 0:>                                                          (0 + 2) / 2]

100

-1.085631	0.997345	0.282978	-1.506295	-0.578600
1.651437	-2.426679	-0.428913	1.265936	-0.866740
-0.678886	-0.094709	1.491390	-0.638902	-0.443982
-0.434351	2.205930	2.186786	1.004054	0.386186
0.737369	1.490732	-0.935834	1.175829	-1.253881


                                                                                

In [5]:
file.first()
print(type(file))
print(type(file.first()))

<class 'pyspark.rdd.RDD'>
<class 'str'>


### 2. 进行分区映射（MapPartitions）

In [6]:
file_p10 = file.repartition(10)
print(file.getNumPartitions())
print(file_p10.getNumPartitions())

2
10


In [7]:
# str => np.array
def str_to_vec(line):
    # 分割字符串
    str_vec = line.split("\t")
    # 将每一个元素从字符串变成数值型
    num_vec = map(lambda s: float(s), str_vec)
    # 创建 Numpy 向量
    return np.fromiter(num_vec, dtype=float)

# Iter[str] => Iter[matrix]
def part_to_mat(iterator):
    # Iter[str] => Iter[np.array]
    iter_arr = map(str_to_vec, iterator)

    # Iter[np.array] => list(np.array)
    dat = list(iter_arr)

    # list(np.array) => matrix
    if len(dat) < 1:  # Test zero iterator
        mat = np.array([])
    else:
        mat = np.vstack(dat)

    # matrix => Iter[matrix]
    yield mat

In [8]:
dat = file_p10.mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0)
print(dat.count())

7


### 3. 矩阵乘法 $Xv$

假设：$ X \in \R^{n\times p}, v \in \R^{p}$
- 将$X$按照行进行分块（包含所有列，但行不一定是一行），记为：$X=[X_1;...;X_m]^T$, $X_i \in \R^{n_i\times p}$
- 故$Xv=[X_1v;...;X_mv]^T$
- 这样$X$的一个分块$X_i$就是一个partition

模拟数据和真实值：(核对用)

In [9]:
np.random.seed(123)
v = np.random.uniform(size=p)
res = mat.dot(v)
res

array([-1.65326187,  0.43284335, -0.83326669,  1.65616556,  0.47393998, -1.20594195, -1.09926452,
       -0.24483357, -0.58399139,  2.91984625, -1.22159268,  2.99167578,  0.04907967,  0.00526486,
       -1.78033411, -1.03704672,  1.27253333,  0.0280204 ,  0.88785436,  0.03485989,  1.45756374,
       -1.26733834,  0.89596346, -0.65027554,  1.24724097,  0.01338995, -0.45613812,  1.06057634,
        0.33513133,  0.30420446, -1.8306843 ,  0.81135409,  0.8563569 , -0.59189289, -0.58993733,
        0.85925493,  0.20665867, -2.07373852,  0.23232788, -2.69748055,  1.19285523, -0.22831252,
       -0.75495708,  1.04599886, -0.59922216, -2.14049979, -0.68492854,  0.13322705,  0.11576237,
       -1.07628496,  0.98308603,  2.28403745,  0.31327103,  0.97450293, -2.19087869, -1.38414598,
       -2.06428815, -1.19693787, -2.20837322,  1.79393849,  0.37940968,  0.98364566,  2.12782768,
        0.17228872, -1.42418937, -0.66160026,  0.20736396, -0.42352417, -1.83096405,  0.75557361,
       -1.87660221, 

每个 RDD 分区上进行计算：

In [12]:
# mapPartition 的目的是将一个分区转换成了一个RDD的一个元素，因此以后的操作只要是map即可
# 这里不是reduce是因为不是要把最后分块进行加和，而是想要最后的拼接
res_part = dat.map(lambda x: x.dot(v)).collect() 
print(res_part,type(res_part))
# 这个结果里，一个array就是前面的一个分块

[array([-1.65326236,  0.43284381, -0.83326654,  1.65616548,  0.47393997, -1.20594265, -1.09926439,
       -0.24483374, -0.58399159,  2.91984624]), array([-1.22159275,  2.99167581,  0.04907979,  0.0052652 , -1.78033393, -1.03704719,  1.27253296,
        0.02802034,  0.88785453,  0.03485997]), array([ 1.45756404, -1.26733862,  0.89596327, -0.65027561,  1.24724115,  0.01338989, -0.45613776,
        1.06057673,  0.33513193,  0.30420455,  2.28403732,  0.31327091,  0.97450361, -2.19087935,
       -1.38414658, -2.06428804, -1.19693768, -2.20837397,  1.79393855,  0.37941031]), array([-1.8306849 ,  0.81135346,  0.85635656, -0.59189308, -0.58993783,  0.8592545 ,  0.20665878,
       -2.07373867,  0.23232755, -2.69748044,  0.9836457 ,  2.12782845,  0.17228866, -1.42418964,
       -0.66160031,  0.20736295, -0.4235236 , -1.83096434,  0.75557361, -1.87660252]), array([ 1.19285543, -0.22831212, -0.75495698,  1.04599886, -0.59922233, -2.14049959, -0.68492885,
        0.13322687,  0.11576229, -1.0762844

拼接分块结果：

In [13]:
np.concatenate(res_part)

array([-1.65326236,  0.43284381, -0.83326654,  1.65616548,  0.47393997, -1.20594265, -1.09926439,
       -0.24483374, -0.58399159,  2.91984624, -1.22159275,  2.99167581,  0.04907979,  0.0052652 ,
       -1.78033393, -1.03704719,  1.27253296,  0.02802034,  0.88785453,  0.03485997,  1.45756404,
       -1.26733862,  0.89596327, -0.65027561,  1.24724115,  0.01338989, -0.45613776,  1.06057673,
        0.33513193,  0.30420455,  2.28403732,  0.31327091,  0.97450361, -2.19087935, -1.38414658,
       -2.06428804, -1.19693768, -2.20837397,  1.79393855,  0.37941031, -1.8306849 ,  0.81135346,
        0.85635656, -0.59189308, -0.58993783,  0.8592545 ,  0.20665878, -2.07373867,  0.23232755,
       -2.69748044,  0.9836457 ,  2.12782845,  0.17228866, -1.42418964, -0.66160031,  0.20736295,
       -0.4235236 , -1.83096434,  0.75557361, -1.87660252,  1.19285543, -0.22831212, -0.75495698,
        1.04599886, -0.59922233, -2.14049959, -0.68492885,  0.13322687,  0.11576229, -1.07628444,
       -1.93437101, 

### 4. 矩阵乘法 $X'X$

- 同理进行分块，则最终的结果为：$X'X=X_1'X_1+\dots+X_m'X_m$ (由于这里的假定$X\in\R^{n\times p}, n>>p$，故最终的大小为$p\times p$，是可以存储在内存中的)
- 对于每个结果$X_i'X_i$ ，其大小也是$p\times p$，故也是内存友好的
- 这里由于是相加，因此需要`reduce`，而且参与运算的元素都是一个矩阵

真实值：

In [14]:
res = mat.transpose().dot(mat)
res

array([[ 9.35643395e+01, -2.39739879e-02,  4.75846887e+00,  2.27729644e+01,  5.35952824e+00],
       [-2.39739879e-02,  1.09769750e+02,  2.74564778e+00, -3.29329848e-01,  1.11698743e+01],
       [ 4.75846887e+00,  2.74564778e+00,  1.09077973e+02,  6.41825678e+00, -7.53446301e+00],
       [ 2.27729644e+01, -3.29329848e-01,  6.41825678e+00,  9.95562632e+01,  7.71274621e+00],
       [ 5.35952824e+00,  1.11698743e+01, -7.53446301e+00,  7.71274621e+00,  9.19863380e+01]])

每个 RDD 分区上进行计算：

In [16]:
res = dat.map(lambda x: x.T.dot(x)).reduce(lambda x, y: x + y) # x代表了前面的累积结果，y代表了最新一项
res

array([[ 9.35643453e+01, -2.39794314e-02,  4.75847395e+00,  2.27729716e+01,  5.35953520e+00],
       [-2.39794314e-02,  1.09769741e+02,  2.74564629e+00, -3.29332977e-01,  1.11698788e+01],
       [ 4.75847395e+00,  2.74564629e+00,  1.09077969e+02,  6.41825316e+00, -7.53445385e+00],
       [ 2.27729716e+01, -3.29332977e-01,  6.41825316e+00,  9.95562607e+01,  7.71275158e+00],
       [ 5.35953520e+00,  1.11698788e+01, -7.53445385e+00,  7.71275158e+00,  9.19863445e+01]])

### 5. 矩阵乘法 $X'v$
- 这里设定$X,v$的行数相同，故可以同时进行拆分(在具体操作时认为是在一个大矩阵中)
- $X'v = X_1'v_1+...+X_m'v_m$

以 `mat` 的前4列为 `X`，最后一列为 `v`：

In [17]:
X = mat[:, :-1]
v = mat[:, -1]
res = X.transpose().dot(v)
res

array([ 5.35952824, 11.1698743 , -7.53446301,  7.71274621])

每个 RDD 分区上进行计算：

In [18]:
def Xitv(part):
    '''
    在定义这个map函数的时候，由于整体是一个大矩阵，但是在内部需要将X和v区分开来
    '''
    Xi = part[:, :-1]
    vi = part[:, -1]
    return Xi.transpose().dot(vi)

res = dat.map(Xitv).reduce(lambda x, y: x + y)
res

array([ 5.3595352 , 11.16987882, -7.53445385,  7.71275158])

关闭 Spark 连接：

In [19]:
sc.stop()