# 分布式回归模型

### 1. 准备工作

配置和启动 PySpark：

In [3]:
import findspark
findspark.init("/Users/xinby/Library/Spark")

from pyspark.sql import SparkSession
# Local mode: the master URL is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark RDD")
    .getOrCreate()
)
sc = spark.sparkContext
# Restrict Spark's log output to errors.
sc.setLogLevel("ERROR")
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7f9098958e20>
<SparkContext master=local[*] appName=PySpark RDD>


利用 Numpy 生成模拟数据，并写入文件。首先生成 $n\gg p$ 的数据：

In [11]:
import os
import numpy as np
from scipy.special import expit, logit
# Allow up to 100 characters per line when NumPy arrays are printed.
np.set_printoptions(linewidth=100)

# Fixed seed so the simulated data are reproducible.
np.random.seed(123)
n = 100000
p = 100
# Design matrix, true coefficients, and response with N(0, 0.1^2) noise.
x1 = np.random.normal(size=(n, p))
beta1 = np.random.normal(size=p)
y1 = x1.dot(beta1) + np.random.normal(scale=0.1, size=n)
# The response goes in the first column, followed by the predictors.
dat = np.hstack((y1.reshape(n, 1), x1))
# Create the output directory first: np.savetxt cannot open the file if
# the directory is missing.
os.makedirs("data", exist_ok=True)
np.savetxt("data/reg_tall.txt", dat, fmt="%f", delimiter="\t")

以及 $n<p$ 的数据：

In [15]:
# Wide design: fewer observations than predictors.
n, p = 500, 5000
x2 = np.random.normal(size=(n, p))
beta2 = np.random.normal(size=p)
# Sparse truth: every coefficient from index 10 onwards is zero.
beta2[10:] = 0.0
y2 = x2.dot(beta2) + np.random.normal(scale=0.1, size=n)
# Response in the first column, predictors after it.
dat = np.hstack((y2[:, np.newaxis], x2))
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("dat", exist_ok=True)
np.savetxt("dat/reg_wide.txt", dat, fmt="%f", delimiter="\t")

PySpark 读取文件并进行一些简单操作：

In [16]:
file1 = sc.textFile("data/reg_tall.txt")

# Number of lines in the file, i.e. rows of the matrix.
n_lines = file1.count()
print(n_lines)

# Blank separator line.
print()

# Show the first 5 lines, each cut to 70 characters plus an ellipsis.
text = file1.map(lambda line: line[:70] + "...").take(5)
print("\n".join(text))

                                                                                

100000

-0.492572	-1.085631	0.997345	0.282978	-1.506295	-0.578600	1.651437	-2....
11.643889	0.642055	-1.977888	0.712265	2.598304	-0.024626	0.034142	0.17...
-11.441109	0.703310	-0.598105	2.200702	0.688297	-0.006307	-0.206662	-0...
0.618083	0.765055	-0.828989	-0.659151	0.611124	-0.144013	1.316606	-0.7...
-8.438569	1.534090	-0.529914	-0.490972	-1.309165	-0.008660	0.976813	-1...


In [19]:
file2 = sc.textFile("dat/reg_wide.txt")

# Number of lines in the file, i.e. rows of the matrix.
n_lines = file2.count()
print(n_lines)

# Blank separator line.
print()

# Show the first 5 lines, each cut to 70 characters plus an ellipsis.
text = file2.map(lambda line: line[:70] + "...").take(5)
print("\n".join(text))

500

-5.682478	-0.942972	1.101249	0.462247	1.822374	1.970292	0.360064	1.532...
-2.807761	0.078059	0.428170	2.697175	0.283865	-0.731593	-0.401084	-0.8...
4.174590	0.357621	-0.013570	-2.141000	-1.364680	-2.109942	0.467017	0.6...
-1.325983	-3.099652	1.617915	-0.085332	-0.024646	0.780907	-0.597756	0....
-2.205896	-1.147221	0.303183	1.318105	0.848772	-1.042922	0.058829	0.07...


### 2. $n\gg p$

回归系数估计值的显式解为 $\hat{\beta}=(X'X)^{-1}X'y$。当 $n\gg p$ 且 $p$ 不太大时，$X'X$ 为 $p\times p$ 矩阵，$X'y$ 为 $p\times 1$ 向量，均可放入内存。因此，此时问题的核心在于计算 $X'X$ 与 $X'y$。

- 要始终保证$X,y$在同一个RDD中
- 将$X,y$作为一个整体进行RDD partition

首先进行分区映射：

In [7]:
# Redistribute the lines of file1 over 10 partitions and print the
# resulting partition count.
file_p10 = file1.repartition(10)
print(file_p10.getNumPartitions())

10


In [8]:
# str => np.ndarray
def str_to_vec(line):
    """Parse one tab-separated line of numbers into a 1-D float array.

    Args:
        line: Text whose fields are separated by tab characters; every
            field must be accepted by ``float``.

    Returns:
        A one-dimensional NumPy array of dtype ``float`` with one entry
        per field.
    """
    fields = line.split("\t")
    return np.array([float(field) for field in fields], dtype=float)

# Iter[str] => Iter[matrix]
def part_to_mat(iterator):
    """Turn the text lines of one partition into a single matrix.

    Intended for ``mapPartitions``: every line is parsed with
    ``str_to_vec`` and the resulting vectors are stacked row-wise.

    Args:
        iterator: Iterator over the text lines of one partition.

    Yields:
        Exactly one item: the stacked 2-D matrix, or an empty 1-D array
        when the partition contains no lines.
    """
    rows = [str_to_vec(line) for line in iterator]
    # np.vstack rejects an empty list, so an empty partition yields an
    # empty placeholder array instead.
    yield np.vstack(rows) if rows else np.array([])

In [9]:
# Parse every partition into one matrix and drop the empty placeholder that
# part_to_mat emits for partitions without lines.
# cache() keeps the parsed matrices after the first action, so the later
# actions on `dat` do not re-read and re-parse the text file each time.
dat = file_p10.mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0).cache()
print(dat.count())



10


                                                                                

In [10]:
# Inspect the first element of the RDD, i.e. the matrix built from one partition.
dat.first()

array([[13.881828,  1.131538, -0.32151 , ...,  1.456532,  1.046854, -0.409166],
       [12.336276,  1.966215,  0.938504, ..., -0.456539, -0.139554, -0.692508],
       [-1.703982, -0.501628,  0.047969, ..., -0.496291, -0.645332, -1.267813],
       ...,
       [23.281892,  0.14101 ,  0.80979 , ...,  0.987011,  2.248003, -0.740502],
       [ 0.931458,  0.837843, -0.610115, ...,  0.087622, -0.270009, -1.254825],
       [10.651135,  0.346765, -0.522064, ..., -0.528144,  0.084826,  0.922815]])

注意此时每个分区上的数据同时包含了因变量(y在第一列)和自变量，在使用自变量时，要将第一列排除。计算 $X'X$：

In [11]:
def _gram(part):
    """Return X'X for one partition; column 0 holds y and is skipped."""
    x = part[:, 1:]
    return x.T.dot(x)


# Sum the per-partition Gram matrices to obtain the full X'X.
xtx = dat.map(_gram).reduce(lambda acc, g: acc + g)
xtx

                                                                                

array([[ 9.92814274e+04,  5.64799792e+02,  4.68363424e+01, ...,  1.31821863e+02, -1.81293885e+01,
         2.77125486e+02],
       [ 5.64799792e+02,  1.00090223e+05,  4.34513678e+02, ...,  3.22355415e+02,  1.98867239e+02,
         9.69607877e+01],
       [ 4.68363424e+01,  4.34513678e+02,  9.92600734e+04, ..., -2.01876920e+02, -6.97509673e+02,
         2.97558656e+02],
       ...,
       [ 1.31821863e+02,  3.22355415e+02, -2.01876920e+02, ...,  9.98741108e+04,  6.09301190e+01,
         1.44257513e+02],
       [-1.81293885e+01,  1.98867239e+02, -6.97509673e+02, ...,  6.09301190e+01,  9.99329830e+04,
        -2.53690380e+02],
       [ 2.77125486e+02,  9.69607877e+01,  2.97558656e+02, ...,  1.44257513e+02, -2.53690380e+02,
         9.95605285e+04]])

计算 $X'y$：

In [12]:
def _cross(part):
    """Return X'y for one partition; y is column 0, X the remaining columns."""
    return part[:, 1:].T.dot(part[:, 0])


# Sum the per-partition products to obtain the full X'y.
xty = dat.map(_cross).reduce(lambda acc, v: acc + v)
xty

                                                                                

array([-5.72523147e+04, -1.13445442e+05,  1.14296626e+05,  6.70903227e+04,  6.40148536e+04,
       -1.73288654e+05,  9.65899313e+04, -7.04664986e+04,  1.17713274e+05, -1.24568031e+04,
       -5.50355360e+04,  4.50494339e+04,  1.53412736e+05,  9.03011121e+04,  3.68938036e+04,
       -6.74204369e+04, -1.38999203e+04,  1.03492109e+05,  1.00816430e+05, -2.49288673e+04,
        2.76828665e+04, -1.96876397e+05,  9.23669947e+04, -4.36091041e+04, -1.29989914e+05,
       -1.15080876e+05, -9.68461454e+04, -4.38833665e+04, -3.92874942e+04,  7.50460497e+03,
        6.75428356e+04,  4.49467215e+04,  7.90760279e+04,  1.28488448e+04, -1.64995344e+04,
        1.29426796e+05, -8.84459583e+04, -1.54233257e+05,  1.23672995e+03,  5.52665865e+03,
        1.74929996e+02, -3.51956381e+04, -1.75937385e+05, -1.33574238e+05, -1.57921700e+05,
       -1.29883192e+05,  9.35018217e+04,  8.71584103e+04, -9.00909788e+04, -1.11506523e+05,
       -6.16019121e+04, -1.57272495e+05, -1.46453819e+05, -1.40527176e+05,  3.13

**Note: 这两步的操作xtx,xty导致数据读取了两遍，造成了大量的通信成本，因此两步操作可以合并，就是直接以这个matrix进行操作**

**请实现！！！！！！！**

此时剩下的操作即为求解线性方程组。由于 $p$ 较小，故可以在内存中完成：

In [13]:
# Solve the linear system xtx @ bhat = xty in memory on the driver.
bhat = np.linalg.solve(xtx, xty)
bhat

array([-0.58422145, -1.11605591,  1.1559347 ,  0.68617539,  0.64534766, -1.70930592,  0.87296172,
       -0.69022365,  1.21031103, -0.18000063, -0.59629703,  0.45240243,  1.55780574,  0.93400416,
        0.33550102, -0.62756266, -0.16682495,  1.03999291,  0.99266921, -0.2220117 ,  0.26884866,
       -1.9555697 ,  0.93141768, -0.46989397, -1.3011378 , -1.08472   , -0.92674644, -0.46976713,
       -0.41262747,  0.09672668,  0.73804542,  0.43592289,  0.78154368,  0.09788819, -0.20673303,
        1.35677125, -0.84339913, -1.57384018, -0.02816233,  0.04550477, -0.00426462, -0.32000893,
       -1.73697639, -1.35751444, -1.61142709, -1.29011551,  0.92229365,  0.92287512, -0.87182444,
       -1.11120148, -0.64178182, -1.54097709, -1.47574519, -1.40014092,  0.05651123, -2.06681374,
        0.2406474 , -1.45930317, -0.4989418 , -1.08579381,  1.22881498,  0.71939479,  0.4744752 ,
       -0.21579098,  1.19156816, -0.18989885,  0.42140091,  0.48960889, -0.28646643, -0.92767184,
       -2.54505269, 

与真值进行对比：

In [20]:
# True coefficients used to simulate the tall data, for comparison with bhat.
beta1

array([-0.58441387, -1.11534971,  1.15570434,  0.68635474,  0.64559564, -1.70989894,  0.87296263,
       -0.69061605,  1.21015702, -0.17972039, -0.59593691,  0.45252029,  1.55835773,  0.93393225,
        0.33544231, -0.62751865, -0.16601382,  1.03999001,  0.99253053, -0.22189606,  0.26883567,
       -1.95555529,  0.93138627, -0.47006585, -1.30103516, -1.0852571 , -0.92710125, -0.46948194,
       -0.41250335,  0.09711287,  0.73781056,  0.43563306,  0.78146188,  0.09794209, -0.20673932,
        1.3563857 , -0.84334031, -1.57440936, -0.02785942,  0.04491426, -0.00378645, -0.32005209,
       -1.73699327, -1.35755085, -1.61153494, -1.29018054,  0.92198227,  0.92329806, -0.87182087,
       -1.11109253, -0.64117813, -1.54063266, -1.47551246, -1.40012474,  0.05626203, -2.06673395,
        0.24069911, -1.45962908, -0.49910593, -1.08548   ,  1.22895895,  0.71943325,  0.47494861,
       -0.21579343,  1.19116701, -0.19017216,  0.4216233 ,  0.48972018, -0.2864368 , -0.92757763,
       -2.54487881, 

**思考题**：实际计算回归时，我们一般会加入截距项。此时应该如何修改程序，使其可以输出包含截距项的回归系数？

$$X^* = [1,X]$$


### 3. $n<p$ 岭回归

- 当 $n<p$ 时，$X'X$ 不可逆，此时最小二乘**没有唯一解** （事实上是有无数组解使得$\min S_c=0$，即可以完美拟合）【并不是说不存在解！！】
- 此时的OLS并不影响预测性，但是会影响解释性（回归系数不唯一，难以解释）

首先获取维度信息：

In [None]:
# Number of observations: one per line of the wide data file.
n = file2.count()
n

In [None]:
# Number of predictors: fields on the first line minus the response column.
p = str_to_vec(file2.first()).shape[0] - 1
p

然后创建分区 RDD：

In [None]:
# Parse every partition into one matrix and drop the empty placeholder that
# part_to_mat emits for partitions without lines.
# cache() matters here: each conjugate-gradient iteration runs a new job
# over `dat`, and without it every iteration would re-read and re-parse
# the text file.
dat = file2.repartition(10).mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0).cache()
print(dat.count())

当 $n<p$ 时，$X'X$ 不再可逆，因此最小二乘估计没有唯一解。此时我们可以采用岭回归的方法，其在最小二乘损失函数的基础上加入一个惩罚项 $\lambda \Vert\beta\Vert^2$。岭回归估计的显式解为 $\hat{\beta}_\lambda=(X'X+\lambda I)^{-1}X'y$，其中 $\lambda>0$ 是一个给定的正数。(需要调参进行选取)

$$Loss = \Vert Y-X\beta\Vert_2^2+\lambda \Vert\beta\Vert_2^2$$

此时的显式解为：

$$\hat\beta_\lambda = (X^TX+\lambda I)^{-1}X^TY$$

一定存在解（试证）


但注意到 $X'X+\lambda I$ 是一个高维的矩阵($p\times p$)，难以直接进行求解。因此我们采用共轭梯度法（参见 [lec7-cg.ipynb](lec7-cg.ipynb)）：

In [None]:
def cg(Afn, b, x0, eps=1e-3, print_progress=False, **Afn_args):
    """Solve ``A x = b`` with the conjugate gradient method.

    ``A`` is never formed explicitly; it is only accessed through ``Afn``,
    which must return the product ``A @ v`` for a vector ``v``. The method
    is intended for symmetric positive definite ``A`` (the standard CG
    assumption); this function does not check it.

    Args:
        Afn: Callable ``Afn(v, **Afn_args)`` returning ``A @ v``.
        b: Right-hand-side vector (1-D NumPy array).
        x0: Initial guess. It is copied as floats and never modified.
        eps: Stop once the Euclidean norm of the residual is below this.
        print_progress: If true, print the residual norm after each
            iteration.
        **Afn_args: Extra keyword arguments forwarded to every ``Afn`` call.

    Returns:
        The approximate solution as a new float array.
    """
    m = b.shape[0]
    # Work on a float copy: the caller's x0 stays untouched, and an integer
    # x0 cannot break the in-place float update of x below.
    x = np.array(x0, dtype=float)
    # Initial residual.
    r = b - Afn(x, **Afn_args)
    # x0 may already be accurate enough (e.g. b == 0 with x0 == 0). Without
    # this check the first step would compute 0 / 0 and fill x with NaN.
    if np.linalg.norm(r) < eps:
        return x
    # Initial search direction.
    p = r

    # At most m iterations, where m is the length of b.
    for k in range(m):
        # The only place A is touched: one matrix-vector product.
        Ap = Afn(p, **Afn_args)
        rr = r.dot(r)
        alpha = rr / p.dot(Ap)
        # Move along the current search direction.
        x += alpha * p
        # Residual after the step.
        rnew = r - alpha * Ap
        # Convergence test on the residual norm.
        norm = np.linalg.norm(rnew)
        if print_progress:
            print(f"Iter {k}, residual norm = {norm}")
        if norm < eps:
            break
        beta = rnew.dot(rnew) / rr
        # Next search direction from the new residual and the previous one.
        p = rnew + beta * p
        r = rnew

    return x

先计算 $b=X'y$：

In [None]:
# Right-hand side b = X'y: each partition contributes X_part' y_part
# (y is column 0, X the remaining columns) and the pieces are summed.
b = dat.map(lambda part: part[:, 1:].transpose().dot(part[:, 0])).reduce(lambda x, y: x + y)
b

我们需要定义一个函数计算 $(X'X+\lambda I)v=X'Xv+\lambda v$，其中第一项可以分布式进行（参见笔记）。

In [None]:
def xtxv(part, v):
    """Return ``X'X v`` for a single in-memory data block.

    This function is not distributed; it works on one matrix only.

    Args:
        part: 2-D array whose first column is y and whose remaining
            columns form X.
        v: Vector with one entry per column of X.

    Returns:
        The product ``X' (X v)``.
    """
    features = part[:, 1:]
    # Multiply X by v first so the p x p matrix X'X is never built.
    xv = np.dot(features, v)
    return np.dot(features.T, xv)

def ridge_prod(v, lam, rdd):
    """Return ``X'X v + lam * v``, computing ``X'X v`` over the RDD.

    Args:
        v: Vector to multiply.
        lam: Ridge penalty weight.
        rdd: RDD whose elements are row blocks of the data matrix (first
            column y, remaining columns X). ``xtxv`` is applied to every
            block and the results are summed with ``reduce``.

    Returns:
        The vector ``X'X v + lam * v``.
    """
    block_products = rdd.map(lambda block: xtxv(block, v))
    gram_times_v = block_products.reduce(lambda acc, term: acc + term)
    return gram_times_v + lam * v

接下来调用 CG 函数，取 $\lambda=0.01 n$：

In [None]:
# Ridge penalty proportional to the sample size.
lam = 0.01 * n
# Solve (X'X + lam I) x = b by CG from a zero start; lam and rdd are
# forwarded to ridge_prod on every call.
sol = cg(ridge_prod, b, x0=np.zeros(shape=p), eps=1e-3, print_progress=True, lam=lam, rdd=dat)

In [None]:
# First 30 entries of the CG solution.
sol[:30]

关闭 Spark 连接：

In [None]:
# Stop the SparkContext created at the top of the notebook.
sc.stop()