## 聚类用于回归预测问题

### 1. 利用聚类结果的转移概率矩阵直接预测

In [2]:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA, PCA, TruncatedSVD, RandomizedPCA
from sklearn.metrics import pairwise

from cluster.models import KMeans, MiniBatchKMeans, SpectralClustering, AffinityPropagation
from cluster.dataset import load_time_series
from cluster.visual import plot_cluster_sequence, plot_cluster_dim_reduction
from cluster.analysis import transition_probability_matrix

In [3]:
# Load the raw series through the project helper
# (the argument 1 presumably selects a dataset — TODO confirm).
data = load_time_series(1)
# Sum 'pwr' over all rows that share the same 'datetime' value.
data = data.groupby('datetime')['pwr'].sum()
# Reshape into rows of 48 values, indexed by the unique calendar dates
# (assumes exactly 48 samples per date, in chronological order — TODO confirm).
data = pd.DataFrame(data.values.reshape(-1, 48), index=np.unique(pd.to_datetime(data.index).date.astype(str)))
kmeans_pp = KMeans(
    n_clusters=8,  # number of clusters
    max_iter=300,  # maximum number of iterations
    n_init=10,  # number of random initialisations; the best clustering is kept
    init='k-means++',  # {'k-means++', 'random', ndarray}
    algorithm='auto',  # {'auto', 'full': EM algorithm, 'elkan': uses the triangle inequality, no sparse support}
)
kmeans_pp.fit(data.values)
# Transition probability matrix built from the labels of the last 100 rows only.
m=transition_probability_matrix(kmeans_pp.labels_[-100:])

In [56]:
def predict(x, k=5):
    """Predict the total of the next row from cluster transition probabilities.

    The rows of ``x`` are clustered into ``k`` groups. The transition
    probability matrix of the resulting label sequence is then used to
    weight the median row total of every cluster, taking the row of the
    matrix that corresponds to the cluster of the last observation.

    Args:
        x: 2-D NumPy array, one observation period per row (one day of
            readings in this notebook), ordered in time.
        k: Number of clusters to fit. Previously this argument was
            ignored and 5 clusters were always used.

    Returns:
        The probability-weighted sum of per-cluster median row totals,
        i.e. the predicted total for the period following the last row.
    """
    kmeans_pp = KMeans(
        n_clusters=k,  # number of clusters
        max_iter=300,  # maximum number of iterations
        n_init=10,  # number of random initialisations; the best clustering is kept
        init='k-means++',  # {'k-means++', 'random', ndarray}
        algorithm='auto',  # {'auto', 'full': EM algorithm, 'elkan': uses the triangle inequality, no sparse support}
    )
    kmeans_pp.fit(x)
    labels = kmeans_pp.labels_
    m = transition_probability_matrix(labels)

    # Total per row, then the median total of the rows in each cluster.
    x_day = x.sum(axis=1)
    labels_median = {
        label: np.median(x_day[labels == label]) for label in np.unique(labels)
    }

    # NOTE(review): the row is looked up by position, which assumes the
    # matrix rows are ordered by label 0..k-1 and that every label has a
    # row — confirm against transition_probability_matrix.
    transition_row = m.iloc[labels[-1]]
    return sum(labels_median[label] * p for label, p in transition_row.items())

In [68]:
def test(x, k=5, init=100, window=100):
    """Walk-forward evaluation of ``predict``; returns the MAPE.

    For every row index ``i`` from ``init`` to the end of ``x``, the
    preceding ``window`` rows are passed to ``predict`` and the result is
    compared with the actual total of row ``i``.

    Args:
        x: 2-D NumPy array, one observation period per row, ordered in time.
        k: Number of clusters forwarded to ``predict``.
        init: Index of the first row to be predicted.
        window: Number of preceding rows used as history for each
            prediction. Must satisfy ``0 < window <= init``.

    Returns:
        Mean absolute percentage error (as a fraction, not in percent)
        between the predictions and the actual row totals.

    Raises:
        ValueError: If ``window`` is not in the range ``(0, init]``.
    """
    # A window longer than the available history would make the slice start
    # negative, which NumPy interprets as counting from the end and yields an
    # empty or wrong training window instead of an error.
    if not 0 < window <= init:
        raise ValueError(
            f"window must satisfy 0 < window <= init, got window={window}, init={init}"
        )

    res = [predict(x[i - window:i, :], k) for i in range(init, x.shape[0])]
    y = x.sum(axis=1)[init:]
    pred = np.array(res)
    mape = np.mean(np.abs(pred - y) / y)
    return mape

In [97]:
# NOTE: despite its name, `pred` holds the MAPE returned by test(), not predictions.
pred = test(data.values, k=5, init=350, window=30)

In [98]:
# Display the value returned by test() (a MAPE, not a prediction array).
pred

0.09815483459150905

### 2. 将聚类标签作为特征输入

In [103]:
# Cluster label of every row. `kmeans_pp` here is the notebook-level model
# fitted with n_clusters=8; the model created inside predict() is local to it.
kmeans_pp.predict(data.values)

array([2, 6, 2, 2, 2, 6, 6, 6, 6, 2, 2, 6, 2, 2, 2, 6, 2, 2, 6, 6, 6, 6,
       6, 2, 2, 6, 6, 6, 6, 0, 2, 2, 0, 6, 6, 6, 6, 2, 2, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 2,
       6, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2,
       0, 6, 6, 2, 6, 2, 2, 6, 2, 6, 6, 6, 2, 5, 6, 6, 6, 2, 2, 2, 2, 6,
       6, 6, 6, 6, 2, 2, 6, 2, 6, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 5, 2,
       1, 5, 5, 4, 7, 5, 5, 1, 5, 5, 7, 4, 1, 5, 5, 1, 5, 4, 7, 5, 1, 5,
       1, 5, 4, 7, 5, 5, 5, 1, 5, 4, 7, 5, 1, 5, 1, 5, 7, 7, 5, 1, 1, 1,
       2, 7, 7, 1, 1, 2, 1, 1, 7, 7, 2, 1, 1, 1, 1, 4, 7, 1, 1, 5, 1, 1,
       7, 7, 2, 1, 1, 1, 1, 7, 4, 1, 1, 2, 1, 1, 7, 4, 7, 1, 5, 1, 1, 7,
       4, 1, 1, 1, 1, 1, 7, 4, 1, 1, 5, 1, 5, 7, 7, 5, 1, 5, 1, 5, 7, 7,
       5, 1, 1, 7, 1, 7, 7, 1, 1, 1, 1, 1, 7, 1, 1, 1, 7, 4, 7, 7, 4, 7,
       7, 7, 1, 1, 1, 7, 5, 1, 5, 1, 5, 7, 4, 1, 5, 1, 1, 5, 7, 4, 1, 5,
       1, 5, 5, 4, 4, 1, 1, 5, 1, 5, 7, 4, 5, 1, 5,