In [287]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report

In [409]:
# 读取数据carEvaluation数据集
path = u'C:/Users/jxjsj/Desktop/JupyterHome/Data/carEvaluation.txt'
dataSet = pd.read_csv(path, header = None)
x_temp, y_temp = np.split(dataSet, (6,), axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_temp, y_temp, random_state=1, train_size=0.8)



In [413]:
# 读取数据wine_quality
path = u'C:/Users/jxjsj/Desktop/JupyterHome/Data/winequality-white-test.csv'
dataSet = pd.read_csv(path, header = 0)
x_temp, y_temp = np.split(dataSet, (11,), axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_temp, y_temp, random_state=1, train_size=0.8)



In [415]:
from sklearn import datasets
iris = datasets.load_iris()
x_temp = pd.DataFrame(iris.data, columns = ['Sepal.L','Sepal.W','Petal.L','Petal.W'])
y_temp = pd.DataFrame(iris.target, columns =['Class'])
x_train, x_test, y_train, y_test = train_test_split(x_temp, y_temp, random_state=1, train_size=0.7)



In [404]:
# 属性变量编码 方法1 好用！！，方便变回原数据进行展示
vec = DictVectorizer(sparse=False)
x_temp = vec.fit_transform(x_temp.to_dict(orient='record'))   #对训练数据的特征进行提取
x_train = vec.fit_transform(x_train.to_dict(orient='record'))
x_test = vec.fit_transform(x_test.to_dict(orient='record'))
vec.feature_names_

['0=high',
 '0=low',
 '0=med',
 '0=vhigh',
 '1=high',
 '1=low',
 '1=med',
 '1=vhigh',
 '2=2',
 '2=3',
 '2=4',
 '2=5more',
 '3=2',
 '3=4',
 '3=more',
 '4=big',
 '4=med',
 '4=small',
 '5=high',
 '5=low',
 '5=med']

In [53]:
carCluster = KMeans( n_clusters=3, 
                     init='k-means++', 
                     n_init=10, 
                     max_iter=300, 
                     tol=0.0001, 
                     precompute_distances='auto', 
                     verbose=0, 
                     random_state=1, 
                     copy_x=True, 
                     n_jobs=None, 
                     algorithm='auto')
carCluster.fit(x_temp)
metrics.adjusted_rand_score(carCluster.labels_, y_temp['Class'])

0.7302382722834697

In [84]:
carCluster = DBSCAN(eps=1.3, 
                    min_samples=2, 
                    metric='cityblock', 
                    metric_params=None, 
                    algorithm='auto', 
                    leaf_size=2, 
                    p=None, 
                    n_jobs=None)
carCluster.fit(x_temp)
metrics.adjusted_rand_score(carCluster.labels_, y_temp['Class'])

0.5681159420289855

In [110]:
carCluster = GaussianMixture(n_components=3, 
                             covariance_type='full', 
                             tol=0.001, 
                             reg_covar=1e-06, 
                             max_iter=100, 
                             n_init=1, 
                             init_params='kmeans', 
                             weights_init=None, 
                             means_init=None, 
                             precisions_init=None, 
                             random_state=1, 
                             warm_start=False, 
                             verbose=0, 
                             verbose_interval=10)
carCluster.fit(x_temp)
metrics.adjusted_rand_score(carCluster.predict(x_temp), y_temp['Class'])

0.9038742317748124

In [388]:
# 学习向量量化，个人撰写
import heapq
def LVQ_ML(X, Y, prototype_num = 10, leaning_rate = 0.1, max_iter = 200, balanced=False, X_test=None):
    '''
    X             ：训练数据的特征向量矩阵，DataFrame
    Y             ：训练数据的标签，一列DataFrame
    prototype_num ：原型向量的预设类别标记个数（通常大于标签类别个数）
    leaning_rate  ：原型向量的更新学习率
    max_iter      ：最大迭代轮数
    balanced      ：是否增大少量样本的抽取概率
    X_test        ：测试数据的特征向量矩阵，DataFrame
    output        ：prototype_v：原型向量
                    y_label_pro_v：原型向量对应的标签
                    x_cluster_lst：所有x的簇，即子类
                    x_fit_label：预测的x的标签
    较适用均衡数据，数值型特征向量，属性型标签
    '''
    x_temp = np.array(X)
    y_temp = np.array(Y).ravel()
    
    y_label_lst = list(set(y_temp.tolist()))
    y_label_num = len(y_label_lst)
    
    # 计算每个标签的子簇数量，平均分配（故适用平衡数据）
    prototype_num_i = prototype_num//y_label_num
    prototype_num_lst = [prototype_num_i for i in range(y_label_num)]
    
    for i in range(prototype_num-prototype_num_i*y_label_num):
        prototype_num_lst[i] += 1
    
    # 将每个标签对应的数据拆分为多个Array
    def cut_with_label(df, label):
        df_dct = {}
        labels_lst = list(set(df[label].tolist()))
        for yi in labels_lst:
            df_temp = df[df[label].isin([yi])]
            del df_temp[label]
            df_temp = np.array(df_temp)
            df_dct[yi] = df_temp
        n_list = []
        for key in df_dct.keys():
            n_list.append(len(df_dct[key]))
        return df_dct
    df_dct = cut_with_label(pd.concat([X,Y],axis=1),Y.columns[0])
    
    # 随机生成初始化原型向量
    y_label_pro_v = []
    for n in range(len(y_label_lst)):
        # 读取该标签对应的特征向量矩阵
        df_temp = df_dct[y_label_lst[n]]
        pro_num_for_yi = prototype_num_lst[n]
        pro_id_for_yi = np.random.choice(a=len(df_temp), size=pro_num_for_yi, replace=False, p=None)
        # 保存初始化原型向量的标签
        for cnt in range(pro_num_for_yi):
            y_label_pro_v.append(y_label_lst[n])
        # 合并原型向量
        try:
            prototype_v = np.concatenate((prototype_v,df_temp[list(pro_id_for_yi),:]),axis=0)
        except:
            prototype_v = df_temp[list(pro_id_for_yi),:]

    # 制作非平衡数据时使用的对应随机抽取概率，p=n/n_yi
    y_b = y_temp
    sum_n = len(y_b)
    labelCounts = {}
    for i in y_b:
        if i not in labelCounts.keys(): 
            labelCounts[i] = 0
        labelCounts[i] += 1
    for i in labelCounts.keys():
        labelCounts[i] = sum_n/labelCounts[i]
    prob_list_temp = [labelCounts[i] for i in y_temp]
    sum_prob = sum(prob_list_temp)
    prob_list = [i/sum_prob for i in prob_list_temp]
    
    # 开始迭代更新原型向量    
    while max_iter > 0:
        
        # 是否使用平衡数据概率
        if balanced:
            j = np.random.choice(len(x_temp), 1, p=prob_list)[0]
            
        else:
            j = np.random.choice(len(x_temp), 1)[0]
        
        x_j = x_temp[j]
        y_j = y_temp[j]
        # 计算随机取得的x_j与各原型向量的L2范数
        dist_lst = [np.linalg.norm(x_j - xi_v) for xi_v in prototype_v]
        i_v = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
        #  更新原型向量
        if y_j == y_label_pro_v[i_v]:
            prototype_v[i_v] = prototype_v[i_v] + leaning_rate*(x_j-prototype_v[i_v])
        else:
            prototype_v[i_v] = prototype_v[i_v] - leaning_rate*(x_j-prototype_v[i_v])       
        max_iter -= 1
    
    # 保存所有训练数据的簇，以及预测的标签
    x_cluster_lst = []
    x_fit_label = []
    for x_i in x_temp:
        dist_lst = [np.linalg.norm(x_i - xi_v) for xi_v in prototype_v]
        i_cluster = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
        x_i_cluster = str(y_label_pro_v[i_cluster])+'-'+str(i_cluster)
        x_cluster_lst.append(x_i_cluster)
        x_fit_label.append(y_label_pro_v[i_cluster])
        
    # 划分测试数据的簇和预测其标签
    x_test_cluster_lst = []
    x_test_fit_label = []
    try: 
        if not X_test:
            pass
    except:
        x_test_temp = np.array(X_test)
        for x_i in x_test_temp:
            dist_lst = [np.linalg.norm(x_i - xi_v) for xi_v in prototype_v]
            i_cluster = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
            x_i_cluster = str(y_label_pro_v[i_cluster])+'-'+str(i_cluster)
            x_test_cluster_lst.append(x_i_cluster)
            x_test_fit_label.append(y_label_pro_v[i_cluster])
   
    return prototype_v, y_label_pro_v, x_cluster_lst, x_fit_label, x_test_cluster_lst, x_test_fit_label


prototype_v, y_label_pro_v, x_cluster_lst, x_fit_label, x_test_cluster_lst, x_test_fit_label = \
LVQ_ML(x_train, y_train, prototype_num = 20, leaning_rate = 0.1, max_iter = 1000, X_test = x_test, balanced=True)

print(metrics.adjusted_rand_score(y_train['Class'], x_fit_label))
print(classification_report(y_train['Class'], x_fit_label))
print(metrics.adjusted_rand_score(y_test['Class'], x_test_fit_label))
print(classification_report(y_test['Class'], x_test_fit_label))

0.8922290101649087
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       0.97      0.91      0.94        32
           2       0.92      0.97      0.95        37

   micro avg       0.96      0.96      0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        13

   micro avg       1.00      1.00      1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [410]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [424]:
# 学习向量量化，个人撰写-把随机选择一个向量去更新原型向量改成了遍历
import heapq
def LVQ_ML(X, Y, prototype_num = 10, leaning_rate = 0.1, max_iter = 200, balanced=False, X_test=None):
    '''
    X             ：训练数据的特征向量矩阵，DataFrame
    Y             ：训练数据的标签，一列DataFrame
    prototype_num ：原型向量的预设类别标记个数（通常大于标签类别个数）
    leaning_rate  ：原型向量的更新学习率
    max_iter      ：最大迭代轮数
    balanced      ：是否增大少量样本的抽取概率
    X_test        ：测试数据的特征向量矩阵，DataFrame
    output        ：prototype_v：原型向量
                    y_label_pro_v：原型向量对应的标签
                    x_cluster_lst：所有x的簇，即子类
                    x_fit_label：预测的x的标签
    较适用均衡数据，数值型特征向量，属性型标签
    '''
    x_temp = np.array(X)
    y_temp = np.array(Y).ravel()
    
    y_label_lst = list(set(y_temp.tolist()))
    y_label_num = len(y_label_lst)
    
    # 计算每个标签的子簇数量，平均分配（故适用平衡数据）
    prototype_num_i = prototype_num//y_label_num
    prototype_num_lst = [prototype_num_i for i in range(y_label_num)]
    
    for i in range(prototype_num-prototype_num_i*y_label_num):
        prototype_num_lst[i] += 1
    
    # 将每个标签对应的数据拆分为多个Array
    def cut_with_label(df, label):
        df_dct = {}
        labels_lst = list(set(df[label].tolist()))
        for yi in labels_lst:
            df_temp = df[df[label].isin([yi])]
            del df_temp[label]
            df_temp = np.array(df_temp)
            df_dct[yi] = df_temp
        n_list = []
        for key in df_dct.keys():
            n_list.append(len(df_dct[key]))
        return df_dct
    df_dct = cut_with_label(pd.concat([X,Y],axis=1),Y.columns[0])
    
    # 随机生成初始化原型向量
    y_label_pro_v = []
    for n in range(len(y_label_lst)):
        # 读取该标签对应的特征向量矩阵
        df_temp = df_dct[y_label_lst[n]]
        pro_num_for_yi = prototype_num_lst[n]
        pro_id_for_yi = np.random.choice(a=len(df_temp), size=pro_num_for_yi, replace=False, p=None)
        # 保存初始化原型向量的标签
        for cnt in range(pro_num_for_yi):
            y_label_pro_v.append(y_label_lst[n])
        # 合并原型向量
        try:
            prototype_v = np.concatenate((prototype_v,df_temp[list(pro_id_for_yi),:]),axis=0)
        except:
            prototype_v = df_temp[list(pro_id_for_yi),:]
    
    # 开始迭代更新原型向量    
    while max_iter > 0:
        
        for j in range(len(x_temp)):
            x_j = x_temp[j]
            y_j = y_temp[j]
            # 计算随机取得的x_j与各原型向量的L2范数
            dist_lst = [np.linalg.norm(x_j - xi_v) for xi_v in prototype_v]
            i_v = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
            #  更新原型向量
            if y_j == y_label_pro_v[i_v]:
                prototype_v[i_v] = prototype_v[i_v] + leaning_rate*(x_j-prototype_v[i_v])
            else:
                prototype_v[i_v] = prototype_v[i_v] - leaning_rate*(x_j-prototype_v[i_v])       
        max_iter -= 1
    
    # 保存所有训练数据的簇，以及预测的标签
    x_cluster_lst = []
    x_fit_label = []
    for x_i in x_temp:
        dist_lst = [np.linalg.norm(x_i - xi_v) for xi_v in prototype_v]
        i_cluster = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
        x_i_cluster = str(y_label_pro_v[i_cluster])+'-'+str(i_cluster)
        x_cluster_lst.append(x_i_cluster)
        x_fit_label.append(y_label_pro_v[i_cluster])
        
    # 划分测试数据的簇和预测其标签
    x_test_cluster_lst = []
    x_test_fit_label = []
    try: 
        if not X_test:
            pass
    except:
        x_test_temp = np.array(X_test)
        for x_i in x_test_temp:
            dist_lst = [np.linalg.norm(x_i - xi_v) for xi_v in prototype_v]
            i_cluster = list(map(dist_lst.index, heapq.nsmallest(1, dist_lst)))[0]
            x_i_cluster = str(y_label_pro_v[i_cluster])+'-'+str(i_cluster)
            x_test_cluster_lst.append(x_i_cluster)
            x_test_fit_label.append(y_label_pro_v[i_cluster])
   
    return prototype_v, y_label_pro_v, x_cluster_lst, x_fit_label, x_test_cluster_lst, x_test_fit_label


prototype_v, y_label_pro_v, x_cluster_lst, x_fit_label, x_test_cluster_lst, x_test_fit_label = \
LVQ_ML(x_train, y_train, prototype_num = 20, leaning_rate = 0.1, max_iter = 10, X_test = x_test, balanced=True)

print(metrics.adjusted_rand_score(y_train[y_train.columns[0]], x_fit_label))
print(classification_report(y_train[y_train.columns[0]], x_fit_label))
print(metrics.adjusted_rand_score(y_test[y_train.columns[0]], x_test_fit_label))
print(classification_report(y_test[y_test.columns[0]], x_test_fit_label))

0.9443455220828789
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       0.97      0.97      0.97        32
           2       0.97      0.97      0.97        37

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

0.9312926240202837
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.95      1.00      0.97        18
           2       1.00      0.92      0.96        13

   micro avg       0.98      0.98      0.98        45
   macro avg       0.98      0.97      0.98        45
weighted avg       0.98      0.98      0.98        45



In [163]:
max_p = 0
max_x = 0
for x in [x for x in range(2,35)]:
    carCluster = GaussianMixture(n_components=x, 
                             covariance_type='full', 
                             tol=0.001, 
                             reg_covar=1e-06, 
                             max_iter=100, 
                             n_init=1, 
                             init_params='kmeans', 
                             weights_init=None, 
                             means_init=None, 
                             precisions_init=None, 
                             random_state=1, 
                             warm_start=False, 
                             verbose=0, 
                             verbose_interval=10)
    carCluster.fit(x_temp)
    
    if max_p < metrics.adjusted_rand_score(carCluster.predict(x_temp), y_temp[6]):
        max_x = x
        max_p = metrics.adjusted_rand_score(carCluster.predict(x_temp), y_temp[6])
max_x

KeyError: 6