In [77]:
#导入必要的工具包
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
#降维算法
from sklearn.decomposition import PCA
import time
#保存数据
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [78]:
# Load the two pickled pair collections.
# Context managers are used so the file handles are closed right after loading.
# NOTE(review): both loads read "FE_uniqueUserPairs.pkl", so `event` ends up
# with the same content as `user`. This looks like a copy/paste slip -- confirm
# the intended file name for the event pairs before changing it.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open("FE_uniqueUserPairs.pkl", 'rb') as user_file:
    user = pickle.load(user_file)
with open("FE_uniqueUserPairs.pkl", 'rb') as event_file:
    event = pickle.load(event_file)

In [79]:
# Two empty, independent accumulators that later become the 'user' and
# 'event' columns of the training frame.
userList, eventList = [], []

In [80]:
# Flatten both pair collections into two parallel lists.
#   * a pair from `user`  contributes pair[0] to userList and pair[1] to eventList
#   * a pair from `event` contributes pair[1] to userList and pair[0] to eventList
# The lists are rebuilt from scratch here (instead of appended to), so
# re-running this cell cannot duplicate rows. Iterating an unmodified
# collection twice yields the same order, so both lists stay aligned.
userList = [pair[0] for pair in user] + [pair[1] for pair in event]
eventList = [pair[1] for pair in user] + [pair[0] for pair in event]

In [81]:
# Column mapping for the frame: one column per accumulated list.
d = dict(user=userList, event=eventList)
# Build the training DataFrame from that mapping.
train = pd.DataFrame(d)

In [82]:
# Print a summary of the frame (index range, columns, non-null counts, dtypes, memory usage).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168102 entries, 0 to 168101
Data columns (total 2 columns):
event    168102 non-null int64
user     168102 non-null int64
dtypes: int64(2)
memory usage: 2.6 MB


In [83]:
# Display the first five rows of the frame.
train.head()

Unnamed: 0,event,user
0,1415,1558
1,2929,814
2,1333,231
3,619,3301
4,3004,151


In [84]:
# Target vector: the 'event' column as a NumPy array.
y_train = train["event"].values
# Feature matrix: every column except 'event', as a 2-D NumPy array.
X_train = train.drop(["event"], axis=1).values

In [85]:
# Hold out part of the data as a validation set; the clustering hyper-parameter
# (number of clusters K) is tuned by scoring on this held-out part.
# Returned arrays, in order:
#   X_train_part, X_val -- feature rows of the training / validation part
#   y_train_part, y_val -- matching targets of the training / validation part
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train,          # features to split
    y_train,          # targets, split with the same row assignment
    train_size=0.8,   # fraction of rows kept for training
    random_state=0,   # fixed seed of the shuffling, so the split is reproducible
)

In [86]:
# Fit and score one hyper-parameter point (K clusters) on the validation set.
def K_cluster_analysis(K, X_train, y_train, X_val, y_val, random_state=None):
    """Fit MiniBatchKMeans with ``K`` clusters and score it on the validation set.

    Parameters
    ----------
    K : int
        Number of clusters (passed to the model as ``n_clusters``).
    X_train : array-like
        Samples the model is fitted on.
    y_train : array-like
        Not used by this function; kept so existing call sites keep working.
    X_val : array-like
        Validation samples that are assigned to clusters and scored.
    y_val : array-like
        Reference labels of ``X_val``, used for the external (V-measure) score.
    random_state : int or None, optional
        Seed forwarded to ``MiniBatchKMeans``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    tuple
        ``(CH_score, v_score)``. ``CH_score`` is the silhouette coefficient of
        the validation assignment (internal metric). The name is historical:
        it is *not* the Calinski-Harabasz index. ``v_score`` is the V-measure
        of the assignment against ``y_val`` (external metric). For both
        scores, larger is better.
    """
    start = time.time()

    print("K-means begin with clusters: {}".format(K))

    # Fit on the training part only.
    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)
    mb_kmeans.fit(X_train)

    # Assign validation samples once and reuse the labels for both metrics.
    y_val_pred = mb_kmeans.predict(X_val)

    # Internal evaluation: silhouette coefficient on the validation set.
    silhouette = metrics.silhouette_score(X_val, y_val_pred)

    # External evaluation: agreement between cluster labels and y_val.
    v_score = metrics.v_measure_score(y_val, y_val_pred)

    end = time.time()
    # The "CH_score" label is kept unchanged so new runs stay comparable with
    # previously recorded output; the value printed is the silhouette score.
    print("CH_score: {}, time elaps:{}".format(silhouette, int(end - start)))
    print("v_score: {}".format(v_score))

    return silhouette, v_score

In [87]:
# Search range of the hyper-parameter (number of clusters K): 10, 20, ..., 60.
Ks = list(range(10, 61, 10))
# One entry per K, in the same order as Ks:
#   CH_scores -- first value returned by K_cluster_analysis (internal metric)
#   v_scores  -- second value returned by K_cluster_analysis (external metric)
CH_scores, v_scores = [], []

for K in Ks:
    # Fit on the training part, score on the validation part.
    ch, v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    CH_scores.append(ch)
    v_scores.append(v)

K-means begin with clusters: 10
CH_score: 0.559912672103, time elaps:45
v_score: 0.0387616573781
K-means begin with clusters: 20
CH_score: 0.552669140797, time elaps:37
v_score: 0.0700010482957
K-means begin with clusters: 30
CH_score: 0.560231832784, time elaps:32
v_score: 0.0980143124088
K-means begin with clusters: 40
CH_score: 0.568096304738, time elaps:32
v_score: 0.1180809608
K-means begin with clusters: 50
CH_score: 0.587310174235, time elaps:31
v_score: 0.137917496815
K-means begin with clusters: 60
CH_score: 0.583607722191, time elaps:33
v_score: 0.154668108034


目前最好的结果是 K = 50：CH_score 为 0.587310174235（注意：代码中该值由 `metrics.silhouette_score` 计算，实际是轮廓系数，而不是 Calinski-Harabasz 指数）。下面继续增大 K 的搜索范围。

In [88]:
# Search range of the hyper-parameter (number of clusters K): 70, 80, ..., 120.
Ks = list(range(70, 121, 10))
# One entry per K, in the same order as Ks:
#   CH_scores -- first value returned by K_cluster_analysis (internal metric)
#   v_scores  -- second value returned by K_cluster_analysis (external metric)
# Both lists are started empty again for this range.
CH_scores, v_scores = [], []

for K in Ks:
    # Fit on the training part, score on the validation part.
    ch, v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    CH_scores.append(ch)
    v_scores.append(v)

K-means begin with clusters: 70
CH_score: 0.57883538169, time elaps:33
v_score: 0.167622306394
K-means begin with clusters: 80
CH_score: 0.607575069544, time elaps:33
v_score: 0.181091002062
K-means begin with clusters: 90
CH_score: 0.600665797421, time elaps:33
v_score: 0.193154354225
K-means begin with clusters: 100
CH_score: 0.623605251293, time elaps:33
v_score: 0.203393562945
K-means begin with clusters: 110
CH_score: 0.620943674947, time elaps:33
v_score: 0.212377025987
K-means begin with clusters: 120
CH_score: 0.627374462074, time elaps:32
v_score: 0.222640070395


目前最好的结果是 K = 120：CH_score 为 0.627374462074（该值实际是轮廓系数 silhouette score）。注意 120 是本轮搜索范围的上界，且 v_score 仍随 K 单调上升，因此最优的 K 可能更大，尚未被当前搜索范围覆盖。