导入必要的工具包

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
#降维算法
from sklearn.decomposition import PCA
import time

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the events table from 'events.csv' (path is relative to the working directory).
train = pd.read_csv('events.csv')
# Display the first five rows as a quick sanity check of the columns.
train.head()

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


In [7]:
# Drop the 8 non-feature metadata columns (user, time and location fields),
# keeping event_id and the c_* count columns.
# errors='ignore' keeps this cell idempotent: because `train` is reassigned from
# itself, a second execution would otherwise raise KeyError (columns already gone).
train = train.drop(
    ['user_id', 'start_time', 'city', 'state', 'zip', 'country', 'lat', 'lng'],
    axis=1,
    errors='ignore',
)

In [8]:
# Show the first five rows again to confirm which columns remain after the drop.
train.head()

Unnamed: 0,event_id,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,2,0,2,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,9
1,244999119,2,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,1,0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1,1,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,9


In [10]:
# Number of leading rows of `train` used for the experiment (10,000, not 1,000).
n_trains = 10000
# event_id of the first n_trains rows; it is passed as `y` to train_test_split
# and later used as the reference labelling for the external (V-measure) score.
y_train = train["event_id"].values[:n_trains]
# Feature matrix for the same rows: every remaining column except event_id.
X_train = train.drop(labels=["event_id"], axis=1).values[:n_trains]

In [11]:
# Split the sampled rows 80/20 into a fitting part and a validation part.
# The validation part is used in later cells to compare candidate values of the
# hyperparameter (the number of clusters K). The seed is fixed so the split is repeatable.
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=0,
)



In [12]:
# Report (n_samples, n_features) for the fitting split, then for the validation split.
print(X_train_part.shape, X_val.shape, sep="\n")

(8000, 101)
(2000, 101)


In [13]:
# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能
def K_cluster_analysis(K, X_train, y_train, X_val, y_val, random_state=0):
    """Fit MiniBatchKMeans with K clusters and score the resulting clustering.

    Two scores are computed:

    * an internal score on the fitting data: the silhouette coefficient of
      ``X_train`` under the fitted model's cluster assignments. It is printed
      under the historical label ``CH_score`` even though it is NOT the
      Calinski-Harabasz index.
    * an external score on the held-out data: the V-measure between ``y_val``
      and the clusters predicted for ``X_val``.

    Parameters
    ----------
    K : int
        Number of clusters (passed to MiniBatchKMeans as ``n_clusters``).
    X_train : array-like
        Feature matrix the model is fitted on; also used for the silhouette score.
    y_train : array-like
        Not used by this function; kept so existing call sites keep working.
    X_val : array-like
        Held-out feature matrix whose cluster assignments are predicted.
    y_val : array-like
        Reference labels for ``X_val``, compared with the predicted clusters.
    random_state : int or None, default 0
        Seed forwarded to MiniBatchKMeans so repeated runs give the same
        scores. Pass ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(silhouette, v_score)`` -- the internal and external scores, in the
        same order as the original ``(CH_score, v_score)`` return value.
    """
    start = time.time()

    print("K-means begin with clusters: {}".format(K))

    # Seeded so the K sweep is reproducible across kernel restarts.
    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)
    mb_kmeans.fit(X_train)

    # Cluster assignments for the fitting data and for the held-out data.
    train_clusters = mb_kmeans.predict(X_train)
    y_val_pred = mb_kmeans.predict(X_val)

    # Internal evaluation: silhouette coefficient on the fitting data.
    silhouette = metrics.silhouette_score(X_train, train_clusters)

    # External evaluation: agreement between reference labels and predicted clusters.
    v_score = metrics.v_measure_score(y_val, y_val_pred)

    end = time.time()

    # The "CH_score" label is kept byte-for-byte so new output lines match the
    # ones already recorded in the notebook; the value is the silhouette score.
    # Elapsed time is truncated to whole seconds.
    print("CH_score: {}, time elaps:{}".format(silhouette, int(end - start)))
    print("v_score: {}".format(v_score))

    return silhouette, v_score

In [14]:
# Coarse sweep over the number of clusters K.
Ks = [10, 20, 30, 40, 50, 60]
# Per-K results, filled in the same order as Ks:
#   CH_scores -> internal score (K_cluster_analysis computes it with silhouette_score,
#                despite the CH_ prefix)
#   v_scores  -> external score (V-measure on the validation split)
CH_scores, v_scores = [], []

for K in Ks:
    # Fit on the training part; y_train_part is passed through for signature
    # compatibility, the validation pair drives the external score.
    ch, v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    CH_scores.append(ch)
    v_scores.append(v)

K-means begin with clusters: 10
CH_score: 0.39569289464851704, time elaps:3
v_score: 0.3809305098443002
K-means begin with clusters: 20
CH_score: 0.269848254660972, time elaps:1
v_score: 0.46377184775810965
K-means begin with clusters: 30
CH_score: 0.13743966700005497, time elaps:1
v_score: 0.5321909215368978
K-means begin with clusters: 40
CH_score: 0.09841154693106706, time elaps:1
v_score: 0.6007425749573503
K-means begin with clusters: 50
CH_score: 0.08536045289662411, time elaps:1
v_score: 0.6226931053241956
K-means begin with clusters: 60
CH_score: 0.07901461089559542, time elaps:1
v_score: 0.6647943223592996


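从目前的结果来看，K=10 时内部评价得分最高（注意：输出中标为 CH_score 的数值实际上是轮廓系数 silhouette score）：
K-means begin with clusters: 10
CH_score: 0.39569289464851704, time elaps:3
聚类数 K 越小，轮廓系数越高，因此下面把 K 的搜索范围调小。需要注意的是，v_score 的趋势相反：K 越大，v_score 越高。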

In [15]:
# Finer sweep over small cluster counts K, following up on the coarse sweep.
Ks = [4, 5, 6, 7, 8, 9]
# Per-K results, reset for this sweep and filled in the same order as Ks:
#   CH_scores -> internal score (K_cluster_analysis computes it with silhouette_score,
#                despite the CH_ prefix)
#   v_scores  -> external score (V-measure on the validation split)
CH_scores, v_scores = [], []

for K in Ks:
    # Fit on the training part; y_train_part is passed through for signature
    # compatibility, the validation pair drives the external score.
    ch, v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    CH_scores.append(ch)
    v_scores.append(v)

K-means begin with clusters: 4
CH_score: 0.5644099549829634, time elaps:2
v_score: 0.20446121980959642
K-means begin with clusters: 5
CH_score: 0.5062562093259055, time elaps:1
v_score: 0.2694190593624402
K-means begin with clusters: 6
CH_score: 0.47515938326794527, time elaps:1
v_score: 0.31398696161576417
K-means begin with clusters: 7
CH_score: 0.42214638628746387, time elaps:2
v_score: 0.3527505552152644
K-means begin with clusters: 8
CH_score: 0.38741657639570415, time elaps:1
v_score: 0.3545084753026823
K-means begin with clusters: 9
CH_score: 0.42976883564530766, time elaps:1
v_score: 0.353721093074027


K-means begin with clusters: 4
CH_score: 0.5644099549829634, time elaps:2
K=4 时轮廓系数约为 0.56（超过 0.5），是本次搜索范围内的最高值，说明此时同一簇内的样本已经比较相似；但对应的 v_score 最低（约 0.20）。