导入必要的工具包

In [74]:
#导入必要的工具包
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [75]:
#读取训练数据
train = pd.read_csv('train.csv')

#显示头5行信息
train.head()

Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [76]:
#查看模型信息
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user              15398 non-null int64
event             15398 non-null int64
invited           15398 non-null int64
timestamp         15398 non-null object
interested        15398 non-null int64
not_interested    15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB


存在日期的类型，要对日期进行处理

In [77]:
def procdess_created_date(df):
    df['Date'] = pd.to_datetime(df['timestamp'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Wday'] = df['Date'].dt.dayofweek
    df['Yday'] = df['Date'].dt.dayofyear
    df['hour'] = df['Date'].dt.hour

    df.drop(['Date', 'timestamp'], axis=1,inplace = True)
    
procdess_created_date(train)

In [78]:
#对日期变换已经处理完毕
train.head()

Unnamed: 0,user,event,invited,interested,not_interested,Year,Month,Day,Wday,Yday,hour
0,3044012,1918771225,0,0,0,2012,10,2,1,276,15
1,3044012,1502284248,0,0,0,2012,10,2,1,276,15
2,3044012,2529072432,0,1,0,2012,10,2,1,276,15
3,3044012,3072478280,0,0,0,2012,10,2,1,276,15
4,3044012,1390707377,0,0,0,2012,10,2,1,276,15


In [79]:
#拿到event 的数据作为y轴
y_train = train.event.values
#把y轴的数据删除  变为x轴
X_train = train.drop("event",axis=1).values

In [80]:
# 将训练集合拆分成训练集和校验集，在校验集上找到最佳的模型超参数（PCA的维数）
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train,y_train, train_size = 0.8,random_state = 0)
#X_train_part x轴的训练集
#X_val  x轴的测试集
#y_train_part  y轴的训练集
#y_val  y轴的测试集

#X_train_pca x轴的训练数据
#y_train y轴的训练数据
#train_size = 0.8  训练集的比例
#random_state = 0  随机的次数



In [81]:
# 一个参数点（聚类数据为K）的模型，在校验集上评价聚类算法性能
def K_cluster_analysis(K, X_train, y_train, X_val, y_val):
#K  分成多少类的值
#X_train_part  x轴的训练集
#y_train_part  y轴的训练集
#X_val  x轴的测试集
 #y_val  y轴的测试集
    
    #定义开始时间
    start = time.time()
    
    #输出
    print("K-means begin with clusters: {}".format(K));
    
    #K-means,在训练集上训练
    mb_kmeans = MiniBatchKMeans(n_clusters = K)
    #n_clusters = K  定义有多少类
    
    #训练模型
    mb_kmeans.fit(X_train)
    #X_train x轴的训练数据
    
    # 在训练集和测试集上测试
    #y_train_pred = mb_kmeans.fit_predict(X_train)
    y_val_pred = mb_kmeans.predict(X_val)
    #X_val x轴的测试数据
    
    #以前两维特征打印训练数据的分类结果
    #plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred)
    #plt.show()

    # K值的评估标准
    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index
    #这两个分数值越大则聚类效果越好
    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))
    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))#内部评价
    #X_train  x轴的训练数据
    #mb_kmeans.predict(X_train)  模型预测的结果
    
    #也可以在校验集上评估K
    v_score = metrics.v_measure_score(y_val, y_val_pred)#外部评价
    #y_val  x轴的测试集
    #y_val_pred  y轴的测试集
    
    #定义结束时间
    end = time.time()
    #输出
    print("CH_score: {}, time elaps:{}".format(CH_score, int(end-start)))
    print("v_score: {}".format(v_score))
    
    return CH_score,v_score

In [82]:
# 设置超参数（聚类数目K）搜索范围
Ks = [10, 20, 30,40,50,60] #定义要分多少类
CH_scores = []  #内部评价存储列表
v_scores = []   #外部评价存储列表


for K in Ks:
    #调用方法
    ch,v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    #K  分成多少类的值
    #X_train_part  x轴的训练集
    #y_train_part  y轴的训练集
    #X_val  x轴的测试集
    #y_val  y轴的测试集
    
    #添加到内部评价列表中
    CH_scores.append(ch)
    #添加到外部评价列表中
    v_scores.append(v)

K-means begin with clusters: 10
CH_score: 0.5474475130125527, time elaps:5
v_score: 0.3751376546194379
K-means begin with clusters: 20
CH_score: 0.5410452511969207, time elaps:4
v_score: 0.46921223188521866
K-means begin with clusters: 30
CH_score: 0.5422004710444529, time elaps:4
v_score: 0.5203384029528769
K-means begin with clusters: 40
CH_score: 0.5448450154279408, time elaps:4
v_score: 0.5533479665717842
K-means begin with clusters: 50
CH_score: 0.5415997319304634, time elaps:4
v_score: 0.5784238700954667
K-means begin with clusters: 60
CH_score: 0.5629123656281116, time elaps:4
v_score: 0.6017339128734643


从上述可以看得出，  分类越细，ch越高分  再次调优

In [83]:
# 设置超参数（聚类数目K）搜索范围
Ks = [70, 80, 90,100,110,120] #定义要分多少类
CH_scores = []  #内部评价存储列表
v_scores = []   #外部评价存储列表


for K in Ks:
    #调用方法
    ch,v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    #K  分成多少类的值
    #X_train_part  x轴的训练集
    #y_train_part  y轴的训练集
    #X_val  x轴的测试集
    #y_val  y轴的测试集
    
    #添加到内部评价列表中
    CH_scores.append(ch)
    #添加到外部评价列表中
    v_scores.append(v)

K-means begin with clusters: 70
CH_score: 0.5713840690362406, time elaps:4
v_score: 0.6171916675880449
K-means begin with clusters: 80
CH_score: 0.5689858533462515, time elaps:4
v_score: 0.6306109892776396
K-means begin with clusters: 90
CH_score: 0.5716900900865112, time elaps:4
v_score: 0.6457305439829645
K-means begin with clusters: 100
CH_score: 0.5753453933571943, time elaps:4
v_score: 0.6538107410616093
K-means begin with clusters: 110
CH_score: 0.5820187783302463, time elaps:4
v_score: 0.6647927973118071
K-means begin with clusters: 120
CH_score: 0.5664660941862446, time elaps:4
v_score: 0.6719151519931452


最高分为  K-means begin with clusters: 110
CH_score: 0.5820187783302463, time elaps:4

In [84]:
# 设置超参数（聚类数目K）搜索范围
Ks = [120, 130, 140,150,160,170] #定义要分多少类
CH_scores = []  #内部评价存储列表
v_scores = []   #外部评价存储列表


for K in Ks:
    #调用方法
    ch,v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    #K  分成多少类的值
    #X_train_part  x轴的训练集
    #y_train_part  y轴的训练集
    #X_val  x轴的测试集
    #y_val  y轴的测试集
    
    #添加到内部评价列表中
    CH_scores.append(ch)
    #添加到外部评价列表中
    v_scores.append(v)

K-means begin with clusters: 120
CH_score: 0.5763526209585916, time elaps:4
v_score: 0.6706155836526632
K-means begin with clusters: 130
CH_score: 0.591822177236437, time elaps:5
v_score: 0.6810615202790458
K-means begin with clusters: 140
CH_score: 0.5858895379916694, time elaps:5
v_score: 0.6886498726366703
K-means begin with clusters: 150
CH_score: 0.559117536850289, time elaps:5
v_score: 0.6932579125787979
K-means begin with clusters: 160
CH_score: 0.5826479510100114, time elaps:5
v_score: 0.7007066436570148
K-means begin with clusters: 170
CH_score: 0.5946312935996055, time elaps:6
v_score: 0.7052504078948593


最高分  K-means begin with clusters: 170
CH_score: 0.5946312935996055