In [1]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras

In [2]:
from collections import Counter

# 基础数据加工

In [3]:
dt = pd.read_csv("../data/user2up.csv", encoding="utf_8_sig")
dt.rename(columns={"from": "user", "mid": "item"}, inplace=True)
dt2 = dt[["user", "item"]]
dt2 = dt2.astype(int)
dt2 = dt2[:200000]
# dt2 = dt2.sample(n=1000, random_state=123)

matrix_user2up = dt2.groupby("user").agg(list)

# items = {key: value for value, key in enumerate(set(dt2["item"]))}
items = set(dt2["item"])

In [4]:
users = set(dt2.user)
items = set(dt2.item)

In [5]:
def onehot_spec(raw):
    raw = set(raw)
    return [1 if item in raw else 0 for item in items]


matrix_user2up["vec"] = matrix_user2up["item"].apply(onehot_spec)

dt3 = matrix_user2up["vec"].sample(frac=1, random_state=123).values
dt3 = np.array([np.array(i) for i in dt3])

split_n = int(len(dt3) * 0.7)
X_train = dt3[:split_n]
X_test = dt3[split_n:]

In [6]:
dims = len(X_train[0])

# sklearn的KMeans(仅支持欧氏距离)

In [7]:
from sklearn.cluster import KMeans

In [8]:
X_train.shape

(1717, 69859)

In [9]:
model = KMeans(n_clusters=4)

In [10]:
model.fit(dt3)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [11]:
ys_sklearn_eu = model.predict(dt3)

In [12]:
ct_sklearn_ud = Counter(ys_sklearn_eu)

In [13]:
ct_sklearn_ud

Counter({1: 2444, 2: 7, 0: 1, 3: 1})

#### 欧氏距离对于这种高维稀疏的向量聚类效果并不好 

# NLTK的KMeans(支持欧式距离/余弦距离/自定义距离)

In [14]:
import nltk

In [15]:
from nltk.cluster.kmeans import KMeansClusterer

In [32]:
from nltk.cluster.util import cosine_distance,euclidean_distance

In [17]:
model = KMeansClusterer(num_means=20,distance=cosine_distance)
model.cluster(dt3[:])

In [42]:
def loss(array,model=model,distance=cosine_distance):
    return distance(model.classify(array),array)

In [44]:
list(map(loss,dt3))

  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


[array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([nan, nan, nan, ..., nan, nan, nan]),
 array([1.        , 1.        , 0.90715233, ..., 1.        , 1.        ,
        1.        ]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1. , 1. , 0.9, ..., 1. , 1. , 1. ]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([nan, nan, nan, ..., nan, nan, nan]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([nan, nan, nan, ..., nan, nan, nan]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ...

In [53]:
i=8
cosine_distance(model.means()[model.classify(dt3[i])],dt3[i])

0.8163196099408504

In [57]:
cosine_distance([1,2,3,4,5],[1,2,3,4,1])

0.15237091053115515

In [18]:
ys_NLTK_cos = [model.classify(i) for i in dt3]

In [19]:
ct_NLTK_cos = Counter(ys_NLTK_cos)

In [20]:
ct_NLTK_cos

Counter({8: 86,
         17: 518,
         11: 154,
         0: 174,
         6: 361,
         5: 36,
         2: 328,
         1: 49,
         9: 50,
         10: 50,
         16: 231,
         19: 40,
         7: 81,
         13: 37,
         4: 63,
         18: 105,
         14: 46,
         12: 34,
         3: 9,
         15: 1})

In [21]:
matrix_user2up["vec_class"] = ys_NLTK_cos

In [22]:
matrix_user2up

Unnamed: 0_level_0,item,vec,vec_class
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[335115, 2, 12, 9099524, 867152]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",8
2,"[1868902080, 623512391, 43855, 520811440, 3941...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",17
4,"[9617619, 7584632, 2026561407, 22721144, 48163...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",11
6,"[454560170, 488034462, 102774573, 346563107, 1...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0
8,"[319274146, 1724598, 389552703, 489412051, 142...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6
...,...,...,...
1825049883,"[473222648, 37090048, 14112152, 87300577, 3991...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6
1854407543,[12076317],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16
1893933522,[40857648],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11
1999942692,"[174501086, 320840078, 473222648, 37090048]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",17
