In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy

In [2]:
def news_vector_dict(title_scale, doc_scale):
    file = codecs.open('_news_data.json', 'r', 'utf-8')
    news_dict = json.load(file)
    
    # 分词，在词之间加空格，重新组成文章
    i = 0
    title_array = []
    doc_array = []
    for news_key in news_dict:
        title_text = news_dict[news_key][0]
        doc_text = news_dict[news_key][1]
        title = ' '.join(jieba.lcut(title_text))
        doc = ' '.join(jieba.lcut(doc_text))
        title_array.append(title)
        doc_array.append(doc)
        i += 1
        if i % 1000 == 0:
            print(i)
    
    # tf-idf算法，文章转化为一个归一化的向量
    tfidf_vectorizer = TfidfVectorizer(min_df = 10)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    title_matrix = tfidf_vectorizer.transform(title_array)
    
    # 计算文章加权vector
    news_matrix = (title_matrix.todense() * title_scale + doc_matrix.todense() * doc_scale).tolist()
    
    # 构建news_key : vector字典
    i = 0
    news_vector_dict = {}
    for news_key in news_dict:
#         news_vector = title_matrix[i].todense()
        news_vector_dict.setdefault(news_key, news_matrix[i])
        i += 1
        if i % 1000 == 0:
            print('i='+str(i))
            print(news_matrix[i][:10])
    # file_output = codecs.open('_news_data_tfidf.json', 'w', 'utf-8')
    # json.dump(news_vector_dict, file_output)
    # print(tfidf_vectorizer.vocabulary_) 
    
    return news_vector_dict

In [3]:
def user_vector_dict(news_vector_dict, time_scalse):
    file = codecs.open('_user_data_training.json', 'r', 'utf-8')
    user_dict = json.load(file)
    
    j = 0
    user_vector_dict = {}
    # 每一个用户
    for user_key in user_dict:
        # 该用户读过的所有新闻的向量和为用户向量
        i = 0
        vector_sum = numpy.matrix('0.0')
        for user_news_key in user_dict[user_key]:
            vector = numpy.matrix(news_vector_dict[user_news_key])
            vector_sum = vector * time_scale + vector_sum
            i += time_scale
        if i != 0:
            vector_sum /= i
        user_vector_dict.setdefault(user_key, vector_sum.tolist()[0])
        j += 1
        if j % 1000 == 0:
            print('j='+str(j))
            print(vector_sum.tolist()[0][:10])
    return user_vector_dict

In [4]:
def k_n_n(news_dict, user_dict, k):
    news_keys = []
    news = []
    i = 0
    for news_key in news_dict:
        news_keys.append(news_key)
        news.append(news_dict[news_key])
        i += 1
        if i % 1000 == 0:
            print(i)
    
    print("training...")
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(news)
    
    user_keys = []
    nbrs = []
    i = 0
    for user_key in user_dict:
        users = []
        user_keys.append(user_key)
        users.append(user_dict[user_key])
        nbrs += neigh.kneighbors(users)
        i += 1
        if i % 50 == 0:
            print(i)
    
#     n = neigh.kneighbors(users)
    print(nbrs[:10])
    return nbrs

In [5]:
title_scale = 0.5
doc_scale = 1.0 - title_scale
time_scale = 1.0
k = 10
news_vector_dict = news_vector_dict(title_scale, doc_scale)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\49325\AppData\Local\Temp\jieba.cache
Loading model cost 2.297 seconds.
Prefix dict has been built succesfully.


1000
2000
3000
4000
5000
6000
i=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=3000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=4000
[0.10698909037437322, 0.029850505179614607, 0.0, 0.0028634010684771313, 0.001411267289966212, 0.0028681237181149596, 0.002877685310018286, 0.0, 0.0014270344823457477, 0.00146138260933671]
i=5000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=6000
[0.027122530457869668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
user_vector_dict = user_vector_dict(news_vector_dict, time_scale)

j=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=3000
[0.0, 0.0, 0.02398540266471601, 0.0007732613785927483, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=4000
[0.0, 0.0, 0.0, 0.002411076522521961, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=5000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=6000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=7000
[0.0, 0.0, 0.02098622890012179, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=8000
[0.0, 0.0, 0.006535560877925954, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=9000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
n = k_n_n(news_vector_dict, user_vector_dict, k)

1000
2000
3000
4000
5000
6000
training...
1000
2000


In [None]:
print(1)