In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy

In [75]:
def news_vector_dict(file_root, title_scale, doc_scale, min_df):
    """Build a weighted TF-IDF vector for every news item in a JSON file.

    The JSON file is read as a mapping from news key to a sequence whose
    element 0 is the title text and element 1 is the body text. Both texts
    are segmented with jieba and re-joined with spaces; tokens consisting
    only of digits are removed from the body. The TF-IDF vocabulary is
    fitted on the bodies only, and the titles are projected onto that same
    vocabulary.

    Args:
        file_root: Path of the UTF-8 encoded JSON file holding the news data.
        title_scale: Weight applied to the title TF-IDF vector.
        doc_scale: Weight applied to the body TF-IDF vector.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns:
        dict mapping each news key to its combined vector, a list of floats
        with one entry per vocabulary term.
    """
    # Context manager so the handle is released even if JSON parsing fails.
    with codecs.open(file_root, 'r', 'utf-8') as file:
        news_dict = json.load(file)

    # Segment each text and re-join the tokens with spaces.
    title_array = []
    doc_array = []
    for count, news_key in enumerate(news_dict, start=1):
        title_text = news_dict[news_key][0]
        doc_text = news_dict[news_key][1]
        title_array.append(' '.join(jieba.lcut(title_text)))
        # Drop purely numeric tokens from the body.
        doc_tokens = [token for token in jieba.lcut(doc_text) if not token.isdigit()]
        doc_array.append(' '.join(doc_tokens))
        if count % 1000 == 0:
            print(count)

    # TF-IDF: fit on the bodies, then express the titles in the same vocabulary.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    title_matrix = tfidf_vectorizer.transform(title_array)

    # Weighted combination, done while the matrices are still sparse so that
    # only a single dense matrix is ever materialised.
    news_matrix = (title_matrix * title_scale + doc_matrix * doc_scale).toarray().tolist()

    # Rows of news_matrix follow the iteration order of news_dict, so the two
    # can be paired positionally.
    vectors_by_key = {}
    for count, (news_key, vector) in enumerate(zip(news_dict, news_matrix), start=1):
        vectors_by_key[news_key] = vector
        if count % 1000 == 0:
            print('i=' + str(count))
            # Preview the vector that was just stored (row count - 1); indexing
            # row `count` would show the next item and overrun the matrix when
            # the number of items is an exact multiple of 1000.
            print(vector[:10])

    return vectors_by_key

# Input data and weighting configuration for the news vectors.
file_root = './data/_news_data_clean.json'
# Title and body weights are complementary: doc_scale is 1.0 - title_scale.
title_scale = 0.5
doc_scale = 1.0 - title_scale
# Forwarded to TfidfVectorizer(min_df=...) inside news_vector_dict.
min_df = 20
# news key -> combined TF-IDF vector (list of floats).
_news_vector_dict = news_vector_dict(file_root, title_scale, doc_scale, min_df)

1000
2000
3000
i=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=3000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [71]:
def time_back(t):
    """Return the number of whole days between a timestamp and a fixed reference.

    The reference instant is the Unix timestamp 1393603200 and one day is
    counted as 86400 seconds. The second offset is truncated to an integer
    first, and the resulting day count is truncated toward zero.
    """
    seconds_per_day = 86400
    reference_timestamp = 1393603200
    elapsed_seconds = int(t - reference_timestamp)
    elapsed_days = elapsed_seconds / seconds_per_day
    return int(elapsed_days)

def user_vector_dict(news_vector_dict):
    """Compute one profile vector per user as the mean of the news vectors they read.

    Reads ``./data/_user_data_training_clean.json``, which is loaded as a
    mapping from user key to an iterable of news keys. Every read news item
    contributes with equal weight (no time-based weighting is applied).

    Args:
        news_vector_dict: dict mapping news key -> vector (sequence of
            numbers). All vectors are expected to have the same length.

    Returns:
        dict mapping user key -> list of floats holding the element-wise mean
        of the vectors of the news that user read. A user with no news gets
        an all-zero vector with the same length as the news vectors (or
        ``[0.0]`` when ``news_vector_dict`` is empty), so every profile can
        be used as a query against the news vectors.

    Raises:
        KeyError: If a user references a news key missing from
            ``news_vector_dict``.
    """
    # Context manager so the handle is released even if JSON parsing fails.
    with codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8') as file:
        user_dict = json.load(file)

    # Take the vector length from any news vector so that users without
    # history still get a profile of the right dimension.
    sample_vector = next(iter(news_vector_dict.values()), None)
    dimension = len(sample_vector) if sample_vector is not None else 1

    profiles = {}
    for user_count, user_key in enumerate(user_dict, start=1):
        vector_sum = numpy.zeros(dimension, dtype=float)
        read_count = 0
        for user_news_key in user_dict[user_key]:
            vector_sum = vector_sum + numpy.asarray(news_vector_dict[user_news_key], dtype=float)
            read_count += 1
        # Guard against division by zero for users with no read news.
        if read_count != 0:
            vector_sum /= read_count
        profile = vector_sum.tolist()
        profiles[user_key] = profile
        if user_count % 1000 == 0:
            print('j=' + str(user_count))
            print(profile[:10])
    return profiles


# user key -> profile vector, built from the previously computed news vectors.
_user_vector_dict = user_vector_dict(_news_vector_dict)

In [72]:
def k_n_n(news_dict, user_dict, k):
    """Query the k nearest news vectors for every user vector.

    Args:
        news_dict: dict mapping news key -> vector. The vectors are fitted
            into a NearestNeighbors index in dict iteration order.
        user_dict: dict mapping user key -> query vector.
        k: Number of neighbours requested per user.

    Returns:
        A flat list with two consecutive entries per user, in ``user_dict``
        iteration order: the two arrays returned by ``kneighbors`` for a
        single-row query (distances at position ``2*i``, neighbour indices
        at ``2*i + 1``). Indices are positions in ``news_dict`` iteration
        order.
    """
    # Collect the news vectors in iteration order, reporting progress.
    news_vectors = []
    for seen, news_key in enumerate(news_dict, start=1):
        news_vectors.append(news_dict[news_key])
        if seen % 1000 == 0:
            print(seen)

    print("training...")
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(news_vectors)

    # One single-row query per user; extending with the returned pair keeps
    # the result flat (two entries per user).
    nbrs = []
    for processed, user_vector in enumerate(user_dict.values(), start=1):
        nbrs.extend(neigh.kneighbors([user_vector]))
        if processed % 50 == 0:
            print(processed)

    print(nbrs[:10])
    return nbrs

# Number of nearest news items retrieved per user.
k = 30
# Flat list with two arrays per user, as returned by k_n_n.
n = k_n_n(_news_vector_dict, _user_vector_dict, k)

1000
2000
3000
training...
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
[array([[0.45325328, 0.54007946, 0.59499633, 0.60171832, 0.60352323,
        0.61098639, 0.61660176, 0.61660176, 0.61921042, 0.61989905,
        0.62249817, 0.63575839, 0.63614938, 0.63792183, 0.63904911,
        0.64105731, 0.64220264, 0.64352969, 0.64397362, 0.64495954,
        0.64577929, 0.64634141, 0.64835796, 0.64962686, 0.64971029,
        0.65040481, 0.65073285, 0.65129872, 0.65129872, 0.65236851]]), array([[1632, 2881, 2884, 3050, 1466,  406, 2900, 2062,    5, 2496, 2709,
        3198, 2863, 3286,    4,    3,  357, 1167, 1106,  409, 1529, 2492,
        1431, 3028, 2999, 1427, 3292, 1292, 2535, 1373]], dtype=int64), array([[0.17796406, 0.48240841, 0.48691549, 0.49153958, 0.49684406,
        0.49684406, 0.49701526, 0.49705475, 0.49792756, 0.4990649 ,
        0.49958164, 0.50177114, 0.50201981, 0.50226131, 0.5027183 ,
        0.50305854, 0.50413008, 0.50418205, 0.50487396, 0.50503168,
        0.

In [73]:
# Positional lookup table: the neighbour indices stored in `n` refer to the
# iteration order of _news_vector_dict.
news_keys = list(_news_vector_dict)

# Context manager so the handle is closed as soon as the JSON is parsed.
with codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8') as file:
    user_news_dict = json.load(file)

# For every user, keep the neighbouring news keys the user has not read yet.
# `n` is a flat list with two arrays per user; the indices array of the i-th
# user sits at position 2*i + 1.
# NOTE(review): assumes user_news_dict iterates in the same order as the user
# vectors that were passed to k_n_n - confirm both come from the same file.
result = {}
lens = []
for i, user_key in enumerate(user_news_dict):
    # Set for constant-time "already read" checks.
    already_read = set(user_news_dict[user_key])
    indices = n[2*i+1][0].tolist()
    user_news_keys = [news_keys[index] for index in indices
                      if news_keys[index] not in already_read]
    result[user_key] = user_news_keys
    lens.append(len(user_news_keys))
lens.sort()
# Smallest recommendation-list sizes, as a quick sanity check.
print(lens[:10])

[23, 23, 24, 24, 24, 24, 25, 26, 26, 26]


In [74]:
print(1)
print(n[:6])
# Persist the per-user recommendations. The context manager guarantees the
# file is flushed and closed even if serialisation fails part-way.
with codecs.open('./data/tfidf_result.json', 'w', 'utf-8') as file_output:
    json.dump(result, file_output)

1
[array([[0.45325328, 0.54007946, 0.59499633, 0.60171832, 0.60352323,
        0.61098639, 0.61660176, 0.61660176, 0.61921042, 0.61989905,
        0.62249817, 0.63575839, 0.63614938, 0.63792183, 0.63904911,
        0.64105731, 0.64220264, 0.64352969, 0.64397362, 0.64495954,
        0.64577929, 0.64634141, 0.64835796, 0.64962686, 0.64971029,
        0.65040481, 0.65073285, 0.65129872, 0.65129872, 0.65236851]]), array([[1632, 2881, 2884, 3050, 1466,  406, 2900, 2062,    5, 2496, 2709,
        3198, 2863, 3286,    4,    3,  357, 1167, 1106,  409, 1529, 2492,
        1431, 3028, 2999, 1427, 3292, 1292, 2535, 1373]], dtype=int64), array([[0.17796406, 0.48240841, 0.48691549, 0.49153958, 0.49684406,
        0.49684406, 0.49701526, 0.49705475, 0.49792756, 0.4990649 ,
        0.49958164, 0.50177114, 0.50201981, 0.50226131, 0.5027183 ,
        0.50305854, 0.50413008, 0.50418205, 0.50487396, 0.50503168,
        0.50565818, 0.50763239, 0.50787672, 0.50860142, 0.50860142,
        0.5086558 , 0.5087

In [None]:
# k = 10
# n = k_n_n(news_vector_dict, user_vector_dict, k)