In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy

In [2]:
def news_vector_dict(title_scale, doc_scale):
    file = codecs.open('./data/_news_data.json', 'r', 'utf-8')
    news_dict = json.load(file)
    
    # 分词，在词之间加空格，重新组成文章
    i = 0
    title_array = []
    doc_array = []
    for news_key in news_dict:
        title_text = news_dict[news_key][0]
        doc_text = news_dict[news_key][1]
        title = ' '.join(jieba.lcut(title_text))
        doc = ' '.join(jieba.lcut(doc_text))
        title_array.append(title)
        doc_array.append(doc)
        i += 1
        if i % 1000 == 0:
            print(i)
    
    # tf-idf算法，文章转化为一个归一化的向量
    tfidf_vectorizer = TfidfVectorizer(min_df = 10)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    title_matrix = tfidf_vectorizer.transform(title_array)
    
    # 计算文章加权vector
    news_matrix = (title_matrix.todense() * title_scale + doc_matrix.todense() * doc_scale).tolist()
    
    # 构建news_key : vector字典
    i = 0
    news_vector_dict = {}
    for news_key in news_dict:
#         news_vector = title_matrix[i].todense()
        news_vector_dict.setdefault(news_key, news_matrix[i])
        i += 1
        if i % 1000 == 0:
            print('i='+str(i))
            print(news_matrix[i][:10])
    # file_output = codecs.open('./data/_news_data_tfidf.json', 'w', 'utf-8')
    # json.dump(news_vector_dict, file_output)
    # print(tfidf_vectorizer.vocabulary_) 
    
    return news_vector_dict

In [3]:
def user_vector_dict(news_vector_dict, time_scalse):
    file = codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8')
    user_dict = json.load(file)
    
    j = 0
    user_vector_dict = {}
    # 每一个用户
    for user_key in user_dict:
        # 该用户读过的所有新闻的向量和为用户向量
        i = 0
        vector_sum = numpy.matrix('0.0')
        for user_news_key in user_dict[user_key]:
            vector = numpy.matrix(news_vector_dict[user_news_key])
            vector_sum = vector * time_scale + vector_sum
            i += time_scale
        if i != 0:
            vector_sum /= i
        user_vector_dict.setdefault(user_key, vector_sum.tolist()[0])
        j += 1
        if j % 1000 == 0:
            print('j='+str(j))
            print(vector_sum.tolist()[0][:10])
    return user_vector_dict

In [4]:
def k_n_n(news_dict, user_dict, k):
    news_keys = []
    news = []
    i = 0
    for news_key in news_dict:
        news_keys.append(news_key)
        news.append(news_dict[news_key])
        i += 1
        if i % 1000 == 0:
            print(i)
    
    print("training...")
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(news)
    
    user_keys = []
    nbrs = []
    i = 0
    for user_key in user_dict:
        users = []
        user_keys.append(user_key)
        users.append(user_dict[user_key])
        nbrs += neigh.kneighbors(users)
        i += 1
        if i % 50 == 0:
            print(i)
    
#     n = neigh.kneighbors(users)
    print(nbrs[:10])
    return nbrs

In [5]:
title_scale = 0.5
doc_scale = 1.0 - title_scale
time_scale = 1.0
k = 30
news_vector_dict = news_vector_dict(title_scale, doc_scale)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\49325\AppData\Local\Temp\jieba.cache
Loading model cost 0.971 seconds.
Prefix dict has been built succesfully.


1000
2000
3000
4000
5000
6000
i=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=3000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=4000
[0.10698909037437322, 0.029850505179614607, 0.0, 0.0028634010684771313, 0.001411267289966212, 0.0028681237181149596, 0.002877685310018286, 0.0, 0.0014270344823457477, 0.00146138260933671]
i=5000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=6000
[0.027122530457869668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [6]:
user_vector_dict = user_vector_dict(news_vector_dict, time_scale)

j=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
j=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [7]:
n = k_n_n(news_vector_dict, user_vector_dict, k)

1000
2000
3000
4000
5000
6000
training...
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
[array([[ 0.        ,  0.        ,  0.25776863,  0.85404403,  0.85404403,
         0.86739417,  0.90842569,  0.92664622,  0.93689037,  0.9447062 ,
         0.95010816,  0.95968765,  0.9630005 ,  0.96382443,  0.96448864,
         0.96483354,  0.96535871,  0.96535871,  0.96555509,  0.96706188,
         0.96746714,  0.96839948,  0.96915562,  0.97034038,  0.97233069,
         0.97252724,  0.9730162 ,  0.97368932,  0.97379358,  0.97426563]]), array([[  19, 3720,  219,  214, 2500, 3065, 4210, 5394, 5279, 5278, 3202,
        2241, 2641, 1262, 5403, 4904, 3378, 3549, 6144, 1014,  747, 1039,
         470, 4264, 3932, 4903, 3387,   54, 3566, 4209]], dtype=int64), array([[ 0.60023111,  0.61995155,  0.62072018,  0.62268472,  0.63030721

In [8]:
print(1)
print(n[2][0])

1
[ 0.60023111  0.61995155  0.62072018  0.62268472  0.63030721  0.64357507
  0.64367208  0.64374214  0.64435971  0.64435971  0.64506792  0.6459175
  0.64697805  0.64814171  0.64816593  0.64862018  0.64917753  0.6492669
  0.64952441  0.64977023  0.64977023  0.65086852  0.65144599  0.65154313
  0.65157597  0.65163065  0.65179022  0.65181153  0.65253021  0.65267913]


In [9]:
news_keys = []
i = 0
for news_key in news_vector_dict:
    news_keys.append(news_key)

result = {}
i = 0
for user_key in user_vector_dict:
    indices = n[2*i+1][0].tolist()
    user_news_keys = []
    for index in indices:
        user_news_keys.append(news_keys[index])
    result.setdefault(user_key, user_news_keys)
    if i < 10:
        print(result[user_key])
    i += 1

['100651212', '100651366', '100655938', '100651343', '100652595', '100651670', '100617625', '100624790', '100616121', '100617654', '100645709', '100651322', '100644633', '100624894', '100449335', '100455533', '100657748', '100659230', '100153736', '100630815', '100080339', '100642556', '100649192', '100625945', '100631958', '100440650', '100658165', '100647597', '100588389', '100616071']
['100637151', '100651469', '100646087', '100584580', '100616257', '100617625', '100641885', '100361319', '100659230', '100657748', '100625962', '100656839', '100496825', '100080339', '100513375', '100653804', '100623512', '100478130', '100366115', '100642758', '100642745', '100619425', '100658491', '100649198', '100654339', '100488567', '100128451', '100419825', '100644614', '100227074']
['100118850', '100642556', '100603397', '100647967', '100649190', '100653804', '100631958', '100455533', '100658491', '100640712', '100414674', '100259335', '100616121', '100654339', '100644633', '100496825', '10061762

In [13]:
file_output = codecs.open('./data/tfidf_result.json', 'w', 'utf-8')
json.dump(result, file_output)
file_output.close()