In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy
import math

In [36]:
def news_vector_dict(file_root, title_scale, doc_scale, min_df):
    """Build one weighted TF-IDF vector per news item stored in a JSON file.

    The JSON document is read as a mapping ``news_key -> record`` where
    ``record[0]`` is used as the title text and ``record[1]`` as the body
    text. Both are segmented with jieba and re-joined with spaces so that
    ``TfidfVectorizer`` can tokenise them; tokens made only of digits are
    dropped from the body. The vectoriser is fitted on the bodies and then
    applied to the titles, so both share a single vocabulary.

    Args:
        file_root: Path of the UTF-8 JSON file holding the news records.
        title_scale: Weight applied to the title TF-IDF row.
        doc_scale: Weight applied to the body TF-IDF row.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns:
        dict mapping every news key to its combined vector (a list of
        floats), in the iteration order of the loaded JSON object.
    """
    # The context manager closes the handle even if JSON parsing fails.
    with codecs.open(file_root, 'r', 'utf-8') as file:
        news_dict = json.load(file)

    # Segment the text and put spaces between words to rebuild each article.
    title_array = []
    doc_array = []
    for count, news_key in enumerate(news_dict, start=1):
        title_text = news_dict[news_key][0]
        doc_text = news_dict[news_key][1]
        title_array.append(' '.join(jieba.lcut(title_text)))
        doc_array.append(
            ' '.join(token for token in jieba.lcut(doc_text) if not token.isdigit())
        )
        if count % 1000 == 0:
            print(count)

    # TF-IDF: vocabulary and IDF come from the bodies; titles reuse them.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    title_matrix = tfidf_vectorizer.transform(title_array)

    # Weighted combination is done while still sparse, so only one dense
    # copy of the matrix is ever materialised.
    news_matrix = (title_matrix * title_scale + doc_matrix * doc_scale).toarray().tolist()

    # Build the news_key -> vector dictionary. Rows of news_matrix follow the
    # same order in which news_dict was iterated above.
    vectors_by_key = {}
    for count, (news_key, vector) in enumerate(zip(news_dict, news_matrix), start=1):
        vectors_by_key[news_key] = vector
        if count % 1000 == 0:
            print('i=' + str(count))
            # Show the row that was just stored; indexing news_matrix[count]
            # would be the *next* row and overruns on the last item.
            print(vector[:10])

    return vectors_by_key

# Configuration for the news-vector build.
# file_root: JSON file with the cleaned news records.
file_root = './data/_news_data_clean.json'
# Title and body weights; doc_scale is the complement of title_scale,
# so the two always sum to 1.0.
title_scale = 0.5
doc_scale = 1.0 - title_scale
# Passed straight through to TfidfVectorizer(min_df=...).
min_df = 10
# news_key -> combined TF-IDF vector (list of floats).
_news_vector_dict = news_vector_dict(file_root, title_scale, doc_scale, min_df)

1000
2000
3000
i=1000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=2000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
i=3000
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [37]:
def time_back(t):
    """Return the whole number of days between a fixed reference and ``t``.

    ``t`` is a timestamp in seconds. The offset from the reference instant
    1393603200 is truncated to an integer first, then converted to days
    (86400 seconds each) and truncated toward zero again.
    """
    reference_seconds = 1393603200
    seconds_per_day = 86400
    elapsed_seconds = int(t - reference_seconds)
    return int(elapsed_seconds / seconds_per_day)

def user_vector_dict(news_vector_dict, user_file='./data/_user_data_training_clean.json'):
    """Build a profile vector for every user as the mean of the news they read.

    The user file is read as a JSON object mapping ``user_key`` to an
    iterable of news keys. Each user's vector is the unweighted arithmetic
    mean of the corresponding entries of ``news_vector_dict``.

    An earlier draft loaded the news file to compute a time-based weight that
    was never applied to the sum; that dead computation and its file read
    have been removed, so every read item contributes equally.

    Args:
        news_vector_dict: dict mapping news key -> vector (list of floats).
        user_file: Path of the UTF-8 JSON file with the users' reading
            history. Defaults to the training set used by this notebook.

    Returns:
        dict mapping user key -> profile vector (list of floats). A user
        with an empty history gets an all-zero vector of the same length as
        the news vectors, so it can still be used as a k-NN query.

    Raises:
        KeyError: if a user's history references a news key that is missing
            from ``news_vector_dict``.
    """
    # The context manager closes the handle even if JSON parsing fails.
    with codecs.open(user_file, 'r', 'utf-8') as file:
        user_dict = json.load(file)

    # Length of the news vectors; falls back to 1 when there are no news
    # vectors at all, which reproduces the previous [0.0] placeholder.
    dimension = len(next(iter(news_vector_dict.values()), [0.0]))

    user_vectors = {}
    for count, user_key in enumerate(user_dict, start=1):
        # The user vector is the mean of all news vectors this user has read.
        vector_sum = numpy.zeros(dimension)
        read_count = 0
        for user_news_key in user_dict[user_key]:
            vector_sum = vector_sum + numpy.asarray(
                news_vector_dict[user_news_key], dtype=float
            )
            read_count += 1
        if read_count:
            vector_sum = vector_sum / read_count

        profile = vector_sum.tolist()
        user_vectors[user_key] = profile
        if count % 1000 == 0:
            print('j=' + str(count))
            print(profile[:10])

    print("user_vector construction done.")
    return user_vectors


# user_key -> profile vector, derived from the news vectors built above.
_user_vector_dict = user_vector_dict(_news_vector_dict)

user_vector construction done.


In [38]:
def k_n_n(news_dict, user_dict, k):
    """Query the k nearest news vectors for every user vector.

    A ``NearestNeighbors`` model is fitted on the values of ``news_dict``
    (in iteration order), then queried once per user with that user's
    vector.

    Args:
        news_dict: dict mapping news key -> vector.
        user_dict: dict mapping user key -> vector.
        k: Number of neighbours requested from the model.

    Returns:
        A flat list with two consecutive entries per user, in the iteration
        order of ``user_dict``: the two values returned by ``kneighbors`` for
        that user (per scikit-learn's API, distances first, then the row
        indices into the fitted news list).
    """
    news_vectors = []
    for count, news_key in enumerate(news_dict, start=1):
        news_vectors.append(news_dict[news_key])
        if count % 1000 == 0:
            print(count)

    print("training...")
    model = NearestNeighbors(n_neighbors=k)
    model.fit(news_vectors)

    neighbours = []
    for count, user_key in enumerate(user_dict, start=1):
        # One single-row query per user; the returned pair is flattened
        # into the output list instead of being appended as a tuple.
        query = [user_dict[user_key]]
        neighbours.extend(model.kneighbors(query))
        if count % 50 == 0:
            print(count)

    print("knn done.")
    return neighbours

# Number of nearest news items requested per user.
k = 200
# Flat list: for each user, the two values returned by kneighbors are stored
# back to back, so user number i occupies n[2*i] and n[2*i+1].
n = k_n_n(_news_vector_dict, _user_vector_dict, k)

1000
2000
3000
training...
50
100
150
200
250
300
[array([[0.4843463 , 0.49923242, 0.49923242, 0.49937167, 0.50048612,
        0.50325024, 0.50398558, 0.50601532, 0.50862446, 0.5090344 ,
        0.50955878, 0.50956369, 0.50962888, 0.50972146, 0.5097946 ,
        0.51026635, 0.51081167, 0.51097381, 0.51146591, 0.511478  ,
        0.5115524 , 0.51222605, 0.51230726, 0.51241463, 0.51270848,
        0.51298387, 0.51308979, 0.51349313, 0.5135051 , 0.51394972,
        0.51402491, 0.51500726, 0.51512247, 0.51512601, 0.51515266,
        0.51540886, 0.51563137, 0.51607612, 0.51609549, 0.51654286,
        0.51680382, 0.51695255, 0.51698317, 0.51701548, 0.51703361,
        0.51713328, 0.51717778, 0.51726898, 0.5174104 , 0.51752082,
        0.51801161, 0.51801161, 0.51809833, 0.51813724, 0.51818123,
        0.51836942, 0.51836942, 0.51855081, 0.51881235, 0.51884288,
        0.51885917, 0.51890549, 0.51912082, 0.51912441, 0.51927158,
        0.5193249 , 0.51951427, 0.51953249, 0.5196044 , 0.5198097

In [47]:
def time_scale(t):
    """Return a logarithmic weight that grows with the timestamp ``t``.

    The number of whole days between the reference instant 1393603200 and
    ``t`` (seconds) is clamped to a minimum of 1, and the result is
    ``ln(days) + 1``; anything less than two days after the reference
    therefore yields exactly 1.0.
    """
    elapsed_days = int((t - 1393603200) / 86400)
    return math.log(max(elapsed_days, 1)) + 1

# Turn the raw k-NN output into per-user recommendation lists.

# Maximum number of recommendations kept per user.
TOP_N = 30

# Row index -> news key. This must follow the same iteration order that was
# used to fit the k-NN model on _news_vector_dict.
news_keys = list(_news_vector_dict)

# Context managers close both handles even if JSON parsing fails.
with codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8') as file:
    user_news_dict = json.load(file)
with codecs.open('./data/_news_data_clean.json', 'r', 'utf-8') as f_news_data:
    news_data = json.load(f_news_data)

result = {}
lens = []
# NOTE(review): this relies on `n` having been built by iterating the users
# in the same order as this JSON file — confirm if either side changes.
for i, user_key in enumerate(user_news_dict):
    # `n` is flat: entry 2*i holds the distances, entry 2*i+1 the row indices.
    dist = n[2 * i][0].tolist()
    indices = n[2 * i + 1][0].tolist()

    # Set lookup instead of scanning the history once per candidate.
    already_read = set(user_news_dict[user_key])

    pair = []
    for distance, mth in zip(dist, indices):
        news_id = news_keys[mth]
        if news_id in already_read:
            continue
        time_ratio = time_scale(news_data[news_id][2])
        pair.append([distance * time_ratio, news_id])

    # NOTE(review): the score is distance * recency and is sorted descending,
    # so among the retrieved neighbours the *more distant* items rank higher.
    # Kept as-is to preserve existing results; confirm this is intended.
    pair.sort(key=lambda x: x[0], reverse=True)

    # Slicing tolerates users with fewer than TOP_N unread candidates, where
    # fixed-range indexing would raise IndexError; it also avoids reusing the
    # global name `k` as a loop variable.
    result[user_key] = [entry[1] for entry in pair[:TOP_N]]

    # Debug dump of score / recency weight for the first few users.
    if i < 10:
        for p in pair:
            print(p[0], time_scale(news_data[p[1]][2]))
        print("\n")

    lens.append(len(result[user_key]))

# Smallest recommendation-list sizes, to spot users with too few candidates.
lens.sort()
print(lens[:10])

3.152326000724074 4.401197381662156
3.1357500274781205 4.401197381662156
3.1039926646719724 4.367295829986475
3.0964431473873977 4.401197381662156
3.0685124363586556 4.295836866004329
3.0511085363998354 4.295836866004329
3.0413107300922637 4.295836866004329
3.0216418767973083 4.218875824868201
3.01451901294529 4.218875824868201
3.0127075422084713 4.295836866004329
2.9888558365630793 4.178053830347945
2.9611817628146513 4.13549421592915
2.961071726760908 4.13549421592915
2.959469895777927 4.178053830347945
2.8959749698061237 4.044522437723423
2.859440480151725 3.995732273553991
2.826983656730906 3.9444389791664403
2.8208490801882724 3.995732273553991
2.8123320214675567 3.995732273553991
2.7390003074438423 3.833213344056216
2.687364377372285 3.833213344056216
2.656934049695261 3.70805020110221
2.595009200067302 3.6390573296152584
2.5517692384722026 3.5649493574615367
2.533690618897249 3.5649493574615367
2.521522612912319 3.5649493574615367
2.4754504223245517 3.5649493574615367
2.40464275

0.7353378503239133 1.0
0.7352804610093279 1.0
0.7352530816040146 1.0
0.7352071955908257 1.0
0.735143781068262 1.0
0.7335664613837249 1.0
0.7330065336702407 1.0
0.732809049617488 1.0
0.7327459377473147 1.0
0.732474704066134 1.0
0.7323932078804393 1.0
0.7323379301594817 1.0
0.7322729909678498 1.0
0.7318141501057188 1.0
0.7316279413445556 1.0
0.7315585566846748 1.0
0.7315426093154077 1.0
0.7314243358450442 1.0
0.7314110513215308 1.0
0.7313851565078238 1.0
0.7311998354649007 1.0
0.7308887751888851 1.0
0.7306896188892374 1.0
0.729576685520538 1.0
0.7292695014709589 1.0
0.7279478901720202 1.0
0.7276746314887179 1.0
0.727601156710434 1.0
0.7263164565606794 1.0
0.7260999042289414 1.0
0.725604680071492 1.0
0.7251195281313624 1.0
0.7244565687449097 1.0
0.7241747667806097 1.0
0.7238107520710694 1.0
0.7237863040906319 1.0
0.7220185287058091 1.0
0.720173838935701 1.0
0.7109288561939143 1.0
0.7073504405862306 1.0
0.7070959727007928 1.0
0.7068267494570718 1.0
0.7020642077215653 1.0
0.7016688084923997

0.790049653686265 1.0
0.7882774092595783 1.0
0.7853239712459271 1.0
0.778702653747833 1.0
0.775590290005536 1.0
0.7746607915149603 1.0
0.7722285731528947 1.0
0.7712925261876536 1.0
0.7583254568844501 1.0
0.7530619213123757 1.0
0.7346671124373145 1.0
0.64369160420265 1.0
0.6436916042026499 1.0
0.6436916042026499 1.0
0.6436916042026498 1.0
0.6436916042026498 1.0
0.6436916042026498 1.0
0.6436916042026498 1.0
0.6436916042026497 1.0
0.6436916042026497 1.0
0.6435021874360414 1.0
0.6434092029290486 1.0
0.6433567256651966 1.0
0.6432894485433104 1.0
0.6431839033730096 1.0
0.643172290510117 1.0
0.643129250433879 1.0
0.6429749088398049 1.0
0.6428956571975709 1.0
0.6428727614760912 1.0
0.6427787253883787 1.0
0.6426275421193149 1.0
0.642596061341223 1.0
0.6424732216917185 1.0
0.6424155447510256 1.0
0.6423868834516742 1.0
0.6423215799665707 1.0
0.6422676601608662 1.0
0.6421995298414401 1.0
0.6420506359244136 1.0
0.6418071258778585 1.0
0.6417776413207551 1.0
0.6413624183755798 1.0
0.6413619293080424 

2.273212493462013 3.995732273553991
2.257344628659321 3.9444389791664403
2.2529092413237835 3.995732273553991
2.2164731759020215 3.1972245773362196
2.2164731759020215 3.1972245773362196
2.1506307979501575 3.772588722239781
2.0984259023606286 2.791759469228055
2.0959321872643346 3.772588722239781
2.0514783031637753 2.791759469228055
2.0064157153360567 3.5649493574615367
1.9651684526712676 2.6094379124341005
1.8999610704934267 3.3978952727983707
1.8324587511989843 3.1972245773362196
1.826512615844671 3.1972245773362196
1.7916829199712259 2.386294361119891
1.7625214499212791 2.386294361119891
1.5780623958194984 2.09861228866811
1.5768338100802266 2.09861228866811
1.5740109296241498 2.09861228866811
1.5516025605687174 2.09861228866811
1.5402691034263958 2.09861228866811
1.5047662100484849 2.6094379124341005
1.2144897112908357 2.09861228866811
1.1899604754568127 2.09861228866811
0.9666116854028738 1.6931471805599454
0.8857570904492794 1.6931471805599454
0.754416836872149 1.0
0.7543952330143

In [48]:
# Persist the per-user recommendation lists as JSON.
# The context manager closes the handle even if serialisation raises.
with codecs.open('./data/tfidf_result.json', 'w', 'utf-8') as file_output:
    json.dump(result, file_output)

In [None]:
# k = 10
# n = k_n_n(news_vector_dict, user_vector_dict, k)