In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy
import math

In [169]:
def news_vector_dict(file_root, title_scale, doc_scale, min_df, max_df):
    """Build a ``{news_key: tf-idf vector}`` dictionary from a news JSON file.

    Every record's title and body are segmented with jieba, numeric tokens
    are dropped, and the title is repeated in front of the body so that
    title words weigh more in the resulting tf-idf vector.

    Args:
        file_root: Path of a UTF-8 JSON file mapping ``news_key`` to a
            sequence whose item 0 is the title text and item 1 the body text.
        title_scale: Relative weight of the title.
        doc_scale: Relative weight of the body. The title is repeated
            ``max(int(title_scale / doc_scale), 1)`` times.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.

    Returns:
        dict mapping each ``news_key`` (in file order) to its dense tf-idf
        row as a list of floats.

    Raises:
        ZeroDivisionError: if ``doc_scale`` is zero.
    """
    def _segment(text):
        # Tokenise, then drop tokens whose part before the first '.' is all
        # digits (e.g. "2014", "3.5"); the remaining tokens are re-joined
        # with spaces so TfidfVectorizer can split them again.
        return ' '.join(w for w in jieba.lcut(text) if not w.split('.')[0].isdigit())

    with codecs.open(file_root, 'r', 'utf-8') as news_file:
        news_dict = json.load(news_file)

    # Stop-word list, one word per line.
    with codecs.open('./data/stop_words.txt', 'r', 'utf-8') as stop_file:
        stop_list = stop_file.read().split('\n')

    scale = max(int(title_scale / doc_scale), 1)

    news_keys = list(news_dict)
    doc_array = []
    for count, news_key in enumerate(news_keys, start=1):
        record = news_dict[news_key]
        title = _segment(record[0])
        body = _segment(record[1])
        # Join the repeated titles with spaces: plain `title * scale` would
        # glue the last token of one copy to the first token of the next.
        doc_array.append(' '.join([title] * scale + [body]))
        if count % 1000 == 0:
            print(count)

    # tf-idf: turn every document into one row of term weights.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, stop_words=stop_list)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    # NOTE: densifying the whole matrix costs n_docs * n_terms floats.
    news_matrix = doc_matrix.todense().tolist()

    # news_key -> vector; rows come back in the order the documents went in.
    vectors = {}
    for i, (news_key, row) in enumerate(zip(news_keys, news_matrix)):
        vectors.setdefault(news_key, row)
        if i % 1000 == 0:
            print('i=' + str(i))

    return vectors

# Input: cleaned news JSON consumed by news_vector_dict().
file_root = './data/_news_data_clean.json'
# NOTE: inside news_vector_dict the title is repeated
# scale = max(int(title_scale / doc_scale), 1) times; with 0.5 / 0.5 that is 1.
title_scale = 0.5
doc_scale = 1.0 - title_scale
# Document-frequency bounds forwarded unchanged to TfidfVectorizer.
min_df = 5
max_df = 30
# news_key -> dense tf-idf vector (list of floats).
_news_vector_dict = news_vector_dict(file_root, title_scale, doc_scale, min_df, max_df)

1000
2000
3000
i=0
中院:0.42324605220582406
出庭:0.2195163507837788
刘汉:0.3967040017082986
刘维:0.21666843574036906
包庇:0.2195163507837788
受审:0.4452717775193649
团伙:0.19994892124911232
开庭审理:0.19994892124911232
杀人:0.20339741049160634
纵容:0.20162777003609061
黑社会:0.39989784249822463
-------------------------
一纸:0.2670057590029303
两者之间:0.2702737509721344
含金量:0.2868281224520995
四类:0.28201926445839615
婴儿:0.2315245916667262
学费:0.2777175943993906
年长:0.29857363593836117
年龄段:0.28201926445839615
推移:0.2670057590029303
昂贵:0.24030718138045498
沉默:0.25151191737545664
谬误:0.29857363593836117
近来:0.2315245916667262
高低:0.24371149435676662
-------------------------
一向:0.06749037249740059
三人:0.06841434638874348
三名:0.07383983854014213
上司:0.168834101705305
中级法院:0.06841434638874348
中院:0.07463625852386783
五矿:0.07239501639944691
何华章:0.33766820341061
候补委员:0.0730948373994984
党风廉政:0.06583848630818762
公开审理:0.08263761075582235
内设:0.06890563377383516
减不增:0.07641569862069798
减退:0.07852032955089705
出版物:0.15283139724139597
分在:0.078

In [175]:
def time_back(t):
    """Express timestamp ``t`` as a count of 20-day periods since 1393603200.

    The offset from the reference timestamp (1393603200, i.e.
    2014-02-28 16:00:00 UTC) is truncated to whole seconds first, then
    converted to days and finally to 20-day periods. Returns a float.
    """
    reference_ts = 1393603200
    seconds_per_day = 86400
    period_days = 20
    whole_seconds = int(t - reference_ts)
    return float(whole_seconds / seconds_per_day / period_days)

def user_vector_dict(news_vector_dict):
    """Represent every training user by the mean vector of the news they read.

    Reads ``./data/_user_data_training_clean.json`` (``user_key`` -> collection
    of ``news_key``) and, for each user, averages the matching rows of
    ``news_vector_dict`` element-wise.

    Args:
        news_vector_dict: mapping ``news_key`` -> list of floats; all vectors
            are expected to have the same length.

    Returns:
        dict mapping ``user_key`` (in file order) to a list of floats. A user
        with no read news gets an all-zero vector of the same length as the
        news vectors, so every user vector has the same dimension.

    Raises:
        KeyError: if a user references a ``news_key`` that is missing from
            ``news_vector_dict``.
    """
    with codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8') as user_file:
        user_dict = json.load(user_file)

    # Feature dimension, taken from any news vector (0 when there are none).
    dim = len(next(iter(news_vector_dict.values()), []))

    user_vectors = {}
    for count, (user_key, read_news) in enumerate(user_dict.items(), start=1):
        # Sum the vectors of everything this user read, then divide by the count.
        vector_sum = numpy.zeros(dim)
        read_count = 0
        for news_key in read_news:
            vector_sum = numpy.asarray(news_vector_dict[news_key], dtype=float) + vector_sum
            read_count += 1
        if read_count:
            vector_sum /= read_count

        user_vector = vector_sum.tolist()
        user_vectors.setdefault(user_key, user_vector)

        if count % 1000 == 0:
            print('j=' + str(count))
            print(user_vector[:10])
    return user_vectors


# user_key -> mean of the tf-idf vectors of the news that user read.
_user_vector_dict = user_vector_dict(_news_vector_dict)

In [217]:
def k_n_n(news_dict, user_dict, k):
    """Find the nearest news vectors for every user vector.

    Args:
        news_dict: mapping ``news_key`` -> feature vector. Its iteration
            order defines the row indices reported in the result.
        user_dict: mapping ``user_key`` -> feature vector of the same
            dimension as the news vectors.
        k: number of neighbours requested per user. It is capped at the
            number of news items, because ``kneighbors`` rejects a larger
            value.

    Returns:
        A flat list holding two entries per user, in ``user_dict`` iteration
        order: ``[dist_0, idx_0, dist_1, idx_1, ...]``. Each entry is the 2-D
        array returned by ``NearestNeighbors.kneighbors`` for that single
        query row, so user ``i`` lives at positions ``2*i`` and ``2*i + 1``.
    """
    news = []
    for count, news_key in enumerate(news_dict, start=1):
        news.append(news_dict[news_key])
        if count % 1000 == 0:
            print(count)

    print("training...")
    # Cap k only when there is something to cap against; with no news at all
    # the original request is passed through and fit() reports the problem.
    n_neighbors = min(k, len(news)) if news else k
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(news)

    nbrs = []
    for count, user_key in enumerate(user_dict, start=1):
        # kneighbors returns (distances, indices); extend() keeps the flat
        # [dist, idx, dist, idx, ...] layout that callers index into.
        nbrs.extend(neigh.kneighbors([user_dict[user_key]]))
        if count % 50 == 0:
            print(count)

    print(nbrs[:10])
    return nbrs

# Number of candidate news items retrieved per user.
k = 500
# Flat list [dist_0, idx_0, dist_1, idx_1, ...], two entries per user,
# in the iteration order of _user_vector_dict.
n = k_n_n(_news_vector_dict, _user_vector_dict, k)

1000
2000
3000
training...
50
100
150
200
250
300
[array([[ 0.17700123,  0.17700123,  0.17700123,  0.17700123,  0.17700123,
         0.17700123,  0.17700123,  0.17700123,  0.17700123,  0.17700123,
         0.17700123,  0.94407266,  0.9465537 ,  0.94878542,  0.94900248,
         0.95033959,  0.95108368,  0.95322756,  0.95413093,  0.96213788,
         0.96375227,  0.96678936,  0.96781948,  0.96781948,  0.96781948,
         0.96892709,  0.97023141,  0.97285806,  0.97723729,  0.98123466,
         0.98172367,  0.98184627,  0.98222133,  0.98256536,  0.98285952,
         0.98323802,  0.98323802,  0.98341934,  0.985301  ,  0.985301  ,
         0.98592309,  0.98605049,  0.98609422,  0.98620597,  0.98674199,
         0.987591  ,  0.987591  ,  0.98783526,  0.9879925 ,  0.98876361,
         0.98881069,  0.98894329,  0.98905444,  0.98938074,  0.98963185,
         0.98990724,  0.99001619,  0.99009422,  0.99010861,  0.99021308,
         0.99055288,  0.99076653,  0.99088305,  0.99090413,  0.99106298,


In [221]:
def time_scale(t):
    """Return the weight ``ln(days) + 1`` for timestamp ``t``.

    ``days`` is the number of whole days between the reference timestamp
    1393603200 and ``t`` (truncated toward zero) and is clamped to at least
    1, so the result is never below 1.0.
    """
    elapsed_days = int((t - 1393603200) / 86400)
    return math.log(max(elapsed_days, 1)) + 1

# Turn the k-NN output `n` into a top-N recommendation list per user.

# Maximum number of news items recommended to each user.
TOP_N = 20

# Row index -> news_key, in the same iteration order k_n_n used for fitting.
news_keys = list(_news_vector_dict)

with codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8') as file:
    user_news_dict = json.load(file)
with codecs.open('./data/_news_data_clean.json', 'r', 'utf-8') as f_news_data:
    news_data = json.load(f_news_data)

result = {}
lens = []
# NOTE(review): this relies on user_news_dict iterating in the same order as
# the user dictionary that was passed to k_n_n (both come from the same JSON
# file), because `n` is addressed by position only.
for i, user_key in enumerate(user_news_dict):
    # `n` is flat: entries 2*i and 2*i+1 hold the distances and the news row
    # indices of the i-th user.
    dist = n[2*i][0].tolist()
    indices = n[2*i+1][0].tolist()
    already_read = user_news_dict[user_key]

    pair = []
    for d, index in zip(dist, indices):
        news_id = news_keys[index]
        # Skip news the user already read and news whose item 2 (treated as
        # a timestamp) lies before the reference time 1393603200.
        if news_id in already_read or news_data[news_id][2] < 1393603200:
            continue
        time_ratio = time_scale(news_data[news_id][2])
        pair.append([d * time_ratio, news_id])

    # NOTE(review): the score is distance * time weight, sorted descending,
    # so among the retrieved neighbours a LARGER distance ranks higher.
    # Confirm this is intended; behaviour is deliberately left unchanged.
    pair.sort(key=lambda x: x[0], reverse=True)

    # Slice instead of indexing 0..19: a user may have fewer than TOP_N
    # candidates left after filtering.
    user_news_keys = [news_id for _, news_id in pair[:TOP_N]]
    result.setdefault(user_key, user_news_keys)

    # Debug output for the first ten users: score and time weight per candidate.
    if i < 10:
        for p in pair:
            print(p[0], time_scale(news_data[p[1]][2]))
        print("\n")

    lens.append(len(result[user_key]))

# Smallest recommendation-list sizes, to spot users with too few candidates.
lens.sort()
print(lens[:10])



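# Earlier variant of the cell above, kept for reference: it recommends every
# unread neighbour, without the time weighting or the top-20 cut.
# news_keys = []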
# i = 0
# for news_key in _news_vector_dict:
#     news_keys.append(news_key)

    
# file = codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8')
# user_news_dict = json.load(file)

# result = {}
# i = 0
# lens = []
# for user_key in user_news_dict:
#     indices = n[2*i+1][0].tolist()
#     user_news_keys = []
#     for index in indices:
#         user_news_key = news_keys[index]
#         if user_news_key not in user_news_dict[user_key]:
#             user_news_keys.append(user_news_key)
#     result.setdefault(user_key, user_news_keys)
# #     if i < 100:
# #         print(len(result[user_key]))
# #         print(result[user_key])
# #         print(user_news_dict[user_key])
#     i += 1
#     lens.append(len(result[user_key]))
# print(lens[:10])
# lens.sort()
# print(lens[:10])



4.40856644614594 4.367295829986475
4.380838680723331 4.332204510175204
4.377877470487477 4.332204510175204
4.344112777148162 4.295836866004329
4.34154002383973 4.295836866004329
4.329424368231723 4.295836866004329
4.325731661528345 4.295836866004329
4.31860740543447 4.401197381662156
4.314224563643093 4.295836866004329
4.302921605984047 4.258096538021482
4.302561578832276 4.258096538021482
4.2975036256235715 4.295836866004329
4.295617709678641 4.258096538021482
4.266372151868254 4.218875824868201
4.265850621091874 4.218875824868201
4.2615041593130725 4.218875824868201
4.249078997187283 4.218875824868201
4.246423947322808 4.218875824868201
4.241287158953861 4.218875824868201
4.224329353268246 4.178053830347945
4.223730889837655 4.178053830347945
4.220933391407815 4.178053830347945
4.214130224742192 4.178053830347945
4.2126693202419725 4.178053830347945
4.209599771051364 4.178053830347945
4.202697807238672 4.178053830347945
4.181631046652214 4.13549421592915
4.1783589683411275 4.13549421

1.740434181652749 1.6931471805599454
1.740434181652749 1.6931471805599454
1.7398509938538818 1.6931471805599454
1.7387915798637934 1.6931471805599454
1.7377342065790498 1.6931471805599454
1.7323978271968372 1.6931471805599454
1.730843416729448 1.6931471805599454
1.7107238151463797 1.6931471805599454
1.7047180758387868 1.6931471805599454
1.6818934340039884 1.6931471805599454
1.668330908263494 1.6931471805599454
1.668330908263494 1.6931471805599454
1.655176671365186 1.6931471805599454
1.1607318985193082 4.367295829986475
1.1212851009984468 4.218875824868201
1.1104355059525834 4.178053830347945
1.0294097909444972 1.0
1.0293532182795313 1.0
1.0291683887717937 1.0
1.029137644745349 1.0
1.029096423296488 1.0
1.0290277928419478 1.0
1.028957742044499 1.0
1.028911280159679 1.0
1.0285851020515149 1.0
1.0285282438212622 1.0
1.0284704281358652 1.0
1.0283324383621888 1.0
1.0283218423344114 1.0
1.0282264607958613 1.0
1.0282126159413658 1.0
1.0281514768484266 1.0
1.0281153560300829 1.0
1.028088429237

2.4199966642201733 2.386294361119891
2.4186339012491067 2.386294361119891
2.4186339012491067 2.386294361119891
2.4184593969479224 2.386294361119891
2.4107361684714674 2.386294361119891
2.4066516875178414 2.386294361119891
2.383107042989529 2.386294361119891
2.152759416907396 2.09861228866811
2.152024188621677 2.09861228866811
2.151616325914419 2.09861228866811
2.151110025183991 2.09861228866811
2.1510479589825695 2.09861228866811
2.147029718815434 2.09861228866811
2.1461043550204075 2.09861228866811
2.144975403958117 2.09861228866811
2.1440490518487803 2.09861228866811
2.1440490518487803 2.09861228866811
2.143813572871375 2.09861228866811
2.1407756828300397 2.09861228866811
2.1295056295006023 2.09861228866811
2.1289867576146118 2.09861228866811
2.06025323691631 2.09861228866811
2.0350685256964014 2.09861228866811
1.7391900351260743 1.6931471805599454
1.738434144556978 1.6931471805599454
1.737120547745207 1.6931471805599454
1.737120547745207 1.6931471805599454
1.7350515339936456 1.69314

1.843473124913974 1.6931471805599454
1.8430612270318647 1.6931471805599454
1.8430612270318647 1.6931471805599454
1.8402116178394243 1.6931471805599454
1.8251415559909576 1.6931471805599454
1.7909194311447167 1.6931471805599454
1.7187287185604292 1.6931471805599454
1.6965840868297037 1.6931471805599454
1.6582904627292614 3.70805020110221
1.6274359126077151 3.6390573296152584
1.5942938199256387 3.5649493574615367
1.5584976328334044 3.4849066497880004
1.5195849620804698 3.3978952727983707
1.1669761110535584 2.6094379124341005
1.0954451150103321 1.0
1.0954451150103321 1.0
1.0954451150103321 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.095445115010332 1.0
1.0954451150103317 1.0
1.0919608175040854 1.0
1.091450781113274 1.0
1.0898261240277096 1.0
1.0883123248592368 1.0
1.087483603524751 1.0
1.0868965288958652 1.0
1.0868965288958652 1.0
1.0867535048686776 1.0
1.086335536854895 1.0
1.086335536854895 1.0

4.61861480524545 4.401197381662156
4.617476042832318 4.401197381662156
4.586399413439315 4.367295829986475
4.586399413439315 4.367295829986475
4.579612027185539 4.367295829986475
4.573278227036772 4.367295829986475
4.547870342780054 4.332204510175204
4.545332250941488 4.332204510175204
4.5392135416897235 4.332204510175204
4.524239470477633 4.332204510175204
4.510261315466838 4.295836866004329
4.5088159437520385 4.295836866004329
4.5061677122290655 4.295836866004329
4.499307272653443 4.295836866004329
4.47091448521718 4.258096538021482
4.468250778255381 4.258096538021482
4.467319954539401 4.258096538021482
4.461620306119482 4.258096538021482
4.46001665616201 4.258096538021482
4.46001665616201 4.258096538021482
4.459809737572835 4.258096538021482
4.459378275682814 4.258096538021482
4.45927947120984 4.258096538021482
4.430933167717804 4.218875824868201
4.429601240125962 4.218875824868201
4.429580417194674 4.218875824868201
4.427455301584058 4.218875824868201
4.426705726039386 4.2188758248

In [222]:
# Persist the per-user recommendation lists; the `with` block closes the file
# even if serialisation fails part-way.
with codecs.open('./data/tfidf_result.json', 'w', 'utf-8') as file_output:
    json.dump(result, file_output)

In [223]:
def time_back(t):
    """Return the whole days between timestamp 1393603200 and ``t``.

    The offset is truncated to whole seconds and the day count is truncated
    toward zero. Note that an earlier cell defines another ``time_back`` that
    returns a float count of 20-day periods; running this cell replaces it.
    """
    reference_ts = 1393603200
    seconds_per_day = 86400
    offset_seconds = int(t - reference_ts)
    return int(offset_seconds / seconds_per_day)

def test(result_root):
    """Print precision and recall of saved recommendations, averaged per user.

    Compares the recommendation lists stored in ``result_root`` with the
    news each user actually read according to
    ``./data/_user_data_validation_clean.json``.

    Every validation user counts in the averages; a user without an entry in
    the result file contributes 0 to both precision and recall. Empty
    recommendation or validation lists contribute 0 instead of dividing by
    zero.

    Args:
        result_root: path of a JSON file mapping ``user_key`` to a list of
            recommended ``news_key`` values.

    Prints:
        One line with three counters (validation reads whose news lies less
        than 10 ``time_back`` units after the reference time, total number of
        recommendations, total number of hits), then the averaged precision
        and recall.
    """
    with codecs.open('./data/_user_data_validation_clean.json', 'r', 'utf-8') as f_user_data_validation:
        validation = json.load(f_user_data_validation)
    with codecs.open(result_root, 'r', 'utf-8') as f_result:
        result = json.load(f_result)
    # NOTE(review): this reads _news_data.json while the other cells read
    # _news_data_clean.json — confirm which file is intended.
    with codecs.open('./data/_news_data.json', 'r', 'utf-8') as f_news_data:
        news_data = json.load(f_news_data)

    early_reads = 0        # validation reads with time_back(...) < 10
    recommended_total = 0  # recommendations inspected over all users
    hits_total = 0         # recommendations that were actually read
    user_num = 0
    precision = 0
    recall = 0
    for key in validation:
        user_num += 1
        if key not in result:
            continue
        recommended = result[key]
        actual = validation[key]

        TP = 0
        for news_id in recommended:
            recommended_total += 1
            if news_id in actual:
                TP += 1
        hits_total += TP

        for a in actual:
            if time_back(news_data[a][2]) < 10:
                early_reads += 1

        # Guard the per-user ratios against empty lists.
        if recommended:
            precision += TP / len(recommended)
        if actual:
            recall += TP / len(actual)

    if user_num:
        precision = precision / user_num
        recall = recall / user_num

    print(early_reads, recommended_total, hits_total)
    print("precision: ", precision)
    print("recall: ", recall)

# Evaluate the recommendations written above against the validation split.
test('./data/tfidf_result.json')

846 6820 200
precision:  0.02932551319648094
recall:  0.031754580854845206
