In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import json
import codecs
import jieba
import numpy

In [2]:
def news_vector_dict(title_scale, doc_scale, min_df):
    file = codecs.open('./data/_news_data_clean.json', 'r', 'utf-8')
    news_dict = json.load(file)
    
    # 分词，在词之间加空格，重新组成文章
#     stop_file = codecs.open('./data/stop_words.txt', 'r', 'utf-8')
#     stop_list = stop_file.read().split('\n')
    i = 0
    title_array = []
    doc_array = []
    for news_key in news_dict:
        title_text = news_dict[news_key][0]
        doc_text = news_dict[news_key][1]
        title = ' '.join(jieba.lcut(title_text))
        doc = ' '.join(jieba.lcut(doc_text))
        title_array.append(title)
        doc_array.append(doc)
        i += 1
        if i % 1000 == 0:
            print(i)
    
    # tf-idf算法，文章转化为一个归一化的向量
    tfidf_vectorizer = TfidfVectorizer(min_df = min_df)
    doc_matrix = tfidf_vectorizer.fit_transform(doc_array)
    title_matrix = tfidf_vectorizer.transform(title_array)
    
    word_bag = {}
    for key in tfidf_vectorizer.vocabulary_:
        word_bag.setdefault(tfidf_vectorizer.vocabulary_[key], key)
    
    # 计算文章加权vector
    news_matrix = (title_matrix.todense() * title_scale + doc_matrix.todense() * doc_scale).tolist()
    
    # 构建news_key : vector字典
    i = 0
    news_vector_dict = {}
    for news_key in news_dict:
        news_vector_dict.setdefault(news_key, news_matrix[i])
        i += 1
        if i % 1000 == 0:
            print('i='+str(i))
            print(news_matrix[i][:10])
            
         #打印文章关键词和权重
        if i < 5:
            news_words = []
            news_words_weight = []
            for j in range(len(news_matrix[i])):
                if news_matrix[i][j] > 0:
                    news_words.append(word_bag[j])
                    news_words_weight.append(news_matrix[i][j])
            print(news_words)
            print(news_words_weight)
            
                    

    
    return news_vector_dict


In [3]:
def user_vector_dict(news_vector_dict, time_scalse):
    file = codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8')
    user_dict = json.load(file)
    
    j = 0
    user_vector_dict = {}
    # 每一个用户
    for user_key in user_dict:
        # 该用户读过的所有新闻的向量和为用户向量
        i = 0
        vector_sum = numpy.matrix('0.0')
        for user_news_key in user_dict[user_key]:
            vector = numpy.matrix(news_vector_dict[user_news_key])
            vector_sum = vector * time_scale + vector_sum
            i += time_scale
        if i != 0:
            vector_sum /= i
        user_vector_dict.setdefault(user_key, vector_sum.tolist()[0])
        j += 1
        if j % 1000 == 0:
            print('j='+str(j))
            print(vector_sum.tolist()[0][:10])
    return user_vector_dict

In [4]:
def k_n_n(news_dict, user_dict, k):
    news_keys = []
    news = []
    i = 0
    for news_key in news_dict:
        news_keys.append(news_key)
        news.append(news_dict[news_key])
        i += 1
        if i % 1000 == 0:
            print(i)
    
    print("training...")
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(news)
    
    user_keys = []
    nbrs = []
    i = 0
    for user_key in user_dict:
        users = []
        user_keys.append(user_key)
        users.append(user_dict[user_key])
        nbrs += neigh.kneighbors(users)
        i += 1
        if i % 50 == 0:
            print(i)
    
#     n = neigh.kneighbors(users)
    print(nbrs[:10])
    return nbrs

In [5]:
title_scale = 0.5
doc_scale = 1.0 - title_scale
news_vector_dict = news_vector_dict(title_scale, doc_scale)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\49325\AppData\Local\Temp\jieba.cache
Loading model cost 1.730 seconds.
Prefix dict has been built succesfully.


1000
2000
3000
['1980', '25', '32', '40', '一个', '一代', '一系列', '一项', '上升', '中心', '之间', '事实', '价值', '债务', '做法', '具有', '决定', '出现', '出生', '划分', '判断', '却是', '发生', '变化', '增幅', '大学', '大学生', '大小', '婴儿', '媒体', '完全', '实际', '工作', '差距', '平均', '年轻', '得出', '找到', '指标', '接受', '收入', '教育', '方式', '时间', '昂贵', '正确', '比较', '毕业生', '环境', '相对', '研究', '社交', '结论', '缓慢', '网络', '自然', '薪酬', '认为', '讨论', '误导', '越来越', '跟踪', '过去', '过多', '近期', '这种', '选择', '通常', '错误', '问题', '高中']
[0.029845115028515466, 0.01803140289681931, 0.025435842014741313, 0.018850919777438427, 0.011023592701523227, 0.12862944997497677, 0.021320855739038604, 0.021851072942620647, 0.019587776026269477, 0.017683037875730067, 0.016690442070188546, 0.024667501921694198, 0.06333467613026283, 0.022059550087459898, 0.022468182976959145, 0.017640951308844778, 0.015566549154949243, 0.014184338280095691, 0.1092002872589263, 0.02802906768474494, 0.021880392821754118, 0.029612319279960303, 0.015118315604948807, 0.01856446083804717, 0.028294357560700156, 0.263273

In [6]:
time_scale = 1.0
user_vector_dict = user_vector_dict(news_vector_dict, time_scale)

In [7]:
k = 30
n = k_n_n(news_vector_dict, user_vector_dict, k)

1000
2000
3000
training...
50
100
150
200
250
300
[array([[ 0.48201817,  0.4852383 ,  0.4898875 ,  0.49452283,  0.49658196,
         0.49723354,  0.49723354,  0.49979032,  0.4997991 ,  0.50098757,
         0.50156905,  0.50199566,  0.50207309,  0.50242897,  0.50342681,
         0.50502202,  0.5052207 ,  0.50566697,  0.50588811,  0.50628461,
         0.5063963 ,  0.50665471,  0.50686464,  0.50835184,  0.50868852,
         0.50909791,  0.50911172,  0.50996473,  0.51005997,  0.51022182]]), array([[  24,    0, 1420, 2543,   23, 1410,  667, 2780,  802, 2375, 1753,
        1639, 2902,  567, 2002, 1378, 1225, 3151, 1927,  587, 1609,  586,
        2137, 2929,    9, 2778, 1754, 1491, 1972, 1204]], dtype=int64), array([[ 0.42379183,  0.50081175,  0.50087689,  0.50907114,  0.5173258 ,
         0.51772205,  0.53261236,  0.53451325,  0.53539364,  0.54114411,
         0.55036248,  0.5504629 ,  0.55468959,  0.55655561,  0.5581864 ,
         0.55857756,  0.55898425,  0.55968552,  0.56066305,  0.560822

In [8]:
# k = 10
# n = k_n_n(news_vector_dict, user_vector_dict, k)

In [9]:
print(1)
print(n[:6])

1
[array([[ 0.48201817,  0.4852383 ,  0.4898875 ,  0.49452283,  0.49658196,
         0.49723354,  0.49723354,  0.49979032,  0.4997991 ,  0.50098757,
         0.50156905,  0.50199566,  0.50207309,  0.50242897,  0.50342681,
         0.50502202,  0.5052207 ,  0.50566697,  0.50588811,  0.50628461,
         0.5063963 ,  0.50665471,  0.50686464,  0.50835184,  0.50868852,
         0.50909791,  0.50911172,  0.50996473,  0.51005997,  0.51022182]]), array([[  24,    0, 1420, 2543,   23, 1410,  667, 2780,  802, 2375, 1753,
        1639, 2902,  567, 2002, 1378, 1225, 3151, 1927,  587, 1609,  586,
        2137, 2929,    9, 2778, 1754, 1491, 1972, 1204]], dtype=int64), array([[ 0.42379183,  0.50081175,  0.50087689,  0.50907114,  0.5173258 ,
         0.51772205,  0.53261236,  0.53451325,  0.53539364,  0.54114411,
         0.55036248,  0.5504629 ,  0.55468959,  0.55655561,  0.5581864 ,
         0.55857756,  0.55898425,  0.55968552,  0.56066305,  0.56082206,
         0.56115694,  0.56181808,  0.5620074

In [10]:
news_keys = []
i = 0
for news_key in news_vector_dict:
    news_keys.append(news_key)

    
file = codecs.open('./data/_user_data_training_clean.json', 'r', 'utf-8')
user_news_dict = json.load(file)

result = {}
i = 0
lens = []
for user_key in user_news_dict:
    indices = n[2*i+1][0].tolist()
    user_news_keys = []
    for index in indices:
        user_news_key = news_keys[index]
        if user_news_key not in user_news_dict[user_key]:
            user_news_keys.append(user_news_key)
    result.setdefault(user_key, user_news_keys)
#     if i < 100:
#         print(len(result[user_key]))
#         print(result[user_key])
#         print(user_news_dict[user_key])
    i += 1
    lens.append(len(result[user_key]))
lens.sort()
print(lens[:10])

[22, 22, 24, 24, 24, 25, 25, 25, 25, 25]


In [11]:
file_output = codecs.open('./data/tfidf_result.json', 'w', 'utf-8')
json.dump(result, file_output)
file_output.close()