In [1]:
import math
import numpy as np 
# Topic strings that load_train() skips outright when counting topic
# frequencies (checked after the topic text has been normalised).
filter_list = [
    '微博正文',
    '网页链接',
    '查看图片',
    '查看话题详情',
]
# How many of a user's highest-scoring TF-IDF topics the driver loop keeps
# as that user's interest-topic set.
topic_num = 10

In [2]:
def load_uid():
    """Read the user-id list from ``../../data/uid.txt``.

    Every line is stripped of surrounding whitespace. Lines that are empty
    after stripping (for example a trailing blank line) are skipped, because
    the driver loop converts each entry with ``int()`` and would fail on
    an empty string.

    Returns:
        list[str]: the user ids, in file order.
    """
    with open('../../data/uid.txt', 'r', encoding='utf-8') as f1:
        # The ``with`` block closes the file on exit; no explicit close().
        stripped = (line.strip() for line in f1)
        return [entry for entry in stripped if entry]

In [3]:
def load_train():
    """Load the training posts for the current module-level ``uid``.

    Reads ``../../data/train/<uid>.dat`` line by line. Each line is evaluated
    as a Python expression and must yield a mapping with the keys ``'id'``,
    ``'userid'``, ``'topic'`` (an iterable of topic strings) and
    ``'original'`` (a container that may hold an ``'id'`` entry).

    Returns:
        tuple: ``(uid_wid, original, topic_list, w_topic)`` where

        * ``uid_wid`` maps a user id to the list of post ids seen for it,
        * ``original`` maps a post id to ``record['original']['id']`` when
          that key exists, otherwise to the post's own id,
        * ``topic_list`` is the set of normalised topic names counted more
          than twice across the file,
        * ``w_topic`` maps a post id to its raw (un-normalised) topic list.
    """
    path = '../../data/train/' + str(uid) + '.dat'
    topic_counts = {}     # normalised topic name -> number of occurrences
    posts_by_user = {}    # user id -> [post id, ...]
    topics_by_post = {}   # post id -> raw topic list from the record
    source_of = {}        # post id -> referenced original id (or itself)

    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            # NOTE(review): eval() runs whatever expression the data file
            # contains. Acceptable only for trusted, locally produced files;
            # consider ast.literal_eval if the records are plain literals.
            record = eval(raw)
            wid = record['id']
            userid = record['userid']
            t_list = record['topic']
            topics_by_post[wid] = t_list

            originalid = record['original']
            source_of[wid] = originalid['id'] if 'id' in originalid else wid

            posts_by_user.setdefault(userid, []).append(wid)

            for t in t_list:
                # Normalise: trim, drop inner spaces and '#', lower-case.
                name = t.strip().replace(' ', '').replace('#', '').lower()
                # Keep only names of length >= 2 that neither end in one of
                # the two excluded suffixes nor appear in filter_list.
                if (len(name) >= 2
                        and not name.endswith('视频')
                        and not name.endswith('美拍')
                        and name not in filter_list):
                    topic_counts[name] = topic_counts.get(name, 0) + 1

    # Drop low-frequency topics: keep those counted more than twice.
    frequent = {name for name, seen in topic_counts.items() if seen > 2}
    return posts_by_user, source_of, frequent, topics_by_post

In [4]:
def load_test():
    """Load the test posts for the current module-level ``uid``.

    Reads ``../../data/test/<uid>.dat``; each line is evaluated as a Python
    expression yielding a mapping with the keys ``'id'``, ``'userid'``,
    ``'original'`` and ``'topic'``.

    Side effects: the module-level dictionaries ``w_topic``, ``original`` and
    ``uid_wid`` are updated in place with the test records.

    Returns:
        tuple: ``(test, user_test)`` - the ids of all posts in the file, and
        the ids of those posts whose ``userid`` equals ``uid``.
    """
    path = '../../data/test/' + str(uid) + '.dat'
    all_wids = []
    own_wids = []

    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            # NOTE(review): eval() executes the file's content; only safe
            # for trusted, locally produced data files.
            record = eval(raw)
            wid = record['id']
            userid = record['userid']
            originalid = record['original']

            w_topic[wid] = record['topic']
            # Fall back to the post's own id when no original id is given.
            original[wid] = originalid['id'] if 'id' in originalid else wid
            uid_wid.setdefault(userid, []).append(wid)

            all_wids.append(wid)
            if userid == uid:
                own_wids.append(wid)

    return all_wids, own_wids

In [5]:
def compute_tf_idf():
    """Score every topic in ``topic`` for the current user with TF-IDF.

    Reads the module-level names ``topic`` (a set of topic names), ``uid``,
    ``uid_wid`` (user id -> post ids) and ``w_topic`` (post id -> topic list).

    * TF: for each of the user's posts, every vocabulary topic present in the
      post (duplicates within one post count once) adds one; the counts are
      divided by the total number of such matches.
    * IDF: ``log(n / (count + 1)) + 1`` where ``n`` is the number of users in
      ``uid_wid`` and ``count`` is the number of users with at least one post
      containing the topic. Topics with TF 0 keep IDF 0.

    When the user has no posts, or none of them matches the vocabulary, every
    score is 0 instead of raising ``KeyError`` / ``ZeroDivisionError``.

    NOTE(review): the vocabulary built by load_train() is normalised
    (stripped, '#' removed, lower-cased) while ``w_topic`` holds the raw
    strings, so only raw topics that already equal their normalised form can
    match here - confirm this is intended.

    Returns:
        dict: topic name -> TF-IDF score, in the iteration order of ``topic``.
    """
    tf = {}
    idf = {}
    for item in topic:
        tf[item] = 0
        idf[item] = 0

    # --- term frequency over the current user's posts ---
    num = 0
    for w in uid_wid.get(uid, ()):
        # set() removes duplicate topics inside one post before intersecting.
        for item in topic & set(w_topic[w]):
            tf[item] += 1
            num += 1
    if num:  # guard: nothing matched -> leave all frequencies at 0
        for item in tf:
            tf[item] /= num

    # --- inverse document frequency, one "document" per user ---
    used = [item for item in tf if tf[item] != 0]
    if used:
        n = len(uid_wid)
        # Collect each user's topics once instead of rescanning every post
        # for every vocabulary entry.
        per_user_topics = []
        for wids in uid_wid.values():
            seen = set()
            for wid in wids:
                seen.update(w_topic[wid])
            per_user_topics.append(seen)
        for item in used:
            count = sum(1 for seen in per_user_topics if item in seen)
            idf[item] = math.log(n / (count + 1)) + 1

    return {item: tf[item] * idf[item] for item in tf}

In [6]:
def save_result():
    """Write the current user's interest topics to disk.

    Sorts the module-level ``tfidf`` mapping by score, highest first, and
    writes one ``<topic>\\t<score>`` line per entry to
    ``./user_interest_topic/<uid>_usertopic_by_tfidf.dat`` (UTF-8), where
    ``uid`` is the module-level user id.
    """
    ranked = sorted(tfidf.items(), key=lambda pair: pair[1], reverse=True)
    target = './user_interest_topic/' + str(uid) + '_usertopic_by_tfidf.dat'
    # The context manager closes the file when the block ends.
    with open(target, 'w', encoding='utf-8') as f:
        for name, weight in ranked:
            f.write(name + '\t' + str(weight) + '\n')

In [10]:
def save_recall_result(result):
    """Write the recalled post ids to ``./weibo_by_topic/<uid>.dat``.

    Args:
        result: iterable of recalled ids; each is written with ``str()`` on
            its own line, in iteration order (UTF-8 encoded). ``uid`` is the
            module-level user id.
    """
    target = './weibo_by_topic/' + str(uid) + '.dat'
    # The context manager closes the file when the block ends.
    with open(target, 'w', encoding='utf-8') as f:
        for entry in result:
            f.write(str(entry) + '\n')

In [8]:
def get_result(result):
    """Count how many recalled ids the current user actually interacted with.

    The reference set is ``original[wid]`` for every ``wid`` in the
    module-level ``user_test`` list. Every recalled id found in that set is
    also added to the module-level ``hit_set``.

    Args:
        result: iterable of recalled ids.

    Returns:
        int: number of entries of ``result`` that are in the reference set
        (a repeated entry in ``result`` is counted each time it occurs).
    """
    # A set gives constant-time membership tests; the ids are hashable
    # because callers already collect them in a set.
    interacted = {original[wid] for wid in user_test}
    count = 0
    for item in result:
        if item in interacted:
            count += 1
            hit_set.add(item)
    return count

In [14]:
def usertopic_recall():
    """Recall test posts whose topics overlap the user's interest topics.

    Reads the module-level names ``test``, ``w_topic``, ``topic``, ``tfidf``,
    ``original`` and ``num``. Each post in ``test`` is scored with the sum of
    ``tfidf`` weights of the interest topics it carries. Posts are then taken
    in descending score order and ``original[post]`` is added to the result
    until a score of 0 or less is reached or the result holds ``num`` ids.

    Returns:
        tuple: ``(result, hit_count)`` - the set of recalled ids and the hit
        count reported by ``get_result(result)``.
    """
    score = {}
    for wid in test:
        total = 0
        # Explicit accumulation keeps the floating-point rounding of a plain
        # running sum.
        for item in topic & set(w_topic[wid]):
            total += tfidf[item]
        score[wid] = total

    ranked = sorted(score.items(), key=lambda pair: pair[1], reverse=True)

    result = set()
    for wid, value in ranked:
        if value <= 0:
            # Everything after this point has no matching interest topic.
            break
        result.add(original[wid])
        if len(result) >= num:
            break

    hit_count = get_result(result)
    return result, hit_count

In [15]:
if __name__ == '__main__':
    # Driver: for each recall cut-off ``num`` run the whole pipeline for every
    # user and print aggregate recall / precision. The names assigned here
    # (uid, uid_wid, original, topic, w_topic, tfidf, test, user_test, num,
    # hit_set) are module-level and are read by the functions defined above.
    uid_list = load_uid()
    for num in [10,20,30,40,50]:
        hit_count = 0
        hit_set = set()
        # NOTE(review): everything before usertopic_recall() does not read
        # ``num`` and is recomputed for every cut-off.
        for uid in uid_list:
            uid = int(uid)
            uid_wid,original,topic,w_topic = load_train()  # load the training posts
            tfidf = compute_tf_idf()
            user_tag = sorted(tfidf.items(),key=lambda x:x[1],reverse=True)
            # Replace the vocabulary with the user's top ``topic_num`` topics.
            # Slicing copes with users that have fewer topics than that
            # (indexing user_tag[i] would raise IndexError).
            topic = {entry[0] for entry in user_tag[:topic_num]}
            save_result()
            test,user_test = load_test()
            result,count = usertopic_recall()
            hit_count += count
            save_recall_result(result)
        # NOTE(review): 800 and 20 are hard-coded denominators - presumably
        # the total number of relevant test posts and the number of users;
        # confirm against the data set before reusing with other data.
        print("UT top{},recall: {}".format(num,hit_count/800))
        print("UT top{},precision: {}".format(num,hit_count/(num*20)))

UT top10,recall: 0.0675
UT top10,precision: 0.27
UT top20,recall: 0.07875
UT top20,precision: 0.1575
UT top30,recall: 0.0925
UT top30,precision: 0.12333333333333334
UT top40,recall: 0.11125
UT top40,precision: 0.11125
UT top50,recall: 0.11625
UT top50,precision: 0.093
