In [1]:
##使用TF-IDF算法计算单词权重
import math
import os
from collections import Counter

import numpy as np

In [2]:
##导入目标用户id
def load_uid(path='../../data/uid.txt'):
    """Read the target user IDs, one per line, from a text file.

    Args:
        path: Location of the ID file. Defaults to the originally
            hard-coded ``'../../data/uid.txt'``.

    Returns:
        list[str]: Whitespace-stripped IDs in file order. Blank lines are
        skipped so that a stray empty line does not end up as ``''`` in the
        result (the driver converts every entry with ``int()``, which would
        raise ``ValueError`` on an empty string).
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f1:
        stripped = (line.strip() for line in f1)
        return [entry for entry in stripped if entry]

In [3]:
##导入训练集
def load_train(user_id=None, data_dir='../../data/train/'):
    """Load one user's training file and group post IDs by author.

    Every non-blank line of ``<data_dir>/<user_id>.dat`` is evaluated as a
    Python expression; the result is indexed with the keys ``'id'`` and
    ``'userid'``.

    Args:
        user_id: ID used to build the file name. When omitted, the
            notebook-level global ``uid`` is used, so existing zero-argument
            calls keep working.
        data_dir: Directory that contains the ``<user_id>.dat`` files.

    Returns:
        dict: Maps each ``userid`` found in the file to the list of ``id``
        values of its records, in file order.
    """
    if user_id is None:
        user_id = uid  # fall back to the notebook-level global
    filename = os.path.join(data_dir, str(user_id) + '.dat')

    posts_by_user = {}
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        for raw in f:
            if not raw.strip():
                # A blank line would make eval() raise SyntaxError.
                continue
            # SECURITY NOTE(review): eval() executes whatever code the data
            # file contains. If every line is a plain dict literal,
            # ast.literal_eval() is a safe drop-in; confirm the data format
            # before switching.
            record = eval(raw)
            posts_by_user.setdefault(record['userid'], []).append(record['id'])
    return posts_by_user

In [4]:
##导入微博文本
def load_text(user_id=None, user_posts=None,
              data_dir='../../data/text/after_filter/'):
    """Load the word lists of the posts stored for one user.

    Each non-blank line of ``<data_dir>/<user_id>.dat`` has the form
    ``<post id>:<space-separated words>``. Only the first ``':'`` separates
    the ID from the words, so words that themselves contain a colon are kept
    intact.

    Args:
        user_id: ID used to build the file name. Defaults to the
            notebook-level global ``uid``.
        user_posts: Mapping of user -> list of post IDs. Every post ID listed
            there that has no line in the file gets an empty word list.
            Defaults to the notebook-level global ``uid_wid``.
        data_dir: Directory that contains the ``<user_id>.dat`` files.

    Returns:
        dict: Post ID -> list of distinct words in first-occurrence order.
        Keys read from the file are ``str``.
        NOTE(review): post IDs in ``user_posts`` only match file entries if
        they compare equal to those string keys; confirm the training data
        stores IDs as strings.
    """
    if user_id is None:
        user_id = uid  # fall back to the notebook-level global
    if user_posts is None:
        user_posts = uid_wid  # fall back to the notebook-level global
    filename = os.path.join(data_dir, str(user_id) + '.dat')

    text = {}
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # Skip blank lines instead of treating them as end of file,
                # which would silently drop every later post.
                continue
            post_id, _, body = line.partition(':')
            # Drop empty tokens produced by consecutive spaces, then
            # de-duplicate while keeping a stable first-occurrence order
            # (a set would give a run-dependent order).
            tokens = (tok for tok in body.split(' ') if tok)
            text[post_id] = list(dict.fromkeys(tokens))

    # Make sure every known post can be looked up later without KeyError.
    for post_ids in user_posts.values():
        for post_id in post_ids:
            text.setdefault(post_id, [])
    return text

In [5]:
def get_word_list(user_id=None, user_posts=None, post_words=None):
    """Collect the words of one user's posts and index the distinct ones.

    Args:
        user_id: User whose posts are scanned. Defaults to the notebook-level
            global ``uid``.
        user_posts: Mapping of user -> list of post IDs. Defaults to the
            notebook-level global ``uid_wid``.
        post_words: Mapping of post ID -> list of words. Defaults to the
            notebook-level global ``text``.

    Returns:
        tuple: ``(word_list, word_dict)``. ``word_list`` holds every word of
        every scanned post in encounter order, repeats included.
        ``word_dict`` maps each distinct word to a 0-based index assigned in
        order of first appearance.

    Raises:
        KeyError: If ``user_id`` is not a key of ``user_posts`` or one of its
            post IDs is not a key of ``post_words``.
    """
    if user_id is None:
        user_id = uid  # fall back to the notebook-level global
    if user_posts is None:
        user_posts = uid_wid  # fall back to the notebook-level global
    if post_words is None:
        post_words = text  # fall back to the notebook-level global

    word_list = []
    word_dict = {}
    for post_id in user_posts[user_id]:
        for word in post_words[post_id]:
            # len(word_dict) is the next free index; setdefault only stores
            # it the first time a word is seen.
            word_dict.setdefault(word, len(word_dict))
            word_list.append(word)
    return word_list, word_dict

In [6]:
##计算TF-IDF权重 
def compute_tf_idf(words=None, vocab=None, user_posts=None, post_words=None):
    """Compute a TF-IDF weight for every word in the vocabulary.

    * TF of a word is its number of occurrences in ``words`` divided by
      ``len(words)``.
    * IDF treats each user as one document:
      ``log(n_users / (users_with_word + 1)) + 1``, where a user "has" a word
      if any of that user's posts contains it.

    Args:
        words: Word occurrences of the target user. Defaults to the
            notebook-level global ``word_list``.
        vocab: Iterable of distinct words to score (a dict keyed by word in
            the notebook). Defaults to the notebook-level global
            ``word_dict``.
        user_posts: Mapping of user -> list of post IDs. Defaults to the
            notebook-level global ``uid_wid``.
        post_words: Mapping of post ID -> list of words. Defaults to the
            notebook-level global ``text``.

    Returns:
        dict: Word -> TF-IDF weight, in the iteration order of ``vocab``.

    Raises:
        ZeroDivisionError: If ``vocab`` is non-empty while ``words`` or
            ``user_posts`` is empty.
    """
    if words is None:
        words = word_list  # fall back to the notebook-level global
    if vocab is None:
        vocab = word_dict  # fall back to the notebook-level global
    if user_posts is None:
        user_posts = uid_wid  # fall back to the notebook-level global
    if post_words is None:
        post_words = text  # fall back to the notebook-level global

    # Term frequency: count all occurrences in one pass instead of calling
    # list.count() once per vocabulary word.
    total = len(words)
    occurrences = Counter(words)

    # Document frequency: build each user's word set once, then count in how
    # many users' sets each word appears.
    n_users = len(user_posts)
    users_with_word = Counter()
    for post_ids in user_posts.values():
        seen = set()
        for post_id in post_ids:
            seen.update(post_words.get(post_id, ()))
        users_with_word.update(seen)

    tfidf = {}
    for word in vocab:
        tf = occurrences[word] / total
        # +1 in the denominator and +1 after the log are the smoothing terms
        # used by the original formula.
        idf = math.log(n_users / (users_with_word[word] + 1)) + 1
        tfidf[word] = tf * idf
    return tfidf

In [10]:
##保存结果
def save_result(scores=None, user_id=None, out_dir='./user_interest_tag/'):
    """Write a user's words to disk, highest weight first.

    The output file is ``<out_dir>/<user_id>_usertag_by_tfidf.dat`` and holds
    one ``<word>\\t<weight>`` line per word. Words with equal weight keep the
    order they have in ``scores``.

    Args:
        scores: Mapping of word -> weight. Defaults to the notebook-level
            global ``tfidf``.
        user_id: ID used in the file name. Defaults to the notebook-level
            global ``uid``.
        out_dir: Target directory; it is created if it does not exist yet.
    """
    if scores is None:
        scores = tfidf  # fall back to the notebook-level global
    if user_id is None:
        user_id = uid  # fall back to the notebook-level global

    # open(..., 'w') does not create missing directories, so do it up front.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, str(user_id) + '_usertag_by_tfidf.dat')

    user_tag = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(word + '\t' + str(weight) + '\n'
                     for word, weight in user_tag)

In [11]:
if __name__ == '__main__':
    # Run the full pipeline once per target user. The helper functions are
    # called without arguments and read the module-level names assigned
    # below, so each name and the order of the steps must stay as they are.
    uid_list = load_uid()
    for raw_uid in uid_list:
        uid = int(raw_uid)

        # Load the training set.
        uid_wid = load_train()

        # Load the post texts.
        text = load_text()

        # Build the word list and the word index.
        word_list, word_dict = get_word_list()

        # Compute the TF-IDF weights.
        tfidf = compute_tf_idf()

        # Save the result.
        save_result()