In [1]:
##Extract user interest tags with the TextRank algorithm
import math
import numpy as np 
win = 5 # window size: init_weight links each word to the words fewer than `win` positions away from it

In [2]:
##Load the uid list
def load_uid():
    """Read user ids from '../../data/uid.txt', one id per line.

    Returns:
        list[str]: every non-blank line with surrounding whitespace
        stripped, in file order.
    """
    uid_list = []
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open('../../data/uid.txt','r',encoding = 'utf-8') as f1:
        for line in f1:
            entry = line.strip()
            # Skip blank / whitespace-only lines so callers never receive an
            # empty id (an empty string cannot be converted with int()).
            if entry:
                uid_list.append(entry)
    return uid_list

In [3]:
##Load the training-set samples
def load_train():
    """Collect the ids of the current user's training records.

    Reads '../../data/train/<uid>.dat', where `uid` is the module-level
    variable set by the caller. Each line is parsed as a Python literal and
    is expected to be a mapping with the keys 'id' and 'userid'.

    Reading stops at end of file or at the first record whose 'userid'
    differs from `uid`.

    Returns:
        list: the 'id' values of the leading records that belong to `uid`,
        in file order.

    Raises:
        ValueError / SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # local import keeps this block self-contained

    wid = []
    filename = '../../data/train/'+str(uid)+'.dat'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename,'r',encoding='utf-8') as f:
        for line in f:
            # SECURITY: this used to be eval(line), which executes arbitrary
            # code found in the data file. literal_eval only accepts Python
            # literals (dict, list, str, numbers, ...) and cannot run code.
            record = ast.literal_eval(line)
            w = record['id']
            userid = record['userid']
            if userid != uid:
                # Records of another user mark the end of this user's samples.
                break
            wid.append(w)
    return wid

In [4]:
##Load the text of the user's training-set posts
def load_text():
    """Load the tokenised text of the posts listed in the module-level `wid`.

    Reads '../../data/text/after_filter/<uid>.dat' (`uid` is a module-level
    variable). Each line is split on ':'; the first field is the post id and
    the second field is split on single spaces into words.

    Reading stops at end of file, at the first blank line, or at the first
    line whose id is not in `wid`.

    Returns:
        dict: post id (str) -> list of words (list[str]).

    Raises:
        IndexError: if a non-blank line contains no ':'.
    """
    text = {}
    # Build the lookup once: membership in a set is O(1), whereas scanning the
    # `wid` list for every line costs O(len(wid)) per line.
    known_ids = set(wid)
    filename = '../../data/text/after_filter/'+str(uid)+'.dat'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename,'r',encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                break
            fields = line.split(':')
            # NOTE(review): only the second ':'-separated field is used, so
            # anything after a second ':' on the line is dropped - confirm
            # that words in this file can never contain ':'.
            words = fields[1].split(' ')
            if fields[0] not in known_ids:
                break
            text[fields[0]] = words
    return text

In [16]:
##Load the per-word TF-IDF scores
def load_tfidf():
    """Read the current user's TF-IDF scores from disk.

    The file './user_interest_tag/<uid>_usertag_by_tfidf.dat' (`uid` is a
    module-level variable) holds one tab-separated record per line: the word
    first, then its score. Reading stops at end of file or at the first blank
    line.

    Returns:
        dict: word (str) -> score (float).
    """
    path = './user_interest_tag/'+ str(uid) + '_usertag_by_tfidf.dat'
    scores = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                break
            fields = record.split('\t')
            scores[fields[0]] = float(fields[1])
    return scores

In [6]:
##Initialise the edge weights
def init_weight():
    """Build the word co-occurrence graph from the module-level `text`.

    For every post in `text` and every word position `pos`, each other
    position in range(max(0, pos-win+1), min(pos+win, n)) is treated as a
    neighbour (`win` is the module-level window size). For every such pair
    both directions, centre->other and other->centre, are incremented by one.
    Because the pair is visited again when the other position becomes the
    centre, each co-occurrence is counted from both ends.

    Returns:
        dict: word -> {neighbour word -> accumulated count}.
    """
    graph = {}
    for tokens in text.values():
        total = len(tokens)
        for pos, center in enumerate(tokens):
            neighbours = graph.setdefault(center, {})
            first = max(0, pos - win + 1)
            stop = min(pos + win, total)
            for other_pos in range(first, stop):
                if other_pos == pos:
                    continue
                other = tokens[other_pos]
                # forward edge: center -> other
                neighbours[other] = neighbours.get(other, 0) + 1
                # backward edge: other -> center
                reverse = graph.setdefault(other, {})
                reverse[center] = reverse.get(center, 0) + 1
    return graph

In [7]:
##Initialise the total outgoing weight of every node
def init_out(weight):
    """Sum the outgoing edge weights of every word in the module-level `tfidf`.

    Args:
        weight: dict word -> {neighbour -> edge weight}; must contain an
            entry for every key of `tfidf`.

    Returns:
        dict: word -> sum of the weights of its outgoing edges.

    Raises:
        KeyError: if a word of `tfidf` has no entry in `weight`.
    """
    totals = {}
    for node in tfidf:
        degree = 0
        for edge_weight in weight[node].values():
            degree += edge_weight
        totals[node] = degree
    return totals

In [8]:
def textRank(d=0.85, iterator=100):
    """Score words with a TF-IDF-biased TextRank iteration.

    The graph comes from init_weight() and the per-node outgoing totals from
    init_out(); the module-level `tfidf` supplies the bias. Scores start at
    (1-d) * tfidf[word] and are updated in place, so a node updated earlier
    in a sweep already influences nodes updated later in the same sweep.

    Update rule for each node of the graph:
        value[item] = (1-d) + d * S * (tfidf[item] + 1)
    where S is the sum over neighbours of weight[item][n] * value[n] / out[n].

    Args:
        d: damping factor (default 0.85, the previously hard-coded value).
        iterator: number of full sweeps over the graph (default 100, the
            previously hard-coded value).

    Returns:
        dict: word -> score.

    Raises:
        KeyError: if the graph contains a word that has no entry in `tfidf`.
    """
    weight = init_weight()
    out = init_out(weight)
    value = {}
    for item in tfidf.keys():
        value[item] = (1-d) * tfidf[item]
    for _ in range(iterator):
        for item, neighbours in weight.items():
            # Accumulate with an explicit loop (and a name that does not
            # shadow the built-in sum) so the floating-point addition order
            # stays exactly as before.
            incoming = 0
            for word, edge in neighbours.items():
                incoming += (edge * value[word])/out[word]
            # A variant that was tried and left disabled:
            #   value[item] = (1-d) * tfidf[item] + d * incoming
            value[item] = (1-d)  + d * incoming * (tfidf[item] + 1 )
    return value

In [9]:
# Build the set of single-item candidates (candidate itemsets of size 1)
def createC1(dataSet):
    """Return every distinct item of `dataSet` as a one-element frozenset.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        list[frozenset]: one frozenset per distinct item, ordered by item.
        frozensets are used because they can serve as dict keys. A real list
        is returned (not a lazy `map` object, which in Python 3 is exhausted
        after a single pass) so the result can be iterated repeatedly and
        measured with len().
    """
    items = sorted({item for transaction in dataSet for item in transaction})
    return [frozenset([item]) for item in items]

# Count the support of every candidate in Ck and filter by the threshold
def scanD(D, Ck, minSupport):
    """Count in how many transactions each candidate occurs.

    Args:
        D: collection of transactions (sets); it is traversed once per
            candidate, so it must be re-iterable.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum absolute count for a candidate to be kept.

    Returns:
        tuple: (retList, supportData) where retList lists the candidates
        whose count is >= minSupport and supportData maps every candidate
        that occurred at least once to its count.
    """
    counts = {}
    for candidate in Ck:
        for transaction in D:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1
    # threshold filter (Lk)
    frequent = [itemset for itemset, hits in counts.items() if hits >= minSupport]
    # support of every candidate seen
    supports = dict(counts)
    return frequent, supports

In [10]:
# Build the next candidate set Ck
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first k-2 items, taken in sorted
    order, are equal.

    The items are sorted BEFORE the prefix is taken. Slicing the raw
    list(frozenset) first, as was done previously, compares whatever items
    the set happens to yield first, so the result depended on set iteration
    order and could miss valid candidates or emit the same candidate twice.

    Args:
        Lk: list of frozensets, all of size k-1.
        k: size of the candidates to generate.

    Returns:
        list[frozenset]: the merged candidates, in pair order (i < j).
    """
    prefix_len = k - 2
    # Sort each itemset once instead of once per pair.
    prefixes = [sorted(itemset)[:prefix_len] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            if prefixes[i] == prefixes[j]: # same first k-2 items
                retList.append(Lk[i] | Lk[j]) # merge
    return retList

def apriori(dataSet, minSupport=3):
    """Mine frequent itemsets level by level.

    Args:
        dataSet: collection of transactions (sets); it is scanned once per
            level, so it must be re-iterable.
        minSupport: minimum absolute occurrence count (default 3).

    Returns:
        tuple: (L, supportData). L[i] lists the frequent itemsets of size
        i+1; the last entry of L is always an empty list. supportData maps
        every counted candidate to its occurrence count.
    """
    transactions = dataSet
    frequent_1, supportData = scanD(transactions, createC1(dataSet), minSupport)
    levels = [frequent_1]
    size = 2
    # Keep growing while the most recent level produced any frequent itemset.
    while levels[-1]:
        candidates = aprioriGen(levels[-1], size)
        frequent_k, support_k = scanD(transactions, candidates, minSupport) # scan and filter
        supportData.update(support_k)
        levels.append(frequent_k)
        size += 1
    return levels, supportData

In [11]:
# Compute rule confidence
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate the rules (freqSet - conseq) -> conseq for each conseq in H.

    Only rules whose antecedent has at most one item are evaluated;
    consequents that would leave a larger antecedent are skipped.
    Confidence is supportData[freqSet] / supportData[freqSet - conseq].

    NOTE(review): skipped consequents are not included in the returned list,
    so a caller that extends only the returned consequents never revisits
    them - confirm this is intended for itemsets with four or more items.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict itemset -> support count.
        br1: list that receives (antecedent, consequent, confidence) tuples.
        minConf: minimum confidence for a rule to be kept (default 0.7).

    Returns:
        list: the consequents whose rule reached minConf.
    """
    kept = []
    for conseq in H:
        antecedent = freqSet - conseq
        if len(antecedent) <= 1:
            conf = supportData[freqSet] / supportData[antecedent]
            if conf >= minConf: # threshold filter
                br1.append((antecedent, conseq, conf))
                kept.append(conseq)
    return kept

def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively grow the consequents in H and record the rules that pass.

    The consequents in H (all the size of H[0]) are merged by aprioriGen into
    consequents one item larger, which calcConf then evaluates. The recursion
    continues while more than one consequent survives and freqSet is still
    larger than the next consequent size.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: non-empty list of consequents (frozensets) of equal size.
        supportData: dict itemset -> support count.
        br1: list that receives the accepted rules.
        minConf: minimum confidence (default 0.7).
    """
    next_size = len(H[0]) + 1
    if len(freqSet) <= next_size:
        return
    survivors = calcConf(freqSet, aprioriGen(H, next_size), supportData, br1, minConf)
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, br1, minConf)

def generateRules(L, supportData, minConf=0.8):
    """Derive association rules from the frequent itemsets in L.

    L[0] (single items) is skipped. Itemsets from L[1] are passed straight to
    calcConf with their single-item consequents; itemsets from later levels
    go through rulesFromConseq.

    Args:
        L: list of levels as returned by apriori.
        supportData: dict itemset -> support count.
        minConf: minimum confidence (default 0.8).

    Returns:
        list: (antecedent, consequent, confidence) tuples.
    """
    rules = []
    for level, itemsets in enumerate(L):
        if level == 0:
            continue
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            handler = rulesFromConseq if level > 1 else calcConf
            handler(freqSet, singletons, supportData, rules, minConf)
    return rules

In [12]:
##Expand keywords into word strings using association rules
def keyword_extend(top_n=20):
    """Merge the top-scoring keywords into comma-joined word strings.

    Steps:
      1. Take the `top_n` highest-scoring words of the module-level `value`.
      2. For each post in the module-level `text`, keep the set of those
         keywords it contains; posts with fewer than two are ignored.
      3. Mine association rules over these sets (apriori with minSupport 2,
         generateRules with minConf 0.7) and print them.
      4. For every rule, add the consequent words to the antecedent word's
         group.
      5. A keyword whose group gained no extra word keeps its own score.
         Otherwise the sorted group is joined with ',' and the keyword's
         score is added to that string's total.

    Args:
        top_n: number of top keywords to expand (default 20, the previously
            hard-coded value). If fewer words are available, all of them are
            used instead of raising IndexError.

    Returns:
        dict: keyword or comma-joined word string -> score.
    """
    user_tag = sorted(value.items(),key=lambda x:x[1],reverse=True)
    # Slicing tolerates users with fewer than top_n scored words.
    key = [word for word, _ in user_tag[:top_n]]
    key_set = set(key)  # O(1) membership tests while scanning the posts
    extend = {word: {word} for word in key}  # keyword -> expanded word group

    # One transaction per post: the set of keywords that occur in it.
    data = []
    for words in text.values():
        tmp = {word for word in words if word in key_set}
        if len(tmp) < 2:
            continue
        data.append(tmp)

    ##Association rules via the apriori algorithm
    L,supportData = apriori(data,2) ##L: frequent itemsets
    rule = generateRules(L, supportData, minConf=0.7) ##association rules
    print("关联规则：")
    print(rule)

    for antecedent, consequent, _ in rule:
        # calcConf only records rules whose antecedent has a single item.
        w1 = list(antecedent)[0]
        extend[w1].update(consequent)

    result = {}
    for w1 in key:
        if len(extend[w1]) == 1: ##no word was added to this keyword
            result[w1] = value[w1]
            continue
        ##sort the expanded words and separate them with commas
        st = ','.join(sorted(extend[w1]))
        if st not in result:
            result[st] = value[w1]
        else:
            result[st] += value[w1]
    return result

In [13]:
##Save the result (control group, no word-string expansion)
def save_result():
    """Write the module-level `result` to the plain TextRank tag file.

    Entries are sorted by score in descending order and written one per line
    as '<tag>\t<score>' to
    './user_interest_tag/<uid>_usertag_by_textrank.dat' (`uid` is a
    module-level variable). An existing file is overwritten.
    """
    ranked = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    path = './user_interest_tag/'+ str(uid) + '_usertag_by_textrank.dat'
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(tag + '\t' + str(score) + '\n' for tag, score in ranked)

In [14]:
##Save the result (experimental group, expanded word strings)
def save_result_extend():
    """Write the module-level `result` to the expanded TextRank tag file.

    Entries are sorted by score in descending order and written one per line
    as '<tag>\t<score>' to
    './user_interest_tag/<uid>_usertag_by_textrank_extend.dat' (`uid` is a
    module-level variable). An existing file is overwritten.
    """
    ranked = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    path = './user_interest_tag/'+ str(uid) + '_usertag_by_textrank_extend.dat'
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(tag + '\t' + str(score) + '\n' for tag, score in ranked)

In [17]:
if __name__ == '__main__':
    # Run the whole pipeline once for every user id in the uid file.
    # The helpers take no arguments and read the module-level names assigned
    # below (uid, wid, text, tfidf, value, result), so both the names and the
    # order of these statements matter.
    uid_list = load_uid()
    for uid in uid_list:
        print("用户"+str(uid))
        # load_train compares `uid` with each record's 'userid' field.
        uid = int(uid)
        wid = load_train()  ## load the training set
        text = load_text() ## load the post text
        tfidf = load_tfidf() ## load the TF-IDF factors
        value = textRank() ## compute the TextRank scores
        # save_result reads `result`, so point it at the raw scores first.
        result = value
        save_result() ## save the result, control group
        result = keyword_extend() ## expand the interest tags
        save_result_extend() ## save the result, expanded word strings

用户3927298812
关联规则：
[(frozenset({'callumturner'}), frozenset({'卡'}), 1.0), (frozenset({'卡'}), frozenset({'callumturner'}), 0.75), (frozenset({'地球'}), frozenset({'moss'}), 0.75), (frozenset({'moss'}), frozenset({'地球'}), 0.75), (frozenset({'基'}), frozenset({'锤'}), 1.0), (frozenset({'锤'}), frozenset({'基'}), 1.0), (frozenset({'杰克'}), frozenset({'角色'}), 1.0)]
用户5563654349
关联规则：
[(frozenset({'pang'}), frozenset({'唐人街'}), 0.7857142857142857), (frozenset({'pang'}), frozenset({'陈展鹏'}), 1.0), (frozenset({'陈展鹏'}), frozenset({'pang'}), 0.8235294117647058), (frozenset({'唐人街'}), frozenset({'陈展鹏'}), 0.8235294117647058), (frozenset({'陈展鹏'}), frozenset({'唐人街'}), 0.8235294117647058), (frozenset({'陳展鵬'}), frozenset({'唐人街'}), 0.8), (frozenset({'官'}), frozenset({'屎'}), 1.0), (frozenset({'小狗'}), frozenset({'狗'}), 0.75), (frozenset({'日记'}), frozenset({'汪星'}), 1.0), (frozenset({'汪星'}), frozenset({'日记'}), 0.96), (frozenset({'星'}), frozenset({'時間'}), 0.75), (frozenset({'時間'}), frozenset({'陳展鵬'}), 0.8333333333333

关联规则：
[(frozenset({'上线'}), frozenset({'mv'}), 0.8695652173913043), (frozenset({'故事'}), frozenset({'mv'}), 1.0), (frozenset({'歌'}), frozenset({'mv'}), 0.7142857142857143), (frozenset({'王冠'}), frozenset({'mv'}), 0.9375), (frozenset({'画面'}), frozenset({'mv'}), 1.0), (frozenset({'mv'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'飞鸟'}), frozenset({'mv'}), 1.0), (frozenset({'上线'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'人物'}), frozenset({'环球'}), 1.0), (frozenset({'环球'}), frozenset({'人物'}), 1.0), (frozenset({'人物'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'公益'}), frozenset({'粉丝'}), 0.7333333333333333), (frozenset({'公益'}), frozenset({'能量'}), 0.8), (frozenset({'能量'}), frozenset({'公益'}), 0.8571428571428571), (frozenset({'公益'}), frozenset({'蔡徐坤'}), 0.9333333333333333), (frozenset({'投票'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'画面'}), frozenset({'故事'}), 0.8), (frozenset({'故事'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'数据'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'文艺'}), frozenset({'蔡徐坤'}), 1.0), (frozens

关联规则：
[(frozenset({'hannah'}), frozenset({'昆凌'}), 0.8571428571428571), (frozenset({'专辑'}), frozenset({'周杰伦'}), 0.75), (frozenset({'电影'}), frozenset({'功夫'}), 1.0), (frozenset({'味'}), frozenset({'周杰伦'}), 1.0), (frozenset({'情人节'}), frozenset({'周杰伦'}), 0.75), (frozenset({'拉斯维加斯'}), frozenset({'周杰伦'}), 0.9), (frozenset({'杰伦'}), frozenset({'周杰伦'}), 0.875), (frozenset({'杰倫'}), frozenset({'周杰伦'}), 1.0), (frozenset({'演唱会'}), frozenset({'周杰伦'}), 0.9166666666666666), (frozenset({'粉丝'}), frozenset({'周杰伦'}), 0.75), (frozenset({'美国'}), frozenset({'周杰伦'}), 0.8333333333333334), (frozenset({'拉斯维加斯'}), frozenset({'演唱会'}), 0.8), (frozenset({'美国'}), frozenset({'拉斯维加斯'}), 0.8333333333333334), (frozenset({'美国'}), frozenset({'演唱会'}), 0.8333333333333334), (frozenset({'赫'}), frozenset({'老板'}), 0.7142857142857143), (frozenset({'老板'}), frozenset({'陈赫'}), 1.0), (frozenset({'老板'}), frozenset({'陪伴'}), 1.0), (frozenset({'赫'}), frozenset({'陈赫'}), 1.0), (frozenset({'赫'}), frozenset({'陪伴'}), 1.0), (frozenset({'陈赫'}), f

关联规则：
[(frozenset({'专场'}), frozenset({'事'}), 0.75), (frozenset({'声入'}), frozenset({'人心'}), 1.0), (frozenset({'人心'}), frozenset({'声入'}), 1.0), (frozenset({'孟鹤堂'}), frozenset({'周九良'}), 1.0), (frozenset({'周九良'}), frozenset({'孟鹤堂'}), 1.0), (frozenset({'李'}), frozenset({'演员'}), 1.0)]
用户5676193931
关联规则：
[(frozenset({'cxk'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'ikun'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'大会'}), frozenset({'oppo'}), 1.0), (frozenset({'oppo'}), frozenset({'大会'}), 1.0), (frozenset({'上线'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'舞台'}), frozenset({'作品'}), 0.8333333333333334), (frozenset({'作品'}), frozenset({'蔡徐坤'}), 0.7272727272727273), (frozenset({'徐'}), frozenset({'坤'}), 1.0), (frozenset({'坤'}), frozenset({'徐'}), 1.0), (frozenset({'蔡'}), frozenset({'坤'}), 0.7777777777777778), (frozenset({'坤'}), frozenset({'蔡'}), 1.0), (frozenset({'坤'}), frozenset({'蔡徐坤'}), 1.0), (frozenset({'小米'}), frozenset({'手机'}), 1.0), (frozenset({'蔡'}), frozenset({'徐'}), 0.7777777777777778), (frozense