# simrank原生迭代算法

In [1]:
import numpy as np
import time

## 数据预处理

In [2]:
def getGraph(edge_num):
    with open('test/web-Stanford.txt') as log_fp:
        logs = []
        for i in range(edge_num):
            log = log_fp.readline()
            logs.append(log.strip())    #strip是去掉空格和换行
    
    #from to元祖
    logs_tuple = [ tuple(log.split('\t')) for log in logs ]
    
    #获取from集合和to集合
    from_ = list(set([ log[0] for log in logs_tuple ]))
    to_ = list(set( [log[1] for log in logs_tuple ]))
    
    #顶点集合
    vertex = list(set(from_ + to_))
    
    #图结构邻接矩阵表示
    #图的所有节点都放进去
    #即A矩阵
    graph = np.matrix(np.zeros([len(vertex), len(vertex)]))

    #给图邻接矩阵添加边
    #有向图，非对称
    for log in logs_tuple:
        i = vertex.index(log[0])
        j = vertex.index(log[1])
        graph[i, j] = 1   #行为出度，列为入度
        
    return graph, vertex

## simrank

In [46]:
graph, vertex = getGraph(200)

In [47]:
print(len(vertex))
graph

173


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
#初始化相似度迭代矩阵
sim_mat = np.matrix(np.identity(len(vertex)))
sim_mat

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]])

In [18]:
#计算出度和
def getOutgoingSum(a):
    return graph[a].sum()

#计算入度和
def getInputSum(a):
    return graph[:, a].sum()

#获取a的入度节点
def getInputNode(a):
    index = []
    mat_col = graph[:, a]
    for j in range(len(vertex)):
        if mat_col[j] != 0:
            index.append(j)
    return index

#获取a的出度节点
def getOutgoingNode(a):
    index = []
    for i in range(len(vertex)):
        if graph[a, i] != 0:
            index.append(i)
    return index

In [8]:
# 两个节点相似度计算
def note_sim(a, b, C):
    if a == b:
        return 1
    Ia = getInputSum(a)
    Ib = getInputSum(b)
    if Ia == 0 or Ib == 0:
        return 0
    prefix = C / (Ia * Ib)
    postfix = 0
    for ai in getInputNode(a):
        for bj in getInputNode(b):
            postfix += sim_mat[ai, bj]
    return prefix * postfix

In [16]:
# simrank算法
def simrank(C = 0.8, times = 5):
    global sim_mat
    
    start = time.process_time()
    for run in range(times):
        new_note_sim = np.matrix(np.identity(len(vertex)))
        for ai in range(len(vertex)):
            for bj in range(len(vertex)):
                new_note_sim[ai, bj] = note_sim(ai, bj, C)
        sim_mat = new_note_sim
    end = time.process_time()
    print(end-start)

In [None]:
simrank()
sim_mat