In [1]:
import numpy as np

import scipy as sp
import scipy.sparse
import scipy.sparse.linalg

import sklearn
import sklearn.preprocessing

import matplotlib
import matplotlib.pyplot as plt

import pickle

In [2]:
# twitter dataset

with open("ovchinnikov-rutwitterdataset/A.pkl", "rb") as file:
    A = np.load(file, encoding="latin1")

with open("ovchinnikov-rutwitterdataset/labeled_nodes.pkl", "rb") as file:
    labels = np.load(file, encoding="latin1")

with open("ovchinnikov-rutwitterdataset/i2t.pkl", "rb") as file:
    i2t = pickle.load(file)

with open("ovchinnikov-rutwitterdataset/t2i.pkl", "rb") as file:
    t2i = pickle.load(file)

In [3]:
# preprocessing

F = sklearn.preprocessing.normalize(A, axis=1, norm='l1')
B = sklearn.preprocessing.normalize(A.T, axis=1, norm='l1')
d = np.array(labels.todense())[:, 0]



In [4]:
# we are looking for solution x == T(x)

def T(F, B, d, x, a1=0.7, a2=0.7, a3=0.7):
    return a1 * F.dot(x.clip(0)) + a2 * B.dot(x.clip(-np.inf, 0)) + a3 * d

In [48]:
x_prev = d.astype(np.int8)
x_next = x_prev.copy()

In [49]:
def h(x):
    ans = np.zeros_like(x)
    ans[x > 0] = 1
    return ans

In [50]:
F_csc = F.tocsc()
B_csc = B.tocsc()

In [51]:
def der(x):
    l = x - T(F, B, d, x)
    lF = F_csc.T.dot(l)
    lB = B_csc.T.dot(l)
    return l - np.multiply(lF, h(x)) - np.multiply(lB, h(-x))

In [35]:
for k in range(100):
    dx = der(x_prev)
    for i in range(len(x_next)):
        if d[i] == 0 and dx[i] != 0:
            if x_next[i] < 1 and dx[i] > 0:
                x_next[i] += 1
            if x_next[i] > -1 and dx[i] < 0:
                x_next[i] -= 1
    print(k, np.linalg.norm(x_next - x_prev, ord=1))
    if np.linalg.norm(x_next - x_prev, ord=1) == 0:
        break
    x_prev = x_next.copy()

0 16524.0
1 131289.0
2 102646.0
3 53347.0
4 16466.0
5 7303.0
6 4319.0
7 3338.0
8 2868.0
9 2625.0
10 2589.0
11 2438.0
12 2415.0
13 2389.0
14 2384.0
15 2379.0
16 2379.0
17 2379.0
18 2379.0
19 2379.0
20 2379.0
21 2379.0
22 2379.0


KeyboardInterrupt: 

In [42]:
(x_prev == 0).sum()

20196

In [45]:
np.linalg.norm(ans - T(F, B, d, ans))

5.0822908191887304e-09

In [47]:
np.linalg.norm(der(ans))

4.6968612875355639e-09

In [57]:
for k in range(100):
    x_next = x_prev - der(x_prev)
    x_prev = x_next.copy()
    if np.linalg.norm(der(x_prev)) < 1e-5:
        break
    print(k, np.linalg.norm(der(x_prev)), np.linalg.norm(x_next - T(F, B, d, x_next)))

0 11.7849339698 8.20545185553
1 14.4018524787 15.6038223142
2 64.2822112135 16.0489443913
3 69.9685429838 63.7108830105
4 156.611622573 49.1424491042
5 1120.33359962 296.868823711
6 11681.5271003 2541.41688694
7 91147.1925577 25132.7998683
8 1432782.12907 252412.024622
9 9758317.01383 2981002.24464
10 199618017.421 31811470.854
11 1271245576.61 408321432.198
12 28159631011.9 4424666653.43
13 178722969417.0 58053268061.5
14 4.12988912687e+12 636627682918.0
15 2.5770160934e+13 8.42696747741e+12
16 5.91225056091e+14 9.20057296144e+13
17 3.72715624764e+15 1.21680452304e+15
18 8.64586600918e+16 1.33501442135e+16
19 5.40755255312e+17 1.76902174242e+17
20 1.24768898752e+19 1.9353294567e+18
21 7.84081716154e+19 2.56219422017e+19
22 1.82136645291e+21 2.80954400874e+20


KeyboardInterrupt: 

In [54]:
np.linalg.norm(x_next - T(F, B, d, x_next))

9.5767795408787517

# NLA below!

In [36]:
# original RepRank

def RepRank(F, B, d, maxiter=200, x0=None, tol=1e-8, callback=None):
    if x0 is None:
        x_prev = d.copy()
    else:
        x_prev = x0.copy()

    for k in range(maxiter):
        x_next = T(F, B, d, x_prev)
        n = np.linalg.norm(x_next - x_prev)
        if callback is not None:
            callback(x_next, n)
        if n < tol:
            break
        x_prev = x_next

    ans = x_next.copy()
    return(k + 1, ans.reshape(-1,))

In [37]:
# original RepRank answer

k, ans = RepRank(F, B, d)

In [40]:
ans_ = np.zeros_like(ans)
ans_[ans > 0] = 1
ans_[ans < 0] = -1

In [41]:
np.linalg.norm(ans_ - x_prev, ord=1)

372500.0

In [46]:
k

57