In [5]:
import numpy as np
import scipy as sp
import random
import scipy.sparse
import scipy.sparse.linalg
import pickle
import sklearn
import sklearn.preprocessing
import webbrowser

In [28]:
# twitter

# Seed for the hold-out split so that Restart & Run All reproduces the same
# validation set (the split previously used the unseeded global `random`).
SPLIT_SEED = 42
# Fraction of labeled nodes of each class whose label is hidden for validation.
HOLDOUT_FRACTION = 0.20

# SECURITY NOTE: unpickling (np.load(..., allow_pickle=True) and pickle.load)
# can execute arbitrary code. Only run this cell on dataset files you trust.
# allow_pickle=True is passed explicitly because NumPy >= 1.16.3 refuses to
# load pickled payloads by default.
with open("ovchinnikov-rutwitterdataset/A.pkl", "rb") as file:
    A = np.load(file, encoding="latin1", allow_pickle=True)

with open("ovchinnikov-rutwitterdataset/labeled_nodes.pkl", "rb") as file:
    d = np.load(file, encoding="latin1", allow_pickle=True)

with open("ovchinnikov-rutwitterdataset/i2t.pkl", "rb") as file:
    i2t = pickle.load(file)

with open("ovchinnikov-rutwitterdataset/t2i.pkl", "rb") as file:
    t2i = pickle.load(file)

# preprocessing

# F: rows of A scaled to unit L1 norm; B: rows of A.T scaled to unit L1 norm.
F = sklearn.preprocessing.normalize(A, axis=1, norm='l1')
B = sklearn.preprocessing.normalize(A.T, axis=1, norm='l1')
# Densify the label vector (presumably a scipy sparse column -- it must expose
# .todense()); negative entries are reported as "not spam", positive as "spam".
d = d.todense()


# Hide HOLDOUT_FRACTION of the labels of each class (set them to 0) so they can
# be used for validation; the remaining labels stay in d for training.
n_n_elements = int(np.count_nonzero(d < 0))
n_p_elements = int(np.count_nonzero(d > 0))

print("Not spam nodes:\t{}".format(n_n_elements))
print("Spam nodes:\t{}".format(n_p_elements))

# Private generator: reproducible without reseeding the global `random` state.
rng = random.Random(SPLIT_SEED)
# Row indices of the held-out negative / positive nodes (count rounded down).
n_indices = rng.sample(np.where(d < 0)[0].tolist(), int(n_n_elements * HOLDOUT_FRACTION))
p_indices = rng.sample(np.where(d > 0)[0].tolist(), int(n_p_elements * HOLDOUT_FRACTION))
d[n_indices] = 0
d[p_indices] = 0

print("Not spam nodes (80%):\t{}".format(int(np.count_nonzero(d < 0))))
print("Spam nodes (80%):\t{}".format(int(np.count_nonzero(d > 0))))

Not spam nodes:	2749
Spam nodes:	375
Not spam nodes (80%):	2475
Spam nodes (80%):	338




In [29]:
# Fixed-point map: the scores we are after satisfy x == T(x).

def T(F, B, d, x, a1=0.7, a2=0.7, a3=0.7):
    """Apply one step of the update map to the score vector ``x``.

    The positive part of ``x`` (negatives clipped to 0) is multiplied by ``F``,
    the negative part (positives clipped to 0) is multiplied by ``B``, and the
    two products are combined with ``d`` using the weights ``a1``, ``a2``, ``a3``.

    Parameters
    ----------
    F, B : objects exposing ``.dot``; applied to the positive and negative
        parts of ``x`` respectively.
    d : array-like added with weight ``a3``.
    x : array exposing ``.clip``; the current iterate.
    a1, a2, a3 : scalar weights of the three terms.

    Returns
    -------
    ``a1 * F.dot(max(x, 0)) + a2 * B.dot(min(x, 0)) + a3 * d``
    """
    positive_part = x.clip(0)
    negative_part = x.clip(-np.inf, 0)

    forward_term = a1 * F.dot(positive_part)
    backward_term = a2 * B.dot(negative_part)

    return forward_term + backward_term + a3 * d

In [32]:
# original RepRank

def repRank(F, B, d, maxiter=200, x0=None, tol=1e-8, cross_validator=None):
    """Iterate ``x <- T(F, B, d, x)`` until the step size drops below ``tol``.

    Parameters
    ----------
    F, B : forwarded unchanged to ``T``.
    d : label vector forwarded to ``T``; a copy of it is the starting iterate
        when ``x0`` is None.
    maxiter : maximum number of applications of ``T``.
    x0 : optional starting iterate (copied, never modified).
    tol : the loop stops as soon as ``np.linalg.norm(x_next - x_prev) < tol``.
    cross_validator : when truthy, print a hold-out score and the step norm on
        every non-converged iteration. NOTE: the score reads the module-level
        ``p_indices`` and ``n_indices``; they must be defined before calling
        with this flag set.

    Returns
    -------
    (iterations, ans) : the number of times ``T`` was applied and a copy of the
        last iterate. With ``maxiter <= 0`` this is ``(0, copy of the start)``.
    """
    x_prev = d.copy() if x0 is None else x0.copy()

    # Defaults for the case where the loop body never runs (maxiter <= 0);
    # without them the function raised UnboundLocalError on x_next / k.
    x_next = x_prev
    k = -1

    for k in range(maxiter):
        x_next = T(F, B, d, x_prev)
        # Computed once and reused for both the stopping test and the log line.
        step_norm = np.linalg.norm(x_next - x_prev)
        if step_norm < tol:
            break

        # cross-validation
        if cross_validator:
            # Share of held-out nodes whose sign in the iterate entering this
            # step (x_prev) matches the hidden label: positive for p_indices,
            # negative for n_indices. This is an accuracy, not an error.
            n_correct = (np.count_nonzero(x_prev[p_indices] > 0)
                         + np.count_nonzero(x_prev[n_indices] < 0))
            val_acc = n_correct / (len(p_indices) + len(n_indices))
            print("c-val: {}\tnorm: {}".format(float(val_acc), step_norm))

        x_prev = x_next

    ans = x_next.copy()
    return (k + 1, ans)

In [33]:
# Run the fixed-point iteration on the label vector with per-iteration hold-out
# logging enabled; `k` receives the iteration count and `ans` the final iterate.
k, ans = repRank(F, B, d, cross_validator=True)

c-val: 0.0	norm: 29.688359202867556
c-val: 0.6559485530546624	norm: 13.620989896495868
c-val: 0.864951768488746	norm: 7.64479775052603
c-val: 0.9067524115755627	norm: 4.933625413250731
c-val: 0.8938906752411575	norm: 3.0672862312070075
c-val: 0.8938906752411575	norm: 2.07552925319266
c-val: 0.8971061093247589	norm: 1.3290035753398364
c-val: 0.8971061093247589	norm: 0.9034091889314265
c-val: 0.9003215434083601	norm: 0.5856189287698789
c-val: 0.9003215434083601	norm: 0.3980555321736085
c-val: 0.9003215434083601	norm: 0.260538139496578
c-val: 0.9003215434083601	norm: 0.17699409755777268
c-val: 0.9003215434083601	norm: 0.11678005232184453
c-val: 0.9003215434083601	norm: 0.079305409318267
c-val: 0.9003215434083601	norm: 0.05267894479263213
c-val: 0.9003215434083601	norm: 0.03577446830958776
c-val: 0.9003215434083601	norm: 0.0238983515676597
c-val: 0.9003215434083601	norm: 0.016235423278195365
c-val: 0.9003215434083601	norm: 0.010897729827797067
c-val: 0.9003215434083601	norm: 0.007408328544

In [16]:
print(k)

# Node indices ordered by descending score (argsort is ascending, so reverse it).
ranking = np.array(ans.argsort(axis=0)[::-1])[:, 0].tolist()

# Open the three highest-scored accounts first, then the three lowest-scored,
# translating each node index to an account id through i2t.
for node in ranking[:3] + ranking[-3:]:
    account_id = i2t.get(node)
    webbrowser.open_new_tab("https://twitter.com/intent/user?user_id={}".format(str(account_id)))

57
