In [100]:
import numpy as np
import scipy as sp
import random
import scipy.sparse
import scipy.sparse.linalg
import pickle
import sklearn
import sklearn.preprocessing
import webbrowser

In [234]:
# twitter

# Share of the labeled nodes of each class that is hidden from the solver and
# kept aside for validation.
HOLDOUT_FRACTION = 0.20
# Fixed seed so that the hold-out split (and every score printed further down)
# is the same after "Restart Kernel -> Run All".
SPLIT_SEED = 0

# NOTE(security): these files are unpickled, which can execute arbitrary code.
# Only load dataset files obtained from a trusted source.
# allow_pickle=True is passed explicitly because current NumPy releases refuse
# to unpickle inside np.load unless asked to; encoding="latin1" is kept from
# the original call.
with open("ovchinnikov-rutwitterdataset/A.pkl", "rb") as file:
    A = np.load(file, encoding="latin1", allow_pickle=True)

with open("ovchinnikov-rutwitterdataset/labeled_nodes.pkl", "rb") as file:
    d = np.load(file, encoding="latin1", allow_pickle=True)

with open("ovchinnikov-rutwitterdataset/i2t.pkl", "rb") as file:
    i2t = pickle.load(file)

with open("ovchinnikov-rutwitterdataset/t2i.pkl", "rb") as file:
    t2i = pickle.load(file)

# preprocessing

# Row-wise L1 normalization of A and of its transpose.
F = sklearn.preprocessing.normalize(A, axis=1, norm='l1')
B = sklearn.preprocessing.normalize(A.T, axis=1, norm='l1')
# Densify the labels so boolean masks and row assignment below work on them.
d = d.todense()


# Hide HOLDOUT_FRACTION of the labels of each class (set them to 0) and keep
# their row indices in n_indices / p_indices for validation; the remaining
# labels stay in d.
n_n_elements = d[d < 0].shape[1]
n_p_elements = d[d > 0].shape[1]

print("Not spam nodes:\t{}".format(n_n_elements))
print("Spam nodes:\t{}".format(n_p_elements))

# A private generator keeps the split reproducible without touching the state
# of the global `random` module.
split_rng = random.Random(SPLIT_SEED)
n_indices = split_rng.sample(np.where(d < 0)[0].tolist(), int(n_n_elements * HOLDOUT_FRACTION))  # held-out negative rows
p_indices = split_rng.sample(np.where(d > 0)[0].tolist(), int(n_p_elements * HOLDOUT_FRACTION))  # held-out positive rows
d[n_indices] = 0
d[p_indices] = 0

print("Not spam nodes (80%):\t{}".format(d[d < 0].shape[1]))
print("Spam nodes (80%):\t{}".format(d[d > 0].shape[1]))

Not spam nodes:	2749
Spam nodes:	375
Not spam nodes (80%):	2200
Spam nodes (80%):	300




In [13]:
# we are looking for a fixed point of T, i.e. a solution of x == T(x)

def T(F, B, d, x, a1=0.7, a2=0.7, a3=0.7):
    """Apply one step of the update map whose fixed point ``x == T(x)`` is sought.

    The non-negative part of ``x`` is multiplied by ``F``, the non-positive
    part by ``B``, and the two products are combined with ``d``:

        a1 * F.dot(max(x, 0)) + a2 * B.dot(min(x, 0)) + a3 * d

    Parameters
    ----------
    F, B : objects exposing ``.dot``
        Operators applied to the positive and negative parts of ``x``.
    d : array-like
        Constant term, scaled by ``a3``.
    x : object exposing ``.clip``
        Current iterate.
    a1, a2, a3 : float
        Weights of the three terms.

    Returns
    -------
    The next iterate.
    """
    positive_part = x.clip(0)
    negative_part = x.clip(-np.inf, 0)

    forward_term = a1 * F.dot(positive_part)
    backward_term = a2 * B.dot(negative_part)

    # Same left-to-right summation order as a single expression would use.
    return forward_term + backward_term + a3 * d

In [280]:
# original RepRank

def repRank(F, B, d, maxiter=200, x0=None, tol=1e-8, cross_validator=None):
    """Iterate ``x <- T(F, B, d, x)`` until two successive iterates are closer than ``tol``.

    Parameters
    ----------
    F, B, d :
        Passed unchanged to ``T`` on every iteration. ``d`` is also the
        starting point when ``x0`` is None, so it must support ``.copy()``.
    maxiter : int
        Maximum number of applications of ``T``.
    x0 : optional
        Starting iterate; copied, never modified.
    tol : float
        Stop as soon as ``np.linalg.norm(x_next - x_prev) < tol``.
    cross_validator :
        Controls the hold-out report printed after every non-converged
        iteration:

        * falsy (default) -- nothing is printed;
        * a 2-item tuple/list ``(pos_indices, neg_indices)`` -- those held-out
          index collections are used;
        * any other truthy value (e.g. ``True``) -- legacy behaviour: the
          module-level ``p_indices`` / ``n_indices`` are used.

        The printed number is the fraction of held-out nodes whose score has
        the expected sign (> 0 for ``pos_indices``, < 0 for ``neg_indices``),
        i.e. an accuracy, not an error.

    Returns
    -------
    (n_iter, ans) : tuple
        ``n_iter`` is the number of times ``T`` was applied and ``ans`` is a
        copy of the last iterate. With ``maxiter <= 0`` this is
        ``(0, copy of the starting point)``.
    """
    x_prev = d.copy() if x0 is None else x0.copy()

    # Resolve the hold-out indices once, up front.
    holdout = None
    if isinstance(cross_validator, (tuple, list)) and len(cross_validator) == 2:
        holdout = cross_validator
    elif cross_validator:
        holdout = (p_indices, n_indices)  # legacy: notebook-level split

    # Defined before the loop so that maxiter <= 0 returns the starting point
    # instead of failing on an unbound name.
    n_iter = 0
    x_next = x_prev

    for k in range(maxiter):
        x_next = T(F, B, d, x_prev)
        n_iter = k + 1
        if np.linalg.norm(x_next - x_prev) < tol:
            break
        x_prev = x_next

        # cross-validation
        if holdout is not None:
            pos_idx, neg_idx = holdout
            n_held = len(pos_idx) + len(neg_idx)
            if n_held:  # nothing to score (and nothing to divide by) otherwise
                n_correct = (np.count_nonzero(x_prev[pos_idx] > 0)
                             + np.count_nonzero(x_prev[neg_idx] < 0))
                val_acc = n_correct / n_held
                print(float(val_acc))

    return (n_iter, x_next.copy())

In [281]:
# Run the iteration on d, whose held-out entries were zeroed above;
# cross_validator=True makes repRank print a hold-out score while it iterates.
# k receives the iteration count and ans the final score vector.
k, ans = repRank(F, B, d, cross_validator=True)

0.6746794871794872
0.8541666666666666
0.8862179487179487
0.8894230769230769
0.8878205128205128
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769230769
0.8894230769

In [16]:
# Iteration count returned by repRank.
print(k)

# Row indices sorted by descending score, as a flat Python list.
ranked_rows = np.array(ans.argsort(axis=0)[::-1])[:, 0].tolist()

# Open the profiles of the three highest-scored rows first, then of the three
# lowest-scored ones; row indices are translated to user ids through i2t.
for rows in (ranked_rows[:3], ranked_rows[-3:]):
    for user_id in [i2t.get(row) for row in rows]:
        webbrowser.open_new_tab("https://twitter.com/intent/user?user_id={}".format(str(user_id)))

57


In [None]:
import xgboost as xgb

# Load the training and test matrices from paths relative to the working directory.
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')

# Booster parameters, passed to xgb.train as a plain dict.
# NOTE(review): 'silent' looks like a legacy quiet flag -- confirm it is still
# accepted by the installed xgboost version.
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 2

# Train for num_round boosting rounds.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

# Predict on the test matrix.
preds = bst.predict(dtest)