In [1]:
import time

import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler

import networkx as nx
# from networkx.readwrite import json_graph

import pickle as pkl

import os
import json

def load_reddit_data():
    
    G = nx.read_edgelist("../data/reddit/reddit_G.edg", delimiter=" ", data=True)
    nx.set_edge_attributes(G=G, name="weight", values=1)

    feats = np.load("../data/reddit/reddit-feats.npy")
    id_map = json.load(open("../data/reddit/reddit-id_map.json"))
    conversion = lambda n : n
    id_map = {conversion(k):int(v) for k,v in id_map.items()}
    
    G = nx.relabel_nodes(G, id_map, )

    with open("../data/reddit/train_nodes", "rb") as f:
        train_nodes = pkl.load(f)
    with open("../data/reddit/val_nodes", "rb") as f:
        val_nodes = pkl.load(f)
    with open("../data/reddit/test_nodes", "rb") as f:
        test_nodes = pkl.load(f)
        
    train_idx = [id_map[n] for n in train_nodes]
    val_idx = [id_map[n] for n in val_nodes]
    test_idx = [id_map[n] for n in test_nodes]
    
    # normalize by training data
    train_feats = feats[train_idx]
    scaler = StandardScaler()
    scaler.fit(train_feats)
    feats = scaler.transform(feats)
    
    X = feats
    
    class_map = json.load(open("../data/reddit/reddit-class_map.json"))
    if isinstance(list(class_map.values())[0], list):
        lab_conversion = lambda n : n
    else:
        lab_conversion = lambda n : int(n)
    class_map = {conversion(k):lab_conversion(v) for k,v in class_map.items()}
    
    num_classes = max(class_map.values())
    class_map = {id_map[k]: v for k, v in class_map.items()}
    
    Y = csr_matrix(([1] * len(class_map), (class_map.keys(), class_map.values())), shape=(len(class_map), num_classes))
        
    train_G = G.subgraph(train_idx)
    val_G = G.subgraph(train_idx + val_idx)
    test_G = G

    return train_G, val_G, test_G, X, Y, train_idx, val_idx, test_idx 

In [2]:
import pickle as pkl

In [17]:
with open("../positive_samples/cora/positive_samples.pkl", "rb") as f:
    pos_samples = pkl.load(f)

In [20]:
len(pos_samples)

3249600

In [7]:
with open("../positive_samples/cora/positive_samples", "w") as f:
    for pos_sample in pos_samples:
        f.write("{} {} ".format(pos_sample[0], pos_sample[1]))

In [8]:
l = [1, 2, 3, 4, 5, 6]

In [9]:
zip(l[::2], l[1::2])

[(1, 2), (3, 4), (5, 6)]

In [15]:
with open("../positive_samples/cora/positive_samples", "r") as f:
    l = f.readline()
    split = [int(n) for n in l.rstrip().split(" ")]
    positive_samples = zip(split[::2], split[1::2]) 

In [19]:
len(positive_samples)

3249600

In [12]:
with open("../negative_samples/AstroPh/negative_samples", "w") as f:
    for k in neg_samples:
        f.write("{} ".format(k) + " ".join(str(v) for v in neg_samples[k]) + "\n")

In [1]:
10

10

In [4]:
neg_samples = {}
with open("../negative_samples/AstroPh/negative_samples", "r") as f:
    for l in f.readlines()[:1]:
        split = l.split(" ")
        print int(split[-1])
        neg_samples.update({int(split[0]) : [int(n) for n in split[1:]]})

18771


In [5]:
import pickle as pkl

In [6]:
with open("../walks/cora/walks-10-15.pkl", "rb") as f:
    walks = pkl.load(f)

In [8]:
walks[0]

[2205,
 788,
 1952,
 1031,
 1952,
 1031,
 788,
 1952,
 2206,
 48,
 2206,
 48,
 2206,
 1952,
 1951]

In [9]:
l = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

In [10]:
[l[i:i+5] for i in range(0,len(l), 5)]

[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]

In [5]:
neg_samples[0]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 18

In [6]:
1.9 / 2.6

0.7307692307692307

In [5]:
import pandas as pd

In [12]:
" ".join(neg_samples[0])

TypeError: sequence item 0: expected string, int found

In [8]:
for k in neg_samples:
    print "{} ".format(k) + " ".join(str(v) for v in neg_samples[k])
    print

0 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18 19 20 21 22 23 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 144 145 146 147 148 149 150 151 152 153 154 155 156 159 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 198 199 200 202 203 204 206 207 208 209 210 211 212 213 214 215 217 218 219 220 221 222 223 224 225 226 227 228 229 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 29

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [2]:
G = nx.read_edgelist("../data/wordnet/noun_closure.tsv", delimiter="\t")

In [3]:
len(G)

82115

In [6]:
import pandas as pd

In [39]:
?pd.read_csv

In [35]:
!head ../data/wordnet/hyperlex-all.txt

WORD1 WORD2 POS TYPE AVG_SCORE AVG_SCORE_0_10 STD SCORES..
conflict disagreement N r-hyp-1 5.20 8.67 1.25 5 6 6 6 6 5 2 6 4 6
advance take V no-rel 1.42 2.37 1.66 3 0 3 0 0 5 0 0 1 2 3 0
trail follow V hyp-2 4.31 7.18 1.86 4 4 6 2 6 6 6 3 0 4 3 6 6
mason worker N hyp-3 4.50 7.5 1.76 6 6 4 4 6 3 6 3 5 0 5 6
aura light N hyp-1 3.69 6.15 1.86 4 5 3 4 6 0 3 6 3 0 4 5 5
radish carrot N cohyp 0.09 0.15 0.29 0 0 0 0 0 0 0 0 1 0 0
tusk duty N no-rel 0.08 0.13 0.28 0 0 0 0 0 0 0 0 1 0 0 0
vehicle motorcycle N r-hyp-4 1.09 1.82 1.93 1 0 0 1 0 0 0 0 4 6 0
veal meat N hyp-1 5.86 9.77 0.35 6 6 6 6 5 6 5 6 6 6 6 6 6 6


In [37]:
X = pd.read_csv("../data/wordnet/hyperlex-all.txt", usecols=["WORD1", "WORD2", "AVG_SCORE_0_10"], sep=" ", header=0, )

In [63]:
lines = []
with open("../data/wordnet/hyperlex-all.txt", "r",) as f:
    for line in f.readlines():
        split = line.rstrip().split(" ")
        lines.append([split[0], split[1], split[5]])

In [65]:
headers = lines.pop(0)
X = pd.DataFrame(lines, columns=headers)

In [66]:
X

Unnamed: 0,WORD1,WORD2,AVG_SCORE_0_10
0,conflict,disagreement,8.67
1,advance,take,2.37
2,trail,follow,7.18
3,mason,worker,7.5
4,aura,light,6.15
5,radish,carrot,0.15
6,tusk,duty,0.13
7,vehicle,motorcycle,1.82
8,veal,meat,9.77
9,keyboard,arm,0.9


In [69]:
len(set(G.nodes))

82115

In [9]:
H = nx.read_edgelist("../data/collaboration_networks/ca-GrQc.txt.gz")

In [10]:
len(H)

5242

In [85]:
nodes = {n.split(".")[0]: i for i, n in enumerate(G.nodes()) if n.split(".")[2] == "01"}

In [86]:
nodes

{u'grand_larceny': 47674,
 u'european_creeper': 79054,
 u'rilke': 80780,
 u'homomorphism': 21320,
 u'melosa': 19237,
 u'mustachio': 19234,
 u'colonoscopy': 51686,
 u'utnapishtim': 2336,
 u'long_wave': 76047,
 u'circuitry': 4224,
 u'watt-hour': 65795,
 u'fabianism': 32129,
 u'cheiranthus': 21301,
 u'manganese_steel': 62544,
 u'cardiospermum': 44120,
 u'bladder_disorder': 78902,
 u'ill_nature': 56548,
 u'positive_pole': 39472,
 u'tupaia': 68043,
 u'taconic_mountains': 72858,
 u'scold': 56040,
 u'suicide_pill': 51425,
 u'cycling': 51364,
 u'originality': 72353,
 u'ringworm_bush': 12558,
 u'trial_and_error': 58851,
 u'pretermission': 55135,
 u'tailstock': 18764,
 u'range_pole': 12561,
 u'crossbar': 1870,
 u'lynch_law': 8,
 u'jagannath': 10975,
 u'porphyrio': 8008,
 u'peacekeeping': 44162,
 u'rosidae': 12415,
 u'cytochrome': 25482,
 u'bioclimatology': 77598,
 u'heliothis': 35428,
 u'lappland': 54829,
 u'mecopteran': 28264,
 u'fourteenth_amendment': 10543,
 u'bratislava': 5775,
 u'bregma': 3

In [91]:
id_ranks = []
for i, (w1, w2, score) in X.iterrows():
    if w1 in nodes and w2 in nodes:
        id_ranks.append([nodes[w1], nodes[w2], score])
        print nodes[w1], nodes[w2], score

73545 15798 8.67
25684 44156 7.5
59527 51195 6.15
33201 44338 0.15
78057 54618 1.82
50407 24571 9.77
12119 39102 0.9
51282 27633 3.05
55527 5438 0.0
75079 60852 6.5
38091 31013 3.2
301 54189 1.67
74221 32407 0.0
48984 70881 0.33
26621 21355 0.52
31986 38431 7.97
22193 25585 9.87
39626 44409 8.83
57640 25585 9.83
79924 19454 0.25
64120 2881 7.73
62659 80390 5.77
70951 29785 2.82
55715 3166 6.53
3179 7683 4.28
2465 42692 9.17
69800 48941 4.48
72420 14364 0.52
81063 11954 2.73
13571 28001 8.83
48376 80298 9.72
51043 46300 3.93
38563 16336 1.37
72743 49519 2.0
44156 11655 1.67
20458 81063 9.5
7862 1167 9.55
1167 76627 7.78
5707 25585 10.0
21991 27319 8.33
28690 39656 8.93
63359 58355 7.95
25887 44156 9.1
41059 66122 2.73
19685 13034 0.55
47902 69814 6.15
67086 11496 2.73
81302 49015 10.0
52843 3241 9.83
35607 59799 5.33
35997 31188 4.0
64997 25585 8.75
73311 19802 6.67
12119 76863 7.08
81063 34698 1.28
32201 76988 5.67
38864 31843 0.5
3120 39782 1.52
54196 8881 0.97
24539 46630 8.88
41500 

50673 46555 5.42
77717 25585 10.0
40081 72964 6.67
33952 60929 4.03
13280 45718 9.75
23725 44779 5.7
64609 21202 7.42
19286 46128 9.72
38210 41085 4.17
35904 36379 2.0
28690 57692 0.83
59349 34712 0.6
38378 25585 10.0
76317 39742 0.9
62891 38768 9.5
12691 65456 0.7
81507 23983 5.7
12862 56923 8.85
20519 38955 4.62
7499 50879 6.28
15421 47653 3.85
53250 40258 2.37
37537 70111 8.08
78057 72165 0.95
8800 33282 5.63
8329 81103 6.12
73545 35160 6.12
4985 5168 0.6
71310 53004 9.87
12594 56547 0.13
48676 56408 1.67
67382 52994 0.67
4536 61444 0.6
33498 33398 0.25
32608 52292 0.6
76088 52873 9.33
22492 49124 4.75
34627 81946 0.67
3871 23983 0.3
72290 42038 2.57
62906 31881 9.67
15089 52221 8.93
25811 23031 1.67
23494 45372 1.38
22320 46772 6.97
15254 13250 1.33
70399 13318 7.78
11952 42818 9.45
44142 39782 4.55
47154 44900 7.63
14723 81284 8.83
8964 47640 6.52
29059 54042 7.3
48927 26537 9.37
14471 48938 5.73
50384 13250 0.25
47869 11550 10.0
16208 12523 9.85
48147 70000 3.33
81284 24839 1.42


In [96]:
Y = pd.DataFrame(id_ranks, columns=["WORD1", "WORD2", "AVG_SCORE_0_10"])

In [98]:
Y.to_csv("../data/wordnet/hyperlex_idx_ranks.txt", sep=" ")

In [100]:
Y = pd.read_csv("../data/wordnet/hyperlex_idx_ranks.txt", sep=" ", index_col=0)

In [101]:
Y

Unnamed: 0,WORD1,WORD2,AVG_SCORE_0_10
0,73545,15798,8.67
1,25684,44156,7.50
2,59527,51195,6.15
3,33201,44338,0.15
4,78057,54618,1.82
5,50407,24571,9.77
6,12119,39102,0.90
7,51282,27633,3.05
8,55527,5438,0.00
9,75079,60852,6.50


In [4]:
list(G.nodes())[15798]

u'disagreement.n.01'

In [90]:
!head ../data/wordnet/hyperlex_idx_ranks.txt

 WORD1 WORD2 AVG_SCORE_0_10
0 23087.0 29089.0 8.67
1 218.0 19466.0 7.5
2 22444.0 28214.0 6.15
3 29270.0 18622.0 0.15
4 22086.0 27245.0 0.13
5 32821.0 30562.0 1.82
6 20154.0 24027.0 9.77
7 14394.0 37449.0 0.9
8 25724.0 7651.0 0.0


In [88]:
list(G.nodes())[73545]

u'conflict.n.01'

In [89]:
list(G.nodes())[15798]

u'disagreement.n.01'

In [80]:
nodes[72741]

u'eye'

In [81]:
nodes[72742]

u'eye'

In [72]:
set([n.split(".")[0] for n in G.nodes])

{u'fawn',
 u'european_creeper',
 u'rilke',
 u'homomorphism',
 u'melosa',
 u'mustachio',
 u'colonoscopy',
 u'utnapishtim',
 u'long_wave',
 u'circuitry',
 u'watt-hour',
 u'fabianism',
 u'cheiranthus',
 u'manganese_steel',
 u'cardiospermum',
 u'bladder_disorder',
 u'ill_nature',
 u'coarctation',
 u'positive_pole',
 u'chatter',
 u'taconic_mountains',
 u'scold',
 u'suicide_pill',
 u'cycling',
 u'originality',
 u'ringworm_bush',
 u'trial_and_error',
 u'pretermission',
 u'tailstock',
 u'range_pole',
 u'crossbar',
 u'lynch_law',
 u'jagannath',
 u'porphyrio',
 u'peacekeeping',
 u'rosidae',
 u'cytochrome',
 u'bioclimatology',
 u'heliothis',
 u'lappland',
 u'mecopteran',
 u'fourteenth_amendment',
 u'bratislava',
 u'bregma',
 u'appropriation',
 u'rawhide',
 u'quotability',
 u'ploceidae',
 u'fractal',
 u'foredeck',
 u'fibrinase',
 u'lithiasis',
 u'simple_interest',
 u'palatal',
 u'broiler',
 u'green_bay',
 u'razor_edge',
 u'herring_salad',
 u'wednesday',
 u'phone-in',
 u'decoction_process',
 u'iron

In [67]:
X.shape

(2616, 3)

In [7]:
row_idx = [1,2,3]
col_idx = [4,5,6]

data = [1]*3

Y  =csr_matrix((data, (row_idx, col_idx)), shape=(10, 10))

In [10]:
a = np.arange(10)

In [11]:
a.flatten()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
Y[[1,3,5]].argmax(axis=1).A1

array([4, 6, 0])

In [13]:
list(np.array([1,2.3]))

[1.0, 2.3]

In [None]:
train_G, val_G, test_G, X, Y, train_idx, val_idx, test_idx = load_reddit_data()