In [1]:
import numpy as np
from sklearn.svm import LinearSVC 

In [2]:
def loadData(dir="./data/", university="cornell"):
    fcontent = open(dir+"content/"+university+".content", "r")
    content = {}
    labels = set()
    for l in fcontent.readlines():
        attributes = l.split('\t')
        label = attributes[-1][:-1]
        labels.add(label)
        content[attributes[0]] = {}
        content[attributes[0]]['attributes'] = attributes[1:-1]
        content[attributes[0]]['label'] = label
        content[attributes[0]]['successors'] = set()
        content[attributes[0]]['predecessors'] = set()
        content[attributes[0]]['neighbours'] = set()
        content[attributes[0]]['unknown'] = False
    fcontent.close()
    fcites = open(dir+"cites/"+university+".cites", "r")
    for l in fcites.readlines():
        sites = l[:-1].split(' ')
        content[sites[0]]['successors'].add(sites[1])
        content[sites[1]]['predecessors'].add(sites[0])
        content[sites[0]]['neighbours'].add(sites[1])
        content[sites[1]]['neighbours'].add(sites[0])
    return content, list(labels)

In [9]:
def vectorize(graph, unknownList, classes, pooling='sum', direction='successors'):
    nNodes = len(graph.keys())
    nAtt = len(graph[graph.keys()[0]]['attributes'])
    nClass = len(classes)
    nUnknown = len(unknownList)
    representation = {}
    representation['unknown'] = np.zeros((nUnknown, nAtt + nClass))
    representation['known'] = np.zeros((nNodes - nUnknown, nAtt + nClass))
    labels = {}
    labels['unknown'] = np.zeros(nUnknown)
    labels['known'] = np.zeros(nNodes - nUnknown)
    i = {}
    i['unknown'] = 0
    i['known'] = 0
    for node in graph:
        if (node in unknownList):
            k = 'unknown'
        else:
            k = 'known'
        print "1", representation[k]
        print i[k]
        print "2", representation[k][i[k]]
        print "3", representation[k][i[k],:nAtt]
        representation[k][i[k],:nAtt] = graph[node]['attributes']
        labels[k][i[k]] = classes.index(graph[node]['label'])
        nKnownSuccessors = 0
        for s in graph[node][direction]:
            if (not graph[s] in unknownList):
                nKnownSuccessors += 1
                labelIndex = classes.index(graph[s]['label'])
                if (pooling == 'sum' or pooling == 'avg'):
                    representation[k][i[k],nAtt + labelIndex] += 1
                if (pooling == 'max'):
                    representation[k][i[k],nAtt + labelIndex] = 1
        if (pooling == 'avg' and nKnownSuccessors > 0):
            representation[k][i[k],nAtt:] /= nKnownSuccessors
        i[k] += 1
    return representation, labels




In [11]:
graph, classes = loadData()
pUnknown = .10

unknown = np.random.choice(graph.keys(), int(len(graph.keys()) * pUnknown))

print len(unknown)

data, label = vectorize(graph, unknown, classes, 'sum')

19
1 [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
0
2 [ 0.  0.  0. ...,  0.  0.  0.]
3 [ 0.  0.  0. ...,  0.  0.  0.]
1 [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
1
2 [ 0.  0.  0. ...,  0.  0.  0.]
3 [ 0.  0.  0. ...,  0.  0.  0.]
1 [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
2
2 [ 0.  0.  0. ...,  0.  0.  0.]
3 [ 0.  0.  0. ...,  0.  0.  0.]
1 [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,

IndexError: index 176 is out of bounds for axis 0 with size 176

In [14]:
 len(graph.keys())

195

In [None]:
label

In [None]:
clf = LinearSVC(C=1)
clf.fit(data['known'], label['known'])

In [None]:
clf.score(data['unknown'], label['unknown'])