In [67]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from pathlib import Path
import pandas as pd
import networkx as nx
from bidict import bidict
import types
%matplotlib inline

In [2]:
pathContent = Path("./WebKB/content/")
pathCites = Path("./WebKB/cites/")
contentPath = {pC.stem : pC for pC in pathContent.glob("*.content")}
dataPath = {k : (v, (pathCites / (k+'.cites'))) for k,v in contentPath.items() if (pathCites / (k+'.cites')).is_file()}

In [3]:
def readContent(path):
    return pd.read_csv(path, header=None, delim_whitespace=True).rename(columns={0:"node",1704:"class"})
def readCites(path):
    return pd.read_csv(path, names=["source",'target'], delim_whitespace=True)

In [4]:
data = {k : (readContent(vc), nx.DiGraph(readCites(vci))) for k,(vc,vci) in dataPath.items()}

In [68]:
def trainer(self, x, y, ite = 500, batchSize = 100):
    ml = torch.nn.CrossEntropyLoss()
    opt = optim.SGD(self.parameters(), lr=1e-3)
    
    for i in range(ite):
        perm = torch.randperm(x.size()[0])
        
        x = autograd.Variable(x[perm[:batchSize]])
        y = autograd.Variable(y[perm[:batchSize]])
        f = self.forward(x)
        loss = ml.forward(f, y)
        loss.backward()
        opt.step()
    
        if histo :
            histo[0].append(loss.data.mean())
            ypred = torch.max(f, 1)[1]
            histo[1].append(torch.eq(ypred.data, y).float().mean())

def triLayerNN(inn, out):
    r = nn.Sequential(nn.Linear(inn,inn), nn.Linear(inn,inn), nn.Linear(inn,out))
    r.train = types.MethodType(trainer, r)
    return r


In [30]:
def createBiDictLabelToIndices(labels):
    r = bidict()
    for i, l  in enumerate(labels):
        r[l] = i
    return r

class LabelToOneHot(object):
    def __init__(self, labels):
        self.d = createBiDictLabelToIndices(labels)
    
    def toOneHot(self, labels):
        '''
            :param labels: the label you want to be transform into onehot
            :labels type: a list a label
            :rtype: a tensor len(labels)*numberOfAllLabels
        '''
        r = torch.zeros(len(labels), len(self.d))
        ind = torch.LongTensor([[self.d[l] for l in labels]]).t()
        return r.scatter_(1, ind, 1.)
    
    def fromOneHot(self, tensor):
        return [self.d.inv[i] for i in tensor.max(1)[1]]

In [38]:
d = LabelToOneHot({'test', 'lol', 'mdr', 'xd'})
t = d.toOneHot(['test', 'lol','mdr', 'lol','xd', 'lol','test', 'xd'])
print(t,d.fromOneHot(t))
t[[1,2,1,6]]


    1     0     0     0
    0     1     0     0
    0     0     1     0
    0     1     0     0
    0     0     0     1
    0     1     0     0
    1     0     0     0
    0     0     0     1
[torch.FloatTensor of size 8x4]
 ['test', 'lol', 'mdr', 'lol', 'xd', 'lol', 'test', 'xd']



 0  1  0  0
 0  0  1  0
 0  1  0  0
 1  0  0  0
[torch.FloatTensor of size 4x4]

In [83]:
class ICA(object):
    def __init__(data, graph, model, agg):
        '''
            :param data: the data to train the model
            :data type: panda data frame with at least a column "node" and "class", the rest are feature column
            unknown label class in data are None
            :param graph: the graph 
            :graph type: networkx graph of node->node
            :param model: a model that implement a train function
        '''
        self.data = data
        self.graph = graph
        self.model = model
        self.agg = agg
        self.embeder = LabelToOneHot(pd.unique(data[data["class"] != None]["class"]))
    
    def train(self):
        dt = self.data[data["class"].notna()]
        x = dt.drop(["node","class"])
        y = self.embeder.toOneHot(dt["class"])
        self.model.train(x,y)
        

In [70]:
m = triLayerNN(10,5)
m.train()

TypeError: trainer() missing 2 required positional arguments: 'x' and 'y'

In [79]:
d = data['cornell'][0]

In [86]:
pd.unique(d[d["class"] != None]["class"])

array(['student', 'project', 'course', 'staff', 'faculty'], dtype=object)

In [102]:
ind = d.sample(frac = 0.2).index
d.loc[ind, "class"] = None