In [1]:
import json
import pybrain as pb
from pybrain.datasets.classification import ClassificationDataSet
from pybrain.utilities import percentError
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer
import numpy as np

In [2]:
class Pipeline:
    def __init__(self, in_stream):
        self._processors = []
        self._in_stream = in_stream

    def add_processor(self, processor):
        if callable(processor):
            self._processors.append(processor)

    def add_processors(self, processors):
        for processor in processors:
            self.add_processor(processor)

    def process(self):
        pipeline = self._in_stream
        for processor in self._processors:
            pipeline = processor(pipeline)
        return pipeline

In [3]:
def json_parser(lines):
    for line in lines:
        yield json.loads(line)

In [4]:
with open('/home/hadoop/data/nn_ready/label_idx.json', 'r') as fp:
    label_idx = json.load(fp)
    
label_num = len(label_idx)

In [5]:
with open('/home/hadoop/data/nn_ready/word_idx.json', 'r') as fp:
    feature_idx = json.load(fp)
    
feature_num = len(feature_idx)

In [6]:
train_ds = ClassificationDataSet(feature_num, target=label_num)
test_ds = ClassificationDataSet(feature_num, target=label_num)

In [7]:
def to_numpy(objs):
    for obj in objs:
        idxs = list(map(lambda x: x[0], obj['Features']))
        values = list(map(lambda x: x[1], obj['Features']))        

        feature_vec = np.zeros(feature_num, dtype=np.uint8)
        feature_vec[idxs] = values
        
        label_vec = np.zeros(label_num, dtype=np.uint8)
        label_vec[obj['Labels']] = [1]*len(obj['Labels'])
        
        yield feature_vec, label_vec

In [8]:
with open('/home/hadoop/data/nn_ready/sparse_data.json', 'r') as fp:
    print(sum(1 for _ in fp))

102797


In [9]:
with open('/home/hadoop/data/nn_ready/sparse_data.json', 'r') as fp:
    pipe = Pipeline(fp)
    pipe.add_processors([json_parser, to_numpy])
    i = 0
    for feature_vec, labels in pipe.process():
        train_ds.appendLinked(feature_vec, labels)
        i += 1
        if i % 5000 == 0:
            print(i)
        if i == 70000:
            break
    for feature_vec, labels in pipe.process():
        test_ds.appendLinked(feature_vec, labels)
            

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000


In [10]:
test_ds._convertToOneOfMany()
train_ds._convertToOneOfMany()

In [16]:
fnn = buildNetwork(test_ds.indim, 10, test_ds.outdim, outclass=SoftmaxLayer)

In [15]:
del fnn

In [17]:
trainer = BackpropTrainer(fnn, dataset=train_ds, verbose=True)

In [18]:
for i in range(10):
    trainer.trainEpochs(1)
    trnres = percentError(trainer.testOnClassData(), test_ds)
    print(trnres)

Total error:  0.00197431636342
0.0014285714285714286
Total error:  0.00197191554871
0.0014285714285714286
Total error:  0.00197210477803
0.0014285714285714286
Total error:  0.00197213694148
0.0014285714285714286


KeyboardInterrupt: 

In [19]:
trainer.trainUntilConvergence(maxEpochs=10000, verbose=True)

Total error:  0.001974007645
Total error:  0.00197408829047
Total error:  0.00197399187131
Total error:  0.00197392637212
Total error:  0.00197387907445
Total error:  0.00197396459961
Total error:  0.00197394292491
Total error:  0.00197393511444
Total error:  0.00197400503318
Total error:  0.00197387662739
Total error:  0.00197387498083
Total error:  0.00197396045167
Total error:  0.00197402170474
Total error:  0.00197949390856
Total error:  0.00200069531174
Total error:  0.00200099887855
Total error:  0.00200156071928
Total error:  0.0020021836477


KeyboardInterrupt: 