In [1]:
import pandas as pd
import numpy as np
import torch as pt
import tqdm

from ml_lib.clusters.root_cluster import RootCluster as Root
from ml_lib.clusters.data_cluster import DataCluster as Data
from ml_lib.clusters.dense_cluster import DenseCluster as Dense

from ml_lib.utils.loss_collapser import Linear

# Create new tensors as float32 on the GPU when CUDA is usable, otherwise on
# the CPU, so the notebook still runs on a kernel without a GPU.
if pt.cuda.is_available():
    pt.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    pt.set_default_tensor_type('torch.FloatTensor')

In [6]:
class Controller():
    """Registry and training driver for a set of named, linked clusters.

    Clusters are stored in ``self.clusters`` keyed by their ``name``
    attribute. On every learning step the controller gathers the loss of each
    cluster, collapses those losses into one value and passes that value to
    the ``learn`` method of every cluster.
    """

    def __init__(self,
                 collapser = Linear
                ):
        """Create an empty controller.

        Args:
            collapser: zero-argument callable returning an object with a
                ``collapse`` method. It is called once here; the resulting
                object is used by ``get_loss``.
        """
        self.clusters = {}
        self.Collapser = collapser()

    def add_cluster(self, cluster):
        """Register ``cluster`` under ``cluster.name``.

        Raises:
            ValueError: if a cluster with that name is already registered.
        """
        if cluster.name in self.clusters:
            # ValueError is a subclass of Exception, so callers that caught
            # the previous generic Exception keep working.
            raise ValueError('Cluster names %s already in model' % cluster.name)

        self.clusters[cluster.name] = cluster

    def link_clusters(self, from_cluster_name, to_cluster_name, **kwargs):
        """Link two registered clusters in both directions.

        The source cluster receives an ``'output'`` link to the target and the
        target receives an ``'input'`` link to the source. ``kwargs`` are
        forwarded unchanged to both ``add_link`` calls.

        Raises:
            KeyError: if either name is not registered.
        """
        # Resolve both names first so an unknown name fails before any
        # cluster has been modified.
        source = self.clusters[from_cluster_name]
        target = self.clusters[to_cluster_name]
        source.add_link(target, 'output', **kwargs)
        target.add_link(source, 'input', **kwargs)

    def add_link(self, cluster, link_cluster_name, link_type, **kwargs):
        """Register a new cluster and link it to an existing one.

        Args:
            cluster: the cluster to register.
            link_cluster_name: name of the cluster to link with.
            link_type: ``'output'`` if ``cluster`` feeds the named cluster,
                ``'input'`` if the named cluster feeds ``cluster``.
            **kwargs: forwarded to ``link_clusters``.

        Raises:
            ValueError: if ``link_type`` is neither ``'input'`` nor
                ``'output'``, or if ``cluster.name`` is already registered.
            KeyError: if ``link_cluster_name`` is not a known cluster.
        """
        # Validate everything before registering, so a bad call does not
        # leave a stray, unlinked cluster behind in self.clusters.
        if link_type not in ('input', 'output'):
            raise ValueError('%s is not a valid link type, use "input" or "output"' % link_type)
        # A cluster may be linked to itself: its name only becomes known once
        # it has been registered below.
        if link_cluster_name != cluster.name and link_cluster_name not in self.clusters:
            raise KeyError(link_cluster_name)

        self.add_cluster(cluster)
        if link_type == 'output':
            self.link_clusters(cluster.name, link_cluster_name, **kwargs)
        else:
            self.link_clusters(link_cluster_name, cluster.name, **kwargs)

    def init_clusters(self, reinit = False):
        """Call ``init_cluster(reinit=reinit)`` on every registered cluster."""
        for cluster in self.clusters.values():
            cluster.init_cluster(reinit = reinit)

    def train_model(self, iters):
        """Run ``iters`` learning iterations behind a progress bar."""
        # tqdm.tnrange is deprecated; tqdm.auto picks the notebook widget when
        # one is available and falls back to the console bar otherwise.
        from tqdm.auto import trange

        progress = trange(iters)
        for _ in progress:
            self.learning_iter(progress)

    def learning_iter(self, t):
        """Perform one learning step and report the loss on progress bar ``t``."""
        loss = self.get_loss()
        for cluster in self.clusters.values():
            cluster.learn(loss)
        t.set_postfix({'loss': loss.detach().cpu().numpy()})

    def get_loss(self):
        """Return the collapsed loss over all clusters.

        Clusters are primed, asked for their loss, then deprimed. Clusters
        whose ``get_loss`` returns ``None`` are skipped; each remaining loss
        is collapsed individually, the results are stacked, and the stack is
        collapsed once more.

        Raises:
            RuntimeError: if no cluster returned a loss.
        """
        self.prime_clusters()
        try:
            losses = [cluster.get_loss() for cluster in self.clusters.values()]
        finally:
            # Deprime even when a cluster fails, so an error does not leave
            # the clusters primed for the next call.
            self.deprime_clusters()

        collapsed = [
            self.Collapser.collapse(loss_val)
            for loss_val in losses
            if loss_val is not None
        ]
        if not collapsed:
            # pt.stack([]) would raise a RuntimeError with an unrelated
            # message; fail with the actual cause instead.
            raise RuntimeError('No cluster returned a loss, nothing to collapse')
        return self.Collapser.collapse(pt.stack(collapsed))

    def prime_clusters(self, reprime = False):
        """Call ``prime_cluster(reprime=reprime)`` on every registered cluster."""
        for cluster in self.clusters.values():
            cluster.prime_cluster(reprime = reprime)

    def deprime_clusters(self):
        """Call ``deprime_cluster()`` on every registered cluster."""
        for cluster in self.clusters.values():
            cluster.deprime_cluster()

In [7]:
# Load the house data indexed by `id`, parse the `date` column, and drop
# every column whose name ends in '15'.
# `pd.datetime` was removed in pandas 2.0 and `date_parser` is deprecated, so
# the dates are parsed with pd.to_datetime and an explicit format instead;
# this works on both old and new pandas.
dataset = (
    pd.read_csv('data_files/kc_house_data.csv', index_col = 'id')
    .assign(date = lambda frame: pd.to_datetime(frame['date'], format = '%Y%m%dT%H%M%S'))
    .loc[:, lambda frame: [col for col in frame.columns if not col.endswith('15')]]
)

In [12]:
# Assemble the model: one data cluster and one dense cluster, linked in both
# directions, then initialise every cluster.
control = Controller()

# Data cluster exposing the four columns used by the links below.
control.add_cluster(
    cluster = Data(
        'data_cluster',
        dataset[['price', 'bedrooms', 'bathrooms', 'sqft_living']],
    )
)

# Register the dense cluster with data_cluster as its input, restricted to
# these three columns.
# NOTE(review): the positional 1 is presumably the layer's output width — confirm.
control.add_link(
    cluster = Dense('reg_cluster', 1, learner_params = dict(learn_rate = 0.1)),
    link_cluster_name = 'data_cluster',
    link_type = 'input',
    data_cols = ['sqft_living', 'bedrooms', 'bathrooms'],
)

# Link reg_cluster back to data_cluster on the 'price' column.
control.link_clusters(
    from_cluster_name = 'reg_cluster',
    to_cluster_name = 'data_cluster',
    data_cols = ['price'],
)

control.init_clusters()

In [13]:
# Run 10000 learning iterations; the progress bar's postfix shows the current loss.
control.train_model(10000)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [14]:
# Display the `coefs` attribute of the 'reg_cluster' cluster after training.
control.clusters['reg_cluster'].coefs

{'weights': tensor([[ 263.3921],
         [-184.4671],
         [ -25.5456]]), 'bias': tensor([-26.8924])}