Design idea:

A graph can represent the pipeline. 

1.  Build the pipeline naively
2.  Reorder the pipeline steps by each node's distance from the `root` node

In [22]:
import numpy as np
import pandas as pd

from hinge import Hinge, error_on_split
from interaction import Interaction

from sklearn import datasets
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# using svm as per scikit-feature repo
from sklearn.decomposition import FactorAnalysis # we shall use factor analysis to fix the size of final modelling ds.
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# for sampling
import random # use random.choice? and random.sample for interactions

# use networkx to keep track of changes to our data, so we can recreate things...
import networkx as nx
import matplotlib.pyplot as plt
from graph_utils import *
import itertools
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

# Keyword arguments for the synthetic classification problem; kept in a dict
# so the exact setup can be inspected or reused later.
problem_setup = dict(
    n_samples=1000,
    n_features=20,
    n_informative=8,
    n_redundant=8,
    n_classes=3,
    random_state=0,
)

X, y = make_classification(**problem_setup)
X = X.astype(float)

# number of samples and number of features
n_samples, n_features = X.shape

In [3]:
import pandas as pd # for making sure and exploring created datasets...

# One generated name per base feature: base_0, base_1, ...
base_names = ["base_" + str(x) for x in range(n_features)]

# The frame gets its own copy of the names so later edits to `base_names`
# cannot affect the column labels.
X_df = pd.DataFrame(X, columns=base_names[:])
X_df.head()

Unnamed: 0,base_0,base_1,base_2,base_3,base_4,base_5,base_6,base_7,base_8,base_9,base_10,base_11,base_12,base_13,base_14,base_15,base_16,base_17,base_18,base_19
0,-1.561416,-0.302123,-1.626526,-0.808229,0.415516,1.60608,1.471883,-1.677559,5.249528,1.048403,-2.459287,-2.809152,-1.315362,-2.930697,0.120546,-0.759469,-2.690498,1.952268,6.930261,0.428036
1,-0.745867,-1.085664,-1.424702,1.785155,0.155847,-2.346231,-0.789581,2.918363,0.139034,-1.310294,0.02839,3.175972,-1.645095,1.452931,1.912759,-2.638561,-0.175355,1.094468,-3.710453,-1.011041
2,2.209442,-0.210728,0.237275,0.908409,0.03954,1.349943,2.685406,-3.007716,-3.675619,1.099558,-1.161589,-2.289721,2.003953,0.206058,-3.070732,1.392493,0.658975,-2.729099,1.860675,-1.611407
3,-0.390274,-1.501618,0.562233,-0.60103,-5.216131,-0.682441,-4.833625,10.759309,-2.81049,0.824669,4.931424,6.417655,-0.18994,1.523022,2.051892,-7.072514,2.589547,-2.276084,-10.504724,-0.750008
4,-1.523687,-2.82278,-0.941431,-3.189133,0.479385,1.655941,-0.655178,5.341404,4.316366,1.35317,0.343184,2.012068,0.96228,-1.108046,4.737343,-2.803268,1.014875,3.380321,1.554379,1.937787


Feature Search
=======

We have 4 possible situations:

1. Grow - split: assume a split always adds two children
2. Grow - interaction
3. Remove - split: assume removing a split always removes both children
4. Remove - interaction

Proposal Distributions
------------------------

Assume every currently possible action is equally likely, i.e. the proposal distribution is uniform over the possible actions.

Then each proposal is simply:

$$P(x^*|x^{(t)}) = \frac{1}{\text{num possible actions}}$$


------

In this setting there is no need for reversible-jump MCMC (RJMCMC) yet, because the transformations are parameter-free.

This means a plain Metropolis–Hastings (MH) sampler is sufficient.

In [53]:
"""
Graph details...

hinge parents will have attribute name hinge_children with value equal to the node name of the children
interaction parents will have attribute name interaction_children with value equal to list of all nodes that are children
"""

# Build the base graph: a "root" node with one outgoing edge to every base
# feature.
G = nx.DiGraph()
G.add_node("root")

for col in base_names:
    # add_edge creates the target node when it is not in the graph yet.
    # Node attributes could be attached with an explicit add_node call, eg
    #G.add_node(col, attribute='here')
    G.add_edge("root", col)

# Example: add an attribute to node base_0
#G = set_graph_node_attributes(G, 'base_0', {'hinge_children': ['a', 'b']})




In [54]:
def get_all_nodes_by_attribute(G, attri_key):
    """Return the nodes of ``G`` whose attribute dict contains ``attri_key``.

    Iterates the ``(node, attributes)`` pairs found under the ``'nodes'`` entry
    of ``graph_to_dict(G)`` and keeps the node names in that order. Only the
    presence of the key matters, not the value stored under it.
    """
    matching = []
    for name, attributes in graph_to_dict(G)['nodes']:
        if attri_key in attributes.keys():
            matching.append(name)
    return matching

def get_graph_attributes_by_key(G, attri_key):
    """Collect the values stored under ``attri_key`` across the nodes of ``G``.

    Iterates the ``(node, attributes)`` pairs found under the ``'nodes'`` entry
    of ``graph_to_dict(G)``. Nodes that lack the key, or that store ``None``
    under it, contribute nothing; every other value is returned in iteration
    order.
    """
    collected = []
    for _, attributes in graph_to_dict(G)['nodes']:
        value = attributes.get(attri_key, None)
        if value is not None:
            collected.append(value)
    return collected

In [55]:
def set_graph_node_attributes(G1, node, attri_dict):
    """
    Return a copy of ``G1`` with the attributes in ``attri_dict`` set on ``node``.

    Parameters
    ----------
    G1 : networkx graph
        Source graph. It is copied once up front and only the copy is modified.
    node : hashable
        Name of the node that receives the attributes.
    attri_dict : dict
        Mapping of attribute name -> attribute value to set on ``node``.

    Returns
    -------
    The modified copy of ``G1``.
    """
    # Work on a single private copy; it is already independent of G1, so it
    # can be returned directly without copying the whole graph a second time.
    G = G1.copy()

    for attri_name, attri_value in attri_dict.items():
        # NOTE(review): (G, name, values) is the networkx 1.x argument order;
        # networkx >= 2.0 expects (G, values, name). Confirm against the
        # installed networkx version.
        nx.set_node_attributes(G, attri_name, {node: attri_value})
    return G

In [56]:
"""
Migrate this to another module in the future

Details
-------

### hinge

**parent node**

Will have details of the children in "hinge_children". 
That is...

```
Hinge(mask=[child1, child2])
```

### interaction

**child node**

Will have details of the parent in "interaction_parent"
That is...

```
Interaction(['interaction1', 'interaction2'])
```
"""

def get_remove_hinge_candidates(G):
    """Return the hinge parents that may be removed from ``G``.

    A node qualifies when it carries the "hinge_children" attribute and has
    two or fewer descendants in the graph.
    """
    removable = []
    for parent in get_all_nodes_by_attribute(G, 'hinge_children'):
        # keep only hinge parents whose total number of descendants is 2 or less
        if len(nx.descendants(G, parent)) <= 2:
            removable.append(parent)
    return removable

def get_grow_hinge_candidates(G):
    """Return every node of ``G`` that does not yet have a "hinge_children" attribute.

    NOTE(review): no node is special-cased, so "root" is returned as a
    candidate too -- confirm that this is intended.
    """
    already_hinged = get_all_nodes_by_attribute(G, 'hinge_children')
    candidates = []
    for node in G.nodes():
        if node in already_hinged:
            continue
        candidates.append(node)
    return candidates

def get_remove_interact_candidates(G):
    """Return the interaction nodes that may be removed from ``G``.

    A node qualifies when it carries the "interaction_parent" attribute and
    has no descendants in the graph.
    """
    removable = []
    for node in get_all_nodes_by_attribute(G, 'interaction_parent'):
        # only interaction nodes with nothing below them can be removed
        if len(nx.descendants(G, node)) == 0:
            removable.append(node)
    return removable
    
def get_grow_interact_candidates(G):
    """Return the node pairs of ``G`` that are candidates for a new interaction.

    Candidates are every unordered pair of distinct nodes followed by each
    node paired with itself, minus the pairs already recorded under a
    "hinge_children" attribute. Each candidate is returned as a ``set``, so a
    self-pair is a one-element set.

    NOTE(review): the excluded pairs are read from "hinge_children", not from
    "interaction_parent" -- confirm that this is the intended key.
    """
    nodes = G.nodes()

    # distinct pairs first, then the self-pairs
    candidates = list(itertools.combinations(nodes, 2))
    candidates.extend(zip(nodes, nodes))

    excluded = [set([first, second])
                for first, second in get_graph_attributes_by_key(G, 'hinge_children')]

    remaining = []
    for first, second in candidates:
        pair = set([first, second])
        if pair not in excluded:
            remaining.append(pair)
    return remaining

In [58]:
# begin trying stuff
# start simulations!
def spawn_possible_actions(G):
    """List the ``(action, transform)`` moves that currently have a candidate in ``G``.

    Each move is probed with its candidate finder and kept when the finder
    returns something truthy. The result follows the fixed probe order below.
    """
    probes = (
        (("remove", "interact"), get_remove_interact_candidates),
        (("remove", "hinge"), get_remove_hinge_candidates),
        (("grow", "hinge"), get_grow_hinge_candidates),
        (("grow", "interact"), get_grow_interact_candidates),
    )
    return [move for move, find_candidates in probes if find_candidates(G)]

spawn_possible_actions(G)

[('grow', 'hinge'), ('grow', 'interact')]

In [59]:
def spawn_action_candidates(G, action, transform):
    """Return the candidates in ``G`` for the given ``(action, transform)`` move.

    Parameters
    ----------
    G : networkx graph
        Graph passed through to the candidate finder.
    action : str
        Either 'grow' or 'remove'.
    transform : str
        Either 'hinge' or 'interact'.

    Returns
    -------
    Whatever the matching ``get_<action>_<transform>_candidates`` function
    returns for ``G``.

    Raises
    ------
    ValueError
        If ``action`` or ``transform`` is not one of the supported values.
        ``ValueError`` is an ``Exception`` subclass, so callers that catch
        ``Exception`` keep working.
    """
    # Validate both arguments up front so the error names the offending value.
    if action not in ('grow', 'remove'):
        raise ValueError(
            "invalid action {!r}: expected 'grow' or 'remove'".format(action))
    if transform not in ('hinge', 'interact'):
        raise ValueError(
            "invalid transform {!r}: expected 'hinge' or 'interact'".format(transform))

    if action == 'grow':
        if transform == 'hinge':
            return get_grow_hinge_candidates(G)
        return get_grow_interact_candidates(G)

    if transform == 'hinge':
        return get_remove_hinge_candidates(G)
    return get_remove_interact_candidates(G)

In [60]:
def spawn_new_model(G):
    """Randomly propose one move on ``G``.

    First draws an ``(action, transform)`` move from
    ``spawn_possible_actions(G)``, then draws one candidate for that move from
    ``spawn_action_candidates``. Both draws use ``random.choice``.

    Returns a dict with the keys 'action', 'transform' and 'selected_node'.
    """
    action, transform = random.choice(spawn_possible_actions(G))
    selected_node = random.choice(spawn_action_candidates(G, action, transform))
    return dict(action=action, transform=transform, selected_node=selected_node)
    

In [62]:
def spawn_new_pipeline(G):
    """
    Build the list of pipeline steps described by the networkx graph ``G``.

    Every node with a "hinge_children" attribute contributes a ``Hinge`` step
    built from that attribute, and every node with an "interaction_parent"
    attribute contributes an ``Interaction`` step built from that attribute.
    The steps are then ordered by the shortest-path length from "root" to the
    node they belong to, nearest first.

    Usage for shortest path:
    print(nx.shortest_path_length(G,source=0,target=4))

    Returns
    -------
    list of ``(step_name, transformer, node)`` tuples, sorted by distance of
    ``node`` from "root". Steps at the same distance keep their build order
    (hinge steps before interaction steps).
    """
    # Materialise once: the node list is scanned twice below.
    g_nodes = list(graph_to_dict(G)['nodes'])

    # add all hinge nodes...
    hinge_info = [("hinge_{}".format(node), Hinge(attr['hinge_children']), node)
                  for node, attr in g_nodes if 'hinge_children' in attr.keys()]

    # add all interaction nodes...
    # The parents live in the node's attribute dict, not on the node name.
    interact_info = [("interact_{}".format('_'.join(attr['interaction_parent'])),
                      Interaction(attr['interaction_parent']), node)
                     for node, attr in g_nodes if 'interaction_parent' in attr.keys()]

    # join all together
    pipeline = hinge_info + interact_info

    # sort the pipeline by distance of each step's node (third tuple element)
    # from root; sorted() returns a new list, so its result is what we return.
    return sorted(
        pipeline,
        key=lambda step: nx.shortest_path_length(G, source="root", target=step[2]))