In [4]:
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin


In [5]:
# Helpers for scoring a candidate split with a classification metric (misclassification error).

def get_classification(y):
    """
    Count the observations a majority-vote prediction would get right.

    parameters:

    y : sequence of hashable class labels (must support len())

    returns the number of occurrences of the most frequent label in `y`,
    or 0 when `y` is empty.
    """
    from collections import Counter

    if not len(y):
        return 0

    # Only the size of the largest class matters, so ties between labels
    # cannot affect the result.
    label_counts = Counter(y)
    return max(label_counts.values())

def error_on_split(x, y, split, min_group_frac=0.10):
    """
    Misclassification rate of a two-way threshold split.

    Observations with ``x < split`` form one group and those with
    ``x >= split`` the other. Each group predicts its most frequent label
    (counted with `get_classification`), and the error is the share of
    observations that disagree with their group's prediction.

    parameters:

    x : 1d vector of a particular predictor
    y : 1d vector of the response (categorical), same length as x
    split : split point of interest
    min_group_frac : smallest fraction of the observations that the smaller
        group must hold for the split to be "legal" (default 0.10, the value
        that used to be hard-coded)

    returns the classification error based on this split; returns 1 (the
    worst score) when the split is not legal or when there is no data.

    raises ValueError when x and y have different lengths.

    Example:

    ```py
    error_on_split(list(range(10)), [0,0,0,0,0,1,1,1,1,1], -1)    # 1 (illegal split)
    error_on_split(list(range(10)), [0,0,0,0,0,1,1,1,1,1], 4.5)   # 0.0
    ```
    """
    x = np.asarray(x)
    y = np.asarray(y)

    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (len(x), len(y))
        )

    n_obs = len(y)
    if n_obs == 0:
        # Nothing to classify: report the worst score instead of dividing by zero.
        return 1

    below = x < split
    above = x >= split
    n_below = int(np.count_nonzero(below))
    n_above = int(np.count_nonzero(above))

    # check if "legal" split
    # this is an extra condition so that the model doesn't have any splits with
    # too few in one group...
    if float(min(n_below, n_above)) / n_obs < min_group_frac:
        return 1

    n_correct = get_classification(y[below]) + get_classification(y[above])

    return 1 - (float(n_correct) / n_obs)


In [27]:
def best_split(x, y, psplit=error_on_split):
    """
    Find the threshold on a single predictor with the lowest split score.

    parameters:

    x : 1d vector of a particular predictor
    y : 1d vector of the response (categorical)
    psplit : callable ``psplit(x, y, split)`` returning a score where lower is
        better (defaults to `error_on_split`)

    Every distinct value of x except the smallest is tried as a threshold,
    with groups ``x < split`` and ``x >= split``. The smallest value is
    skipped because it would leave the "<" group empty. The largest value is
    a valid threshold (the ">=" group holds the observations equal to it), so
    predictors with only two distinct values can be split as well.

    returns a tuple: (split, metric score, size of the "<" group,
    size of the ">=" group). Ties go to the smallest threshold.

    raises ValueError when x has fewer than two distinct values.

    usage:
    best_split(list(range(10)), [0,0,0,0,0,1,1,1,1,1])
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # np.unique returns the sorted distinct values; drop the smallest one.
    candidates = np.unique(x)[1:]
    if len(candidates) == 0:
        raise ValueError("best_split needs at least two distinct values in x")

    # Score each candidate with the caller-supplied metric.
    scores = [psplit(x, y, threshold) for threshold in candidates]

    # First minimum wins, i.e. the smallest threshold among equal scores.
    best_idx = min(range(len(scores)), key=scores.__getitem__)
    split = candidates[best_idx]

    n_left = int(np.count_nonzero(x < split))
    n_right = int(np.count_nonzero(x >= split))

    return (split, scores[best_idx], n_left, n_right)

In [28]:
def best_feature_split(x, y):
    """
    Pick the predictor column and threshold with the lowest split score.

    parameters:

    x : 2d array-like (rows = observations, columns = predictors) or a
        1d vector holding a single predictor
    y : 1d vector of the response (categorical)

    `best_split` is run on every column and the column with the lowest
    metric wins; ties go to the lowest column index.

    returns a dict with keys
        'column'     : index of the winning column (0 for 1d input)
        'node'       : the threshold chosen on that column
        'metric'     : the score of that threshold
        'split_size' : (size of the "<" group, size of the ">=" group)

    raises ValueError when x has no columns. Any error raised by
    `best_split` for a column propagates unchanged.

    usage:

    ```py
    import numpy as np
    from sklearn import datasets

    iris = datasets.load_iris()

    X = iris.data[:100, :]
    y = iris.target[:100]
    best_feature_split(X[:, :], y)
    ```
    """
    # Accept plain lists as well as arrays, like the other helpers do.
    x = np.asarray(x)
    if x.ndim < 2:
        # Treat a single predictor as a one-column matrix so both cases
        # share the same code path.
        x = x.reshape(-1, 1)

    winner = None
    for col in range(x.shape[1]):
        node, metric, n_left, n_right = best_split(x[:, col], y)
        # Strict "<" keeps the earliest column on ties.
        if winner is None or metric < winner['metric']:
            winner = {
                'column': col,
                'node': node,
                'metric': metric,
                'split_size': (n_left, n_right)
            }

    if winner is None:
        raise ValueError("x has no feature columns to split on")
    return winner

In [37]:
# Scratch check of the hinge computation: element-wise max(a - 4, 0).
a = np.arange(1, 7)
np.maximum(a - 4, np.zeros(a.shape))




array([ 0.,  0.,  0.,  0.,  1.,  2.])

In [54]:

class Hinge(BaseEstimator, TransformerMixin):
    """
    Transformer that appends a pair of hinge features built on one column.

    For the selected column ``c`` and hinge point ``h`` the two added
    columns are ``max(c - h, 0)`` and ``max(h - c, 0)``.

    Parameters
    ----------

    mask: index of the column the hinge is computed on (default 0)
    hinge: the hinge point to calculate; `fit` overwrites it with the best
        split found on the masked column
    psplit: callable ``psplit(x, y, split)`` returning a score where lower is
        better; used by `fit` to choose the hinge point
    """

    @staticmethod
    def best_split(x, y, psplit=error_on_split):
        """
        Find the threshold on a single predictor with the lowest split score.

        parameters:

        x : 1d vector of a particular predictor
        y : 1d vector of the response (categorical)
        psplit : callable ``psplit(x, y, split)``, lower is better

        Every distinct value of x except the smallest is tried as a
        threshold (groups are ``x < split`` and ``x >= split``; the smallest
        value would leave the "<" group empty).

        returns a tuple: (split, metric score, size of the "<" group,
        size of the ">=" group). Ties go to the smallest threshold.

        raises ValueError when x has fewer than two distinct values.

        usage:
        Hinge.best_split(list(range(10)), [0,0,0,0,0,1,1,1,1,1])
        """
        x = np.asarray(x)
        y = np.asarray(y)

        # np.unique returns the sorted distinct values; drop the smallest one.
        candidates = np.unique(x)[1:]
        if len(candidates) == 0:
            raise ValueError("best_split needs at least two distinct values in x")

        scores = [psplit(x, y, threshold) for threshold in candidates]
        # First minimum wins, i.e. the smallest threshold among equal scores.
        best_idx = min(range(len(scores)), key=scores.__getitem__)
        split = candidates[best_idx]

        n_left = int(np.count_nonzero(x < split))
        n_right = int(np.count_nonzero(x >= split))

        return (split, scores[best_idx], n_left, n_right)

    def __init__(self, mask=0, hinge=None, psplit=error_on_split):
        self.mask = mask
        self.hinge = hinge
        self.psplit = psplit

    def fit(self, x, y=None):
        """
        Learn the hinge point from the masked column of `x` and labels `y`.

        `y` defaults to None only to match the estimator call signature;
        it is required, because the hinge point is chosen by scoring splits
        against it.

        returns self, so that `fit_transform` and chained calls work.
        """
        if y is None:
            raise ValueError("Hinge.fit needs y to choose the hinge point")

        column = np.asarray(x)[:, self.mask]

        # Score candidate thresholds with the configured metric.
        hinge_point, _, _, _ = self.best_split(column, y, psplit=self.psplit)
        self.hinge = hinge_point
        return self

    def transform(self, x):
        """
        Return `x` with the two hinge columns appended on the right.

        raises ValueError when no hinge point is available yet.
        """
        if self.hinge is None:
            raise ValueError(
                "Hinge has no hinge point; call fit() first or pass hinge=..."
            )

        x = np.asarray(x)
        x1 = x[:, self.mask]
        # Comparing against 0.0 keeps the result floating point, as before.
        pos_hinge = np.maximum(x1 - self.hinge, 0.0)
        neg_hinge = np.maximum(self.hinge - x1, 0.0)

        return np.hstack([x, pos_hinge.reshape(-1, 1), neg_hinge.reshape(-1, 1)])
        
        

In [55]:
import numpy as np
from sklearn import datasets

# Demo: learn a hinge point on the iris data.
iris = datasets.load_iris()

# Keep only the first 100 rows of the features and of the labels.
X = iris.data[:100, :]
y = iris.target[:100]

# Default Hinge() uses mask=0, so the hinge is learned on column 0.
hinge = Hinge()
hinge.fit(X, y)

In [56]:
# Hinge point stored on the transformer by fit().
hinge.hinge

5.5

In [57]:
# Original columns followed by the two hinge columns: max(x0 - hinge, 0) and max(hinge - x0, 0).
hinge.transform(X)

array([[ 5.1,  3.5,  1.4,  0.2,  0. ,  0.4],
       [ 4.9,  3. ,  1.4,  0.2,  0. ,  0.6],
       [ 4.7,  3.2,  1.3,  0.2,  0. ,  0.8],
       [ 4.6,  3.1,  1.5,  0.2,  0. ,  0.9],
       [ 5. ,  3.6,  1.4,  0.2,  0. ,  0.5],
       [ 5.4,  3.9,  1.7,  0.4,  0. ,  0.1],
       [ 4.6,  3.4,  1.4,  0.3,  0. ,  0.9],
       [ 5. ,  3.4,  1.5,  0.2,  0. ,  0.5],
       [ 4.4,  2.9,  1.4,  0.2,  0. ,  1.1],
       [ 4.9,  3.1,  1.5,  0.1,  0. ,  0.6],
       [ 5.4,  3.7,  1.5,  0.2,  0. ,  0.1],
       [ 4.8,  3.4,  1.6,  0.2,  0. ,  0.7],
       [ 4.8,  3. ,  1.4,  0.1,  0. ,  0.7],
       [ 4.3,  3. ,  1.1,  0.1,  0. ,  1.2],
       [ 5.8,  4. ,  1.2,  0.2,  0.3,  0. ],
       [ 5.7,  4.4,  1.5,  0.4,  0.2,  0. ],
       [ 5.4,  3.9,  1.3,  0.4,  0. ,  0.1],
       [ 5.1,  3.5,  1.4,  0.3,  0. ,  0.4],
       [ 5.7,  3.8,  1.7,  0.3,  0.2,  0. ],
       [ 5.1,  3.8,  1.5,  0.3,  0. ,  0.4],
       [ 5.4,  3.4,  1.7,  0.2,  0. ,  0.1],
       [ 5.1,  3.7,  1.5,  0.4,  0. ,  0.4],
       [ 4