## Data Preprocessing

In [6]:
import pandas as pd
import numpy as np

Import the bike sharing dataset from UC Irvine.

In [90]:
from ucimlrepo import fetch_ucirepo

# fetch bike share dataset
bike_data = fetch_ucirepo(id=275)

# initialize dataframes (as pandas dataframes)
X = bike_data.data.features
y = bike_data.data.targets

# metadata
# print(bike_data.metadata)

# variable information
print(bike_data.variables)

          name     role         type demographic  \
0      instant       ID      Integer        None   
1       dteday  Feature         Date        None   
2       season  Feature  Categorical        None   
3           yr  Feature  Categorical        None   
4         mnth  Feature  Categorical        None   
5           hr  Feature  Categorical        None   
6      holiday  Feature       Binary        None   
7      weekday  Feature  Categorical        None   
8   workingday  Feature       Binary        None   
9   weathersit  Feature  Categorical        None   
10        temp  Feature   Continuous        None   
11       atemp  Feature   Continuous        None   
12         hum  Feature   Continuous        None   
13   windspeed  Feature   Continuous        None   
14      casual    Other      Integer        None   
15  registered    Other      Integer        None   
16         cnt   Target      Integer        None   

                                          description units mis

## Regression Tree on Factor Features - Variance Splitting

One way to choose features for splitting is *weighted variance*. This criteria is appropriate for **level predictor variables**. Essentially, for each predictor, we compute the variance of the target for each value of the predictor variable then compute the weighted sum of these variances to get the weighted variance for the predictor. We then split the predictor space at each node of the tree by picking the feature with the lowest weighted variance and splitting at the level that minimizes the variance of the target feature.

In [223]:
print(pd.__version__)
pd.options.mode.copy_on_write = True

2.1.4


In [91]:
# subsetting X to only include 5 features of interest
x = X[['season', 'holiday', 'weekday', 'workingday', 'weathersit']]

# appending the target variable to x
x['cnt'] = y
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['cnt'] = y


Unnamed: 0,season,holiday,weekday,workingday,weathersit,cnt
0,1,0,6,0,1,16
1,1,0,6,0,1,40
2,1,0,6,0,1,32
3,1,0,6,0,1,13
4,1,0,6,0,1,1


In [23]:
# function to compute feature variances
def var(data, feature, tgt='cnt'):
    feat_vals = np.unique(data[feature])   # unique feature values
    feat_var = 0
    for val in feat_vals:
        subset = data[data[feature] == val]
        # calc weighted var of subsets
        var_val = (len(subset) / len(data)) * np.var(subset[tgt], ddof=1)    # ddof sets degrees of freedom
        # iterate weighted var of feature
        feat_var += var_val
    return feat_var

In [7]:
var(x, 'season', 'tgt')

30735.628940122348

In [110]:
# classifier algorithm
def classifier(data, origData, features,
               minInstances,    # sets minimum number of instances per node as early stopping criteria
               parentNodeClass = None,
               tgt='cnt'):
    # if all target values have same value, return mean of the target
    if len(data) <= int(minInstances):
        return data[tgt].mean()

    # if dataset is empty, return mean of target from the original dataset
    elif len(data) == 0:
        return origData[tgt].mean()

    # if feature space is empty, return mean of target from the direct parent node
    #     - the parent node is the node which called the current run of the function
    elif len(features) == 0:
        return parentNodeClass

    else:
        # set default value for node (i.e. mean of target)
        parentNodeClass = np.mean(data[tgt])
        # select best split feature
        itemVals = [var(data, feature) for feature in features]
        bestFeatIndex = np.argmin(itemVals)    # select min variance feature by index
        bestFeat = features[bestFeatIndex]

        # create tree structure
        tree = {bestFeat:{}}

        # remove the best split feature from the feature space
        features = [i for i in features if i != bestFeat]    # must copy list to preserve features for other levels

        for val in np.unique(data[bestFeat]):
            subData = data[data[bestFeat] == val]    # subset dataset on the minimum variance feature
            subTree = classifier(subData, origData, feats, minInstances, parentNodeClass = parentNodeClass, tgt = tgt)    # recurse Classifier() on the subset to make a new split
            tree[bestFeat][val] = subTree    # add the subTree grown from subData to the root node

        return tree

In [111]:
tree = classifier(x, x, list(x.columns[:-1]), 5)

In [112]:
from pprint import pprint
pprint(tree)

{'season': {1: {'weathersit': {1: {'weekday': {0: {'holiday': {0: {'workingday': {0: 104.45232815964523}}}},
                                               1: {'holiday': {0: {'workingday': {1: 112.53503184713375}},
                                                               1: {'workingday': {0: 97.62650602409639}}}},
                                               2: {'holiday': {0: {'workingday': {1: 147.22379603399435}},
                                                               1: {'workingday': {0: 97.66666666666667}}}},
                                               3: {'holiday': {0: {'workingday': {1: 124.20056497175142}}}},
                                               4: {'holiday': {0: {'workingday': {1: 128.58073654390935}}}},
                                               5: {'holiday': {0: {'workingday': {1: 130.40882352941176}}}},
                                               6: {'holiday': {0: {'workingday': {0: 117.95620437956204}}}}}},
                       

## Regresseion Tree on Continuous Variables

In [114]:
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 

# variable information 
print(heart_disease.variables) 


feats = ['age', 'trestbps', 'chol']    # list of features

df = X[feats]
df['num'] = y

        name     role         type demographic  \
0        age  Feature      Integer         Age   
1        sex  Feature  Categorical         Sex   
2         cp  Feature  Categorical        None   
3   trestbps  Feature      Integer        None   
4       chol  Feature      Integer        None   
5        fbs  Feature  Categorical        None   
6    restecg  Feature  Categorical        None   
7    thalach  Feature      Integer        None   
8      exang  Feature  Categorical        None   
9    oldpeak  Feature      Integer        None   
10     slope  Feature  Categorical        None   
11        ca  Feature      Integer        None   
12      thal  Feature  Categorical        None   
13       num   Target      Integer        None   

                                          description  units missing_values  
0                                                None  years             no  
1                                                None   None             no  
2              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num'] = y


In [245]:
def rmse(data, val, feat, tgt):
    lhs = data[data[feat] <= val][tgt]
    rhs = data[data[feat] > val][tgt]

    lhs_pred = lhs.mean()
    rhs_pred = rhs.mean()

    residuals = []
    for obs in lhs: residuals.append((obs - lhs_pred)**2)
    for obs in rhs: residuals.append((obs - rhs_pred)**2)

    return np.sqrt(np.sum(residuals) / len(residuals))

In [146]:
rmse(50, 'age', df, 'num')

1.204928034421781

In [359]:
def branch(data, feats, tgt, pNodeClass=None, minInstances=None, tier=0, maxDepth=None):
    print(f'Tier: {tier}, Features: {feats}')
    pred = data[tgt].mean()

    if pNodeClass is None: 
        pNodeClass = pred
    elif pred == pNodeClass:
        return pNodeClass

    if tier == maxDepth: return pred
    
    if len(data) <= int(minInstances):
        return pred
    else:
        tier += 1
        splits = dict((feat, '') for feat in feats)
        for feat in feats:
            minErr = 0
            for i in range(len(data[feat]) - 1):
                val = (data[feat].iloc[i] + data[feat].iloc[i+1])/2    # midpoint

                try:
                    err = rmse(data, val, feat, tgt)
                    pass
                except ValueError:
                    print(f'{feat}: {data[feat].iloc[i]}')
                    
                if minErr == 0 or err < minErr:
                    minErr = err
                    splits[feat] = (val, minErr)

        bestFeat = feats[np.argmin([splits[feat][1] for feat in feats])]    # minimum rmse feature
        bestSplit = splits[bestFeat][0]
        node = (bestFeat, bestSplit)

        print(splits)
        print(node)
        
        lhs = data[data[bestFeat] < bestSplit]
        lhs = branch(lhs, feats, tgt, pred, minInstances, tier, maxDepth)
        
        rhs = data[data[bestFeat] > bestSplit]
        rhs = branch(rhs, feats, tgt, pred, minInstances, tier, maxDepth)

        tree = {node:{}}
        tree[node] = (lhs, rhs)

        return tree

In [360]:
tree = branch(df, feats, 'num', minInstances=5, maxDepth=None)

Tier: 0, Features: ['age', 'trestbps', 'chol']
{'age': (54.5, 1.1846860027094008), 'trestbps': (142.5, 1.208213493978743), 'chol': (279.5, 1.2116188917910682)}
('age', 54.5)
Tier: 1, Features: ['age', 'trestbps', 'chol']
{'age': (47.0, 1.0459306530691959), 'trestbps': (108.5, 1.0389146900443207), 'chol': (272.0, 1.032682703657598)}
('chol', 272.0)
Tier: 2, Features: ['age', 'trestbps', 'chol']
{'age': (53.0, 0.9856254308197118), 'trestbps': (128.0, 0.9790332759914939), 'chol': (188.0, 0.9804185414512698)}
('trestbps', 128.0)
Tier: 3, Features: ['age', 'trestbps', 'chol']
{'age': (51.5, 1.0567221484178237), 'trestbps': (109.0, 1.0515194229831875), 'chol': (239.0, 1.0575095584461456)}
('trestbps', 109.0)
Tier: 4, Features: ['age', 'trestbps', 'chol']
{'age': (46.5, 0.25819888974716115), 'trestbps': (106.0, 0.25), 'chol': (235.0, 0.23570226039551584)}
('chol', 235.0)
Tier: 5, Features: ['age', 'trestbps', 'chol']
{'age': (42.5, 0.0), 'trestbps': (99.5, 0.0), 'chol': (201.5, 0.0)}
('age', 

In [361]:
pprint(tree)

{('age', 54.5): ({('chol', 272.0): ({('trestbps', 128.0): ({('trestbps', 109.0): ({('chol', 235.0): ({('age', 42.5): (0.0,
                                                                                                                      0.0)},
                                                                                                     0.3333333333333333)},
                                                                                  {('chol', 239.0): ({('age', 37.5): (0.2,
                                                                                                                      {('age', 40.5): (3.3333333333333335,
                                                                                                                                       {('age', 51.5): ({('chol', 198.0): ({('chol', 176.0): ({('trestbps', 115.0): (0.3333333333333333,
                                                                                                                      