# Bagging, Random Forests, Boosting

Utilice la clase `regression_tree` de `regression_tree.py` para entrenar un arbol de decisión en los datos de `auto.csv`

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from regression_tree import regression_tree

In [2]:
auto_dataset = pd.read_csv('../data/auto.csv')

In [3]:
auto_dataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [107]:
train_dataset = auto_dataset[['mpg', 'cylinders', 'displacement', 'weight', 'acceleration', 'year', 'origin']]

In [133]:
np.random.seed(100)
rnd = np.random.rand(len(train_dataset))
train_df = train_dataset[rnd < 0.8]
test_df = train_dataset[rnd >= 0.8]
len(train_df), len(test_df)
# train_df

(317, 80)

Cree una función que regrese el error cuadrado medio del modelo.

In [11]:
def rms_error(y, yhat):
    return (y - yhat).apply(lambda x: x*x).mean()

In [12]:
auto_tree = regression_tree()
auto_tree.fit(train_df, 'mpg', alpha=0, min_points_per_leaf=1)

In [13]:
rms_error(train_df.mpg, auto_tree.predict(train_df))

1.2840036803364876

In [14]:
rms_error(test_df.mpg, auto_tree.predict(test_df))

10.357282118055554

In [15]:
# pd.DataFrame({'y':test_df.mpg, 'yhat': auto_tree.predict(test_df)})

# Bootstrap

Cree una función que genere muestras de datos con reemplazo.

In [16]:
def bootstrap_sample(df, N=None):
    if not N:
        N = len(df)
    selection = np.random.choice(df.index, size=N)
    return df.loc[selection]

In [17]:
# bootstrap_sample(auto_dataset).head()

# Random Forest
Cree una clase que tenga la misma interfaz de `regression_tree`, es decir, una función **fit** que entrene **B** arbóles de decisión, y una función **predict** que evalúe los arboles y regrese
$$ \frac{1}{B}\sum_{b=1}^B \hat f^b (x) $$

In [18]:
class random_forest(object):
    def __init__(self):
        self.trees = []

    def fit(self, df, y, B=10, predictors=None, alpha=1, min_points_per_leaf=5):
        if not predictors:
            predictors = list(df.columns)
            predictors.remove(y)
        self.trees = []
        for b in range(B):
            tree = regression_tree()
            df_new = bootstrap_sample(df)
            predictors_new = np.random.choice(predictors,
                                              size=int(np.sqrt(len(predictors))),
                                              replace=False)
            tree.fit(df_new, y, predictors=predictors, 
                                alpha=alpha,
                                min_points_per_leaf=min_points_per_leaf)
            self.trees.append(tree)

    def predict(self, df):
        prediction = pd.Series([0]*len(df), index=df.index)
        for tree in self.trees:
            prediction += tree.predict(df)
        return prediction/len(self.trees)

In [134]:
model = random_forest()
predictors = ['cylinders', 'displacement', 'weight', 'acceleration', 'year', 'origin']
model.fit(train_df, 'mpg', 3, predictors=predictors, alpha=0.5, min_points_per_leaf=50)

KeyboardInterrupt: 

In [20]:
yhat = model.predict(test_df)
yhat.head()

3     18.509841
7     14.733333
10    14.000000
15    19.850000
16    21.711111
dtype: float64

In [21]:
rms_error(train_df.mpg, model.predict(train_df))

3.1807236247916633

In [23]:
rms_error(test_df.mpg, model.predict(test_df))

7.97401357024754

# Boosted Decision Trees
Cree una clase que entrene arboles de decisión siguiendo el algoritmo de boosting:
1. Set $\hat f(x)$ and $r_i=y_i$ for all $i$ in the training set.
2. For $b = 1,2,...,B$, repeat:
   1. Fit a tree $\hat f^b$ with $d$ splits ($d+1$ terminal nodes) to the training data $(X, r)$
   2. Update $\hat f$ by adding in a shrunken verson fo the new tree: $\hat f(x) \leftarrow \hat f(x) + \lambda\hat f^b(x)$
   3. Update the residuals: $r_i \leftarrow r_i - \lambda\hat f^b(x_i)$
3. Output the boosted model:
    $$\hat f(x) = \sum_{b=1}^B \lambda\hat f^b(x)$$

In [129]:
from copy import deepcopy
class boosted_tree(object):
    def __init__(self):
        self.trees = []
    
    def fit(self, df, y, B=10, predictors=None, alpha=1,lambda_ = 1, min_points_per_leaf=5):
        repo = deepcopy(df.loc[:,y])
        self.lambda_ = lambda_
        if not predictors:
            predictors = list(df.columns)
            predictors.remove(y)
        self.trees = []  
        for b in range(B):
            b_tree = regression_tree()
            b_tree.fit(df, y, predictors, alpha, min_points_per_leaf)
            self.trees.append(b_tree)
            df.loc[:,y] = df.loc[:,y] - lambda_ * b_tree.predict(df)
        df.loc[:,y] = repo.values
#         print(df.loc[:,y].head())
            
    def predict(self, df):
        prediction = pd.Series([0]*len(df), index=df.index)
        for tree in self.trees:
            prediction += tree.predict(df)
        return self.lambda_ * prediction
        

In [158]:
model = boosted_tree()
predictors = ['cylinders', 'displacement', 'weight', 'acceleration', 'year', 'origin']
model.fit(train_df, 'mpg', 3, predictors, 0, 1, 50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


0    18.0
1    15.0
2    18.0
4    17.0
5    15.0
Name: mpg, dtype: float64


In [159]:
rms_error(train_df.mpg, model.predict(train_df))

0.13970141773019948

In [160]:
rms_error(test_df.mpg, model.predict(test_df))

10.138894819589606