In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [23]:
df = pd.read_csv('../data/housing.csv')

In [24]:
tree = RandomForestRegressor()

In [25]:
tree.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [26]:
from sklearn.model_selection import cross_val_score

In [27]:
X=df.iloc[:,:-1]
y=df['PRICE']

In [34]:
leaf_size = [1,5,10,15,25]
cross_val_scores = []

for size in leaf_size:
    tree.set_params(min_samples_leaf=size)
    scores = cross_val_score(estimator=tree, X=X,y=y,cv=10)
    cross_val_scores.append((scores,size))





In [35]:
cross_val_scores

[(array([ 0.6709722 ,  0.78512784,  0.35890556,  0.72041252,  0.80325851,
          0.7143548 ,  0.32880787,  0.31136097, -0.44745333,  0.22258762]), 1),
 (array([ 0.69564336,  0.7711569 , -0.66855512,  0.7629585 ,  0.81988442,
          0.66614418,  0.48173639,  0.35599331, -0.29769545,  0.29622589]), 5),
 (array([ 0.69048802,  0.75955492,  0.38783758,  0.79954242,  0.82161567,
          0.60155522,  0.44861862,  0.34100976, -0.1914158 , -0.04730108]),
  10),
 (array([ 0.65665015,  0.7716125 ,  0.09634803,  0.75369383,  0.79952719,
          0.47897399,  0.29318031,  0.34089097, -0.06888433,  0.18215342]),
  15),
 (array([ 0.71025362,  0.71228627,  0.45035646,  0.6337338 ,  0.68002259,
          0.37663835,  0.02620275,  0.28896712, -0.82718469, -0.15536633]),
  25)]

In [36]:
rf = RandomForestRegressor()

In [38]:
#important to tweak, max_features, min_samples_leaf
rf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
#the less the leaves the less variance. The more specific the model, the less generalizable they are

In [52]:
#the less samples per leaf and the more features allowed per leaf, the more overfit your model can become
max_features = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
cross_val_scores = []
rf.set_params(n_estimators = 40, min_samples_leaf = 5)

for size in max_features:
    rf.set_params(max_features = size)
    scores = cross_val_score(estimator=rf, X=X,y=y,cv=10)
    cross_val_scores.append((np.mean(scores),size))

In [53]:
cross_val_scores

[(0.4110186732769763, 0.2),
 (0.4780195312950122, 0.3),
 (0.5390554656655417, 0.4),
 (0.566949427611253, 0.5),
 (0.5290838505816107, 0.6),
 (0.5142960715280939, 0.7),
 (0.5094373828279648, 0.8),
 (0.48503675246796424, 0.9),
 (0.23554439191071003, 1)]

In [54]:
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()

In [55]:
lreg.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [56]:
lreg.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [57]:
from sklearn.linear_model import Ridge, Lasso

In [58]:
rreg= Ridge()

In [61]:
rreg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [62]:
lasso = Lasso()

In [63]:
lasso.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [83]:
alphas = np.logspace(-4,4,9)
rreg_scores = []
for alpha in alphas:
    rreg.set_params(alpha = alpha)
    scores = cross_val_score(estimator=rreg, X=X,y=y,cv=10)
    rreg_scores.append((np.mean(scores),alpha))

In [84]:
rreg_scores

[(0.20254353667974848, 0.0001),
 (0.2026742654697647, 0.001),
 (0.20396289560063754, 0.01),
 (0.2151894699944341, 0.1),
 (0.2561668703759328, 1.0),
 (0.27093538252517557, 10.0),
 (0.2723683530459981, 100.0),
 (0.19960184332575434, 1000.0),
 (0.04083886314371482, 10000.0)]

In [85]:
alphas = np.logspace(-4,4,9)
lasso_scores = []
for alpha in alphas:
    lasso.set_params(alpha = alpha)
    scores = cross_val_score(estimator=lasso, X=X,y=y,cv=10)
    lasso_scores.append((np.mean(scores),alpha))

In [86]:
lasso_scores

[(0.20285126580483662, 0.0001),
 (0.20571066984909261, 0.001),
 (0.23028015666320525, 0.01),
 (0.26432738934873734, 0.1),
 (0.19828974626177498, 1.0),
 (-0.0286293787901828, 10.0),
 (-0.8774829500063221, 100.0),
 (-1.2860830508551744, 1000.0),
 (-1.2860830508551744, 10000.0)]

In [87]:
lasso.set_params(alpha = 0.1)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [88]:
lasso.fit(X,y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [92]:
lasso.score(X,y)

0.7269834862602695