In [1]:
from sklearn.ensemble import RandomForestClassifier
# Toy training set: two samples with three features each, one class label per sample
X = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]

# Seed the forest so repeated runs build the same model, then train it
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(random_state=0)

In [2]:
# predict classes of the training data
clf.predict(X)

array([0, 1])

In [3]:
# predict classes of new samples that were not part of the training data
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
# Small toy matrix: two samples (rows) with two features (columns)
X = [[0, 15], [1, -10]]
X

[[0, 15], [1, -10]]

In [6]:
# scale data according to computed scaling values:
# fit() learns the scaling values from X, transform() then applies them to X
StandardScaler().fit(X).transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

# Pipelines: chaining pre-processors and estimators

Transformers and estimators (predictors) can be combined into a single unifying object: a `Pipeline`. The pipeline offers the same API as a regular estimator: it can be fitted and used for prediction with `fit` and `predict`. As we will see later, using a pipeline also helps protect you from data leakage, i.e. letting information from the test data leak into the training data.

In [7]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Chain the pre-processing step and the classifier into one estimator;
# steps run in the order given
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
pipeline

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [9]:
# load the iris dataset as a (features, labels) pair;
# the split into train and test sets is done in a later cell
X,y = load_iris(return_X_y=True)
X,y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [10]:
# Hold out part of the data for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
# fit the whole pipeline on the training split only,
# so the test split stays unseen until evaluation
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [13]:
# we can now use it like any other estimator;
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first
accuracy_score(y_test, pipeline.predict(X_test))

0.9736842105263158

# Model evaluation

In [14]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [15]:
# Synthetic regression problem with 1000 samples (seeded for reproducibility)
X, y = make_regression(n_samples=1000, random_state=0)
X, y

(array([[ 0.41929687, -1.5489299 ,  0.65218686, ..., -0.81368398,
         -2.03884275,  0.90000294],
        [-2.06947249,  0.72712806,  0.0975975 , ..., -0.35978104,
         -0.74513907, -0.55050613],
        [-0.37595997,  0.66414405,  1.02239232, ...,  0.50481546,
         -2.83201187, -0.79978614],
        ...,
        [-0.7719197 , -1.33667649, -0.72733814, ..., -0.59830311,
         -0.60986158,  1.69242973],
        [ 0.67198393, -1.50733364,  1.17622157, ...,  2.05921537,
         -1.11140442,  0.01787532],
        [ 1.10334268, -0.59531919, -0.29831814, ..., -0.89706521,
         -0.11546748, -1.299286  ]]),
 array([-2.04057963e+02, -3.76325761e+02,  2.37301930e+01,  1.12558992e+02,
         6.94873807e+01,  1.75741198e+02,  1.10343738e+02, -1.15760957e+02,
        -1.54916670e+02,  2.06125842e+02,  1.47533235e+02, -8.22008168e+01,
         1.98635479e+02, -2.04385872e+02,  5.73986966e+01,  1.50784658e+02,
        -1.83745007e+02,  1.62805337e+02,  8.92132007e+00, -2.6193545

In [16]:
# linear regression with default settings; the cell displays the (not yet fitted) estimator
lr = LinearRegression()
lr

LinearRegression()

In [17]:
# cross_validate defaults to 5-fold CV; the returned dict holds the per-fold
# fit times, score times and test scores
result = cross_validate(lr, X, y)
result

{'fit_time': array([0.03347349, 0.02396774, 0.01597857, 0.01597881, 0.01597929]),
 'score_time': array([0.        , 0.        , 0.00399375, 0.00399494, 0.        ]),
 'test_score': array([1., 1., 1., 1., 1.])}

In [18]:
# R² is 1.0 on every fold because the dataset is easy: make_regression was called with
# its default settings, which (per the scikit-learn docs) add no noise to the linear target
result['test_score']

array([1., 1., 1., 1., 1.])

# Automatic parameter searches

In [19]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [20]:
# load the California housing data as a (features, target) pair
X, y = fetch_california_housing(return_X_y = True)

In [21]:
# Hold out part of the data for the final evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [22]:
# Parameter space to sample from. scipy's randint excludes the upper bound,
# so n_estimators is drawn from 1..4 and max_depth from 5..9.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

In [23]:
# Build the randomized search object (it is fitted in a separate step):
# 5 parameter settings are sampled from param_distributions, with fixed seeds
# for both the search and the forest so results are reproducible
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)
search

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002AC7D636490>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002AC1D9FB2B0>},
                   random_state=0)

In [24]:
# run the search on the training split only; the test split is kept for the final evaluation
search.fit(X_train,y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002AC7D636490>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002AC1D9FB2B0>},
                   random_state=0)

In [25]:
# best parameter combination among the sampled candidates
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [27]:
# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4 (the best parameters found by the search);
# here it is scored on the held-out test split
search.score(X_test,y_test)

0.735363411343253