<a href="https://colab.research.google.com/github/ArshockAbedan/scikit-learn/blob/main/Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from sklearn.ensemble import RandomForestClassifier
# Toy training set: 2 samples (rows), 3 features (columns) each.
X = [
    [1, 2, 3],
    [11, 12, 13],
]
# Class label of each sample, in the same order as the rows of X.
y = [0, 1]

# random_state=0 pins the estimator's randomness so reruns give the same fit.
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(random_state=0)

In [5]:
# Predict the classes of the same samples the classifier was fitted on.
clf.predict(X)

array([0, 1])

In [6]:
# Predict the classes of two new samples (3 features each).
clf.predict([
    [4, 5, 6],
    [14, 15, 16],
])

array([0, 1])

Transformers and pre-processors

In [8]:
from sklearn.preprocessing import StandardScaler
# Two samples with two features each.
X = [[0, 15], [1, -10]]

# Compute the scaling values from X, then scale that same data with them.
(
    StandardScaler()
    .fit(X)
    .transform(X)
)

array([[-1.,  1.],
       [ 1., -1.]])

Pipelines: chaining pre-processors and estimators

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import  LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Load the iris dataset and split it into train and test sets.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Chain the scaler and the classifier into a single pipeline object.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Fit the whole pipeline on the training split only.
pipe.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [26]:
# The fitted pipeline can now be used like any other estimator.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels are passed first and the predictions second. Accuracy is
# symmetric, so the value is unchanged, but the swapped order would give wrong
# results if copied to asymmetric metrics.
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

Model evaluation

In [34]:
from sklearn.datasets import make_regression
from sklearn.linear_model import  LinearRegression
from sklearn.model_selection import cross_validate

# Synthetic regression problem with 1000 samples; seeded for reproducibility.
X, y = make_regression(n_samples=1000, random_state=0)
lr = LinearRegression()

# No cv argument is passed, so cross_validate defaults to 5-fold CV.
result = cross_validate(estimator=lr, X=X, y=y)
# Show only the per-fold test scores.
result['test_score']

array([1., 1., 1., 1., 1.])

In [33]:
# Display the full dict returned by cross_validate: per-fold fit times,
# score times and test scores.
result

{'fit_time': array([0.03953314, 0.01855278, 0.01960015, 0.01883054, 0.01907635]),
 'score_time': array([0.00074077, 0.00076795, 0.00073838, 0.00071812, 0.0007391 ]),
 'test_score': array([1., 1., 1., 1., 1.])}

Automatic parameter searches

In [50]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# Fetch the California housing data and split it into train and test sets.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Parameter space to sample from. scipy's randint(low, high) draws integers
# with low inclusive and high exclusive.
param_distributions = {
    'n_estimators': randint(8, 15),
    'max_depth': randint(10, 15),
}

# Build the randomized search (10 sampled candidates) and fit it on the
# training split; both the forest and the sampling are seeded.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=10,
    random_state=0,
)
search.fit(X_train, y_train)


RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f083260e810>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f083260e8d0>},
                   random_state=0)

In [51]:
# Parameter combination selected by the fitted randomized search.
search.best_params_

{'max_depth': 14, 'n_estimators': 13}

In [52]:
# The search object now acts like a normal random forest estimator
# with max_depth=14 and n_estimators=13 (the best parameters found above),
# so it can be scored directly on the held-out test split.
search.score(X_test, y_test)

0.7758366858623206