In [2]:
import sklearn

# 1. FIT AND PREDICT ESTIMATORS

Scikit-learn provides dozens of ML algorithms, called estimators.
Each estimator can be fitted to some data.

- .fit() and .predict() methods

- The `set_config` function from sklearn (used below) only controls how estimators are displayed in the notebook

In [9]:
# Configure scikit-learn's global display option so estimators are shown
# using the "diagram" representation.
from sklearn import set_config
set_config(display="diagram")

## A) Fit

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Toy training set: two samples with three features each, one label per sample.
X = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]

# random_state is fixed so the fitted forest is reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

## B) Predict

In [23]:
# Predict class labels for the same samples the model was fitted on.
model.predict(X)

array([0, 1])

In [27]:
# Predict class labels for two new samples that were not part of the training data.
model.predict([[15, 54, 78], [23, 100, 677]])

array([1, 1])

In [44]:
# Another pair of new samples; the second one has values far larger than any training value.
model.predict([[12, 54, 75], [908, 1002, 4567]])

array([1, 1])

## C) Predict class probabilities 
- Use .predict_proba

In [40]:
# Per-class probability estimates for two new samples (one row per sample).
model.predict_proba([[15, 54, 78], [23, 100, 677]])

array([[0.26, 0.74],
       [0.26, 0.74]])

In [42]:
# Per-class probability estimates for a second pair of new samples.
model.predict_proba([[12, 54, 75], [908, 1002, 4567]])

array([[0.26, 0.74],
       [0.26, 0.74]])

In [46]:
# Per-class probability estimates for the training samples themselves.
model.predict_proba(X)

array([[0.78, 0.22],
       [0.26, 0.74]])

## D) Logistic regression example

In [49]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the iris features and labels as plain arrays (not a Bunch object).
X, y = load_iris(return_X_y=True)

# Build the classifier and fit it in one chained expression; the iteration
# limit is set explicitly to 1000 and the seed is fixed.
clf = LogisticRegression(
    max_iter=1000,
    random_state=0,
).fit(X, y)

In [53]:
# Predict the class of the first two iris samples.
clf.predict(X[:2, :])

array([0, 0])

In [55]:
# Class probabilities for the first 100 samples (one column per class).
# NOTE(review): this prints 100 rows; a smaller slice would keep the output readable.
clf.predict_proba(X[:100, :])

array([[9.81543634e-01, 1.84563518e-02, 1.44999548e-08],
       [9.71309564e-01, 2.86904057e-02, 3.01876205e-08],
       [9.85258131e-01, 1.47418565e-02, 1.23332643e-08],
       [9.76054632e-01, 2.39453281e-02, 3.96624007e-08],
       [9.85202027e-01, 1.47979605e-02, 1.20019954e-08],
       [9.70125413e-01, 2.98745134e-02, 7.40270794e-08],
       [9.86755390e-01, 1.32445903e-02, 1.99707325e-08],
       [9.76109658e-01, 2.38903145e-02, 2.77234361e-08],
       [9.79629088e-01, 2.03708818e-02, 3.05698294e-08],
       [9.68744393e-01, 3.12555758e-02, 3.17052457e-08],
       [9.76164556e-01, 2.38354241e-02, 1.93782725e-08],
       [9.75183658e-01, 2.48162981e-02, 4.39134333e-08],
       [9.74216253e-01, 2.57837253e-02, 2.15029380e-08],
       [9.91870803e-01, 8.12919274e-03, 3.88627829e-09],
       [9.87959537e-01, 1.20404599e-02, 2.84785852e-09],
       [9.86584253e-01, 1.34157341e-02, 1.29530873e-08],
       [9.87917304e-01, 1.20826869e-02, 9.27558969e-09],
       [9.81290679e-01, 1.87093

In [61]:
# Feature values of the first 100 samples.
X[0:100]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [63]:
# Labels of the first 100 samples.
y[0:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [57]:
# Score the classifier on the same data it was fitted on (X, y), so this is
# a training score, not an estimate of performance on unseen data.
clf.score(X, y)

0.9733333333333334

# 2. Transformers and pre-processors

They follow the same API model as the estimator objects

The **transform** method puts out a newly transformed matrix X.

Scaling is like converting measurements to the same units.
When we scale data using a transformer like **StandardScaler**, each feature is standardized to have **zero mean and unit variance**. This changes the scale of the numbers, not the shape of their distribution (it does not turn the data into a normal distribution).

Many machine learning algorithms work better when the features are on a comparable scale. It helps the algorithms learn patterns more effectively, leading to better predictions.

## A) Transforming
- Using **.transform()**

In [83]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Six samples of a single feature; reshape(-1, 1) turns the flat sequence
# into a column (one row per sample).
X = np.arange(1, 7).reshape(-1, 1)

# First learn the scaling values from X, then apply them to X.
Fx = StandardScaler()
Fx.fit(X)
Fx.transform(X)

array([[-1.46385011],
       [-0.87831007],
       [-0.29277002],
       [ 0.29277002],
       [ 0.87831007],
       [ 1.46385011]])

### i) Using the "ColumnTransformer", designed for when you are working with dataframes

In [128]:
import pandas as pd

# Small movie table: two text columns (city, title) and two numeric rating columns.
X = pd.DataFrame(
    [
        ("New York", "The Great Escape", 8.5, 8.8),
        ("Los Angeles", "City of Stars", 7.8, 7.6),
        ("Chicago", "Windy Days", 8.0, 7.9),
        ("Houston", "Lone Ranger", 7.2, 7.4),
    ],
    columns=["city", "title", "expert_rating", "user_rating"],
)
X

Unnamed: 0,city,title,expert_rating,user_rating
0,New York,The Great Escape,8.5,8.8
1,Los Angeles,City of Stars,7.8,7.6
2,Chicago,Windy Days,8.0,7.9
3,Houston,Lone Ranger,7.2,7.4


## B) Preprocessing Steps:

- Impute missing values (e.g., fill missing ages with the median).
- Scale numerical features (e.g., prices between 0 and 1).
- Encode categorical features (e.g., convert "Male/Female" to 0/1).

In [130]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "city" and build a bag-of-words from "title"; all other
# columns are dropped. Output names are not prefixed with the step name.
column_trans = ColumnTransformer(
    transformers=[
        # Selected with a list of column names.
        ("categories", OneHotEncoder(dtype="int"), ["city"]),
        # Selected with a single column name given as a plain string.
        ("title_bow", CountVectorizer(), "title"),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [132]:
# Fit the column transformer on the DataFrame X.
column_trans.fit(X)

In [140]:
# Names of the output columns produced by the fitted transformer.
column_trans.get_feature_names_out()

array(['city_Chicago', 'city_Houston', 'city_Los Angeles',
       'city_New York', 'city', 'days', 'escape', 'great', 'lone', 'of',
       'ranger', 'stars', 'the', 'windy'], dtype=object)

In [134]:
# Apply the fitted transformation to X; .toarray() converts the result to a dense array for display.
column_trans.transform(X).toarray()

array([[0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [136]:
# Display the fitted ColumnTransformer object.
column_trans

# 3. Pipelining

- Combining **transformers and estimators (predictors)** into a single object: a **PIPELINE**

In [109]:
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## A) Create a pipeline object

- All estimators in a pipeline, except the last one, must be **transformers**
- Only the last one can be any type of estimator: a **transformer**, a **classifier**, etc.

In [114]:
# Two scaling transformers run in sequence, followed by the final classifier.
pipe = make_pipeline(StandardScaler(), MinMaxScaler(), LogisticRegression())

Loading the iris dataset and splitting it into train and test sets

In [142]:
# Reload iris as (features, labels), then hold out a test split.
# random_state=0 keeps the split identical between runs.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

Fitting the whole pipeline

In [120]:
# Fit every step of the pipeline on the training split only.
pipe.fit(X_train, y_train)

In [122]:
# The fitted pipeline is used like any other estimator.
# accuracy_score expects its arguments as (y_true, y_pred).
# NOTE: this measures accuracy on the *training* split; score against
# X_test / y_test instead to estimate performance on unseen data.
accuracy_score(y_train, pipe.predict(X_train))

0.9285714285714286

# 4. Model Evaluation

Fitting a model to data does not mean it will be perfect at predictions. It needs to be fine-tuned.

- Fine-tuning refers to the process of optimizing a model's parameters and hyperparameters to improve its performance. This can involve techniques such as:
   - Cross-validation
   - Grid search
   - Random search to find the best combination of settings.

Fine-tuning is essential to enhance the model's predictive capabilities and ensure it generalizes well to new data.

In [150]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# Unfitted linear model that will be cross-validated.
lr = LinearRegression()

# Synthetic regression problem with 100 samples; seeded for reproducibility.
X, y = make_regression(n_samples=100, random_state=0)

In [156]:
# Cross-validate the linear model on (X, y) using cross_validate's default
# settings; the returned mapping is kept in `result` for later inspection.
result = cross_validate(lr, X, y)
# The cell's displayed value is the feature matrix itself, not the CV result.
X

array([[-1.5415874 ,  0.22739278, -1.35338886, ...,  0.86727663,
         0.40520408, -0.06964158],
       [-1.02125401,  0.39232256,  0.42971434, ..., -0.75693055,
         1.5038265 , -0.036413  ],
       [ 0.65880214, -0.65276145, -1.18488744, ..., -0.91798431,
        -0.04648116, -1.21868383],
       ...,
       [-0.09834549, -1.2550679 , -0.04217711, ..., -0.41783444,
         0.31709817,  0.60367669],
       [ 0.32099947, -1.12658943, -0.33945432, ...,  1.60286363,
         0.27623985, -0.67832984],
       [ 1.83708069, -0.00296475,  1.80144921, ...,  0.11230769,
         0.33109242,  1.51848293]])

In [158]:
# Display the regression targets.
y

array([ 5.89981499e+01, -1.51472301e+02,  3.92774675e+00,  1.30275835e+02,
        2.04728060e+00,  1.01587138e+02, -1.81389163e+02, -1.99827729e+02,
        2.36839440e+02,  3.00206479e+02,  2.71801441e+01,  2.76530071e+01,
        4.44317863e+01, -2.97416896e+01, -1.65646056e+02,  1.06302533e+02,
       -2.38325493e+02, -1.27217684e+02, -1.49696870e+02,  3.91961166e+01,
       -3.38549645e+00,  1.44280015e+02, -6.77946699e+01, -8.78067763e+01,
        2.59043702e+02, -1.98514664e+02,  1.24378523e+01, -7.03056179e+01,
        1.53948181e+02, -1.12712395e+02,  6.01916320e+01, -1.02364589e+02,
        8.02760399e+01, -2.54989466e+02, -2.65026472e+01,  8.45554208e+01,
       -7.86065692e+01,  6.59383309e+00, -8.40267190e+01, -1.88440481e+02,
       -1.05322661e+02, -1.48609151e+02,  3.19325058e+02, -1.03811985e+01,
       -1.66106829e+02, -1.92127360e+00, -1.66459184e+02,  2.22568255e+02,
       -1.90733117e+02, -4.47361111e+00, -1.40862199e+02, -1.04998874e+02,
       -1.55406951e+02,  

In [160]:
# Test scores stored under the 'test_score' key of the cross-validation result.
result['test_score']

array([0.77550683, 0.62690015, 0.88820809, 0.69697641, 0.51431833])

# 5. Automated Parameter Search

All estimators have parameters (often called hyper-parameters) that can be fine-tuned.
-> The generalization power of an estimator often critically depends on a few parameters.

### Example
**RandomForestRegressor** has:
- **n_estimators** parameter that determines the number of trees in the forest
- **max_depth** parameter that determines depth of each tree

Quite often it is not clear what the exact values of these parameters should be since they depend on the data at hand.

In [163]:
from scipy.stats import randint
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Load the housing data and hold out a test split (seeded for reproducibility).
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Search space: each candidate draws its values from these integer distributions.
# scipy's randint(low, high) excludes `high`, so n_estimators ranges over 1-4
# and max_depth over 5-9.
param_distributions = {
    "n_estimators": randint(1, 5),
    "max_depth": randint(5, 10),
}

# Randomized search: evaluate 5 sampled parameter combinations, with both the
# regressor and the sampling seeded.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)

search.fit(X_train, y_train)

In [165]:
# Parameter combination selected by the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [167]:
# Score the fitted search object on the held-out test split.
search.score(X_test, y_test)

0.735363411343253

In [169]:
# Score the selected best estimator directly on the same held-out test split.
search.best_estimator_.score(X_test, y_test)

0.735363411343253

____
____
# THE END
____
____