[Reference](https://medium.com/geekculture/scikit-learn-0-24-top-5-new-features-you-need-to-know-7af15d8cdeac)

In [1]:
pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.3 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0


# 1.Mean Absolute Percentage Error (MAPE)


In [2]:
from sklearn.metrics import mean_absolute_percentage_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
print(mean_absolute_percentage_error(y_true, y_pred))

0.3273809523809524


# 2. OneHotEncoder Supports Missing Values


In [3]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [4]:
# intialise data of lists.
data = {'education_level':['primary', 'secondary', 'bachelor', np.nan,'masters',np.nan]}
  
# Create DataFrame
df = pd.DataFrame(data)
  
# Print the output.
print(df)

  education_level
0         primary
1       secondary
2        bachelor
3             NaN
4         masters
5             NaN


In [5]:
enc = OneHotEncoder()
enc.fit_transform(df).toarray()

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

# 3.New method for Feature Selection


In [6]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

In [7]:
X, y = load_iris(return_X_y=True, as_frame=True)
feature_names = X.columns

In [8]:
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, n_features_to_select=2,direction='backward')
sfs.fit(X,y)
print("Features selected by backward sequential selection: "
      f"{feature_names[sfs.get_support()].tolist()}")

Features selected by backward sequential selection: ['petal length (cm)', 'petal width (cm)']


# 4.New Methods for Hyper-Parameters Tuning


In [9]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint

In [10]:
X, y = make_classification(n_samples=1000)

In [11]:
clf = RandomForestClassifier(n_estimators=20)

In [12]:
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [13]:
rsh = HalvingRandomSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    cv = 5,
    factor=2,
    min_resources = 20)

In [14]:
rsh.fit(X,y)

HalvingRandomSearchCV(estimator=RandomForestClassifier(n_estimators=20),
                      factor=2, min_resources=20,
                      param_distributions={'bootstrap': [True, False],
                                           'criterion': ['gini', 'entropy'],
                                           'max_depth': [3, None],
                                           'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f323a79ad50>,
                                           'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f323a79ae10>},
                      refit=<function _refit_callable at 0x7f323941f7a0>)

In [15]:
print(rsh.n_iterations_ )

6


In [16]:
print(rsh.n_candidates_ )

[50, 25, 13, 7, 4, 2]


In [17]:
print(rsh.n_resources_)

[20, 40, 80, 160, 320, 640]


In [18]:
print(rsh.best_params_)

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 5, 'min_samples_split': 3}


# 5. New self-training meta-estimator for semi-supervised learning

In [19]:
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

In [20]:
rng = np.random.RandomState(42)
iris = datasets.load_iris()
random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
iris.target[random_unlabeled_points] = -1

In [21]:
svc = SVC(probability=True, gamma="auto")

In [22]:
self_training_model = SelfTrainingClassifier(base_estimator=svc)

In [23]:
self_training_model.fit(iris.data, iris.target)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True))