### Based on [Scikit-Learn cheat sheet](http://datacamp-community-prod.s3.amazonaws.com/eb807da5-dce5-4b97-a54d-74e89f14266b)

In [88]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import *
from sklearn.impute import SimpleImputer

## Loading data

In [12]:
# Load toy dataset
iris_X, iris_y = load_iris(return_X_y=True)

In [14]:
iris_X.shape # training features

(150, 4)

In [15]:
iris_y.shape # target labels

(150,)

## Data preprocessing

In [82]:
# Standardization
scaler = StandardScaler().fit(X_train)
X_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [83]:
# Normalization
normalizer = Normalizer().fit(X_train)
X_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [85]:
# Binarization
binarizer = Binarizer(threshold=0.0).fit(iris_X)
X_bin = binarizer.transform(iris_X)

In [86]:
# Label encoding
enc = LabelEncoder()
enc.fit_transform(iris_y)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Impute missing values
imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)

In [None]:
# Generate polynomial features
poly = PolynomialFeatures(degree=4)
poly.fit_transform(X_train)

## Train-test splits

In [17]:
# Automatic train-test split
X_train, X_test, y_train, y_test = train_test_split(iris_X, 
                                                    iris_y,
                                                    train_size=0.8,
                                                    test_size=0.2)

## Creating a model

In [19]:
# Initialize supervised model: logistic regression
logRegModel = LogisticRegression()

In [20]:
# Initialize supervised model: linear regression
linRegModel = LinearRegression()

In [21]:
# Initialize supervised model: support vector machine
svcModel = SVC(kernel='linear')

In [22]:
# Initialize supervised model: naive Bayes
gnbModel = GaussianNB()

In [23]:
# Initialize supervised model: K-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=6)

In [24]:
# Initialize unsupervised model: PCA
pca = PCA(n_components=0.95)

In [25]:
# Initialize unsupervised model: K means
kMeansModel = KMeans(n_clusters=4)

## Basic model fitting

In [26]:
# Fit logistic regression model
logRegModel.fit(X_train, y_train)

LogisticRegression()

In [27]:
# Fit SVM
svcModel.fit(X_train, y_train)

SVC(kernel='linear')

In [28]:
# Fit KNN model
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=6)

In [29]:
# Perform K-means clustering
kMeansModel.fit(X_train)

KMeans(n_clusters=4)

In [None]:
# Perform PCA decomposition
pca.fit_transform(X_train)

## Prediction

In [32]:
# Predict with logistic regression model
y_pred_logreg = logRegModel.predict(X_test)

In [33]:
# Predict with SVM model
y_pred_svc = svcModel.predict(X_test)

In [34]:
# Predict with KNN model
y_pred_knn = knn.predict(X_test)

In [36]:
# Predict with K-means model
y_pred_Kmeans = kMeansModel.predict(X_test)

## Evaluation

In [38]:
# Classification accuracy (estimator score)
knn.score(X_test, y_test)

0.9666666666666667

In [39]:
# Classification accuracy (extrinsic scoring function)
accuracy_score(y_test, y_pred_logreg)

0.9333333333333333

In [41]:
# Classification report (precision, recall, F1-score)
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      0.80      0.89        10
           2       0.86      1.00      0.92        12

    accuracy                           0.93        30
   macro avg       0.95      0.93      0.94        30
weighted avg       0.94      0.93      0.93        30



In [44]:
# Confusion matrix
print(confusion_matrix(y_test, y_pred_Kmeans))

[[0 8 0 0]
 [5 0 0 5]
 [0 0 5 7]
 [0 0 0 0]]


In [45]:
# R^2 score
r2_score(y_test, y_pred_logreg)

0.8972602739726028

In [46]:
# Mean squared error (MSE)
mean_squared_error(y_test, y_pred_knn)

0.03333333333333333

In [48]:
# Mean absolute error
mean_absolute_error(y_test, y_pred_svc)

0.06666666666666667

In [49]:
# Cluster metric: adjusted Rand index
adjusted_rand_score(y_test, y_pred_Kmeans)

0.47264652961218945

In [50]:
# Cluster metric: homoegeneity
homogeneity_score(y_test, y_pred_Kmeans)

0.7496496893257496

In [51]:
# Cluster metric: V-measure
v_measure_score(y_test, y_pred_Kmeans)

0.6775234461889525

In [59]:
# Cross-validation
print(cross_val_score(logRegModel, X_train, y_train, cv=5))

[0.95833333 1.         1.         1.         0.91666667]


## Hyperparameter tuning

In [77]:
# Grid search (with cross-validation)
params = {'tol': [0.0001, 0.0002, 0.0005],
          'C': [0.1, 0.8, 1],
          'intercept_scaling': [1, 1.5]}
grid = GridSearchCV(estimator=logRegModel,
                    param_grid=params)
grid.fit(X_train, y_train)
print(f'Best score:', grid.best_score_)
print(f'Best hyperparameters:', grid.best_estimator_)

Best score: 0.975
Best hyperparameters: LogisticRegression(C=0.8)


In [80]:
# Randomized parameter optimization (with cross-validation)
rsearch = RandomizedSearchCV(estimator=logRegModel,
                              param_distributions=params,
                              cv=5,
                              n_iter=10,
                              random_state=12)
rsearch.fit(X_train, y_train)
print('Best score:', rsearch.best_score_)
print('Best hyperparameters:', rsearch.best_estimator_)

Best score: 0.975
Best hyperparameters: LogisticRegression(C=0.8, tol=0.0002)
