In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, KFold, GroupKFold, TimeSeriesSplit,
    LeaveOneOut, cross_val_score, cross_validate,
    GridSearchCV
)

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer,
    PowerTransformer, QuantileTransformer, KBinsDiscretizer,
    OneHotEncoder, OrdinalEncoder, LabelEncoder,
    PolynomialFeatures
)

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import (
    SelectKBest, chi2, f_classif, f_regression, mutual_info_classif,
    SelectFromModel, RFE, RFECV, VarianceThreshold
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import (
    PCA, TruncatedSVD, NMF, FastICA, SparsePCA, IncrementalPCA
)
from sklearn.manifold import TSNE
from sklearn.cluster import FeatureAgglomeration
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, Lars, LassoLars,
    SGDRegressor, SGDClassifier, LogisticRegression,
    PassiveAggressiveClassifier, PassiveAggressiveRegressor,
    HuberRegressor, RANSACRegressor, TheilSenRegressor,
    Perceptron, QuantileRegressor
)

from sklearn.svm import (
    SVC, SVR, LinearSVC, LinearSVR, OneClassSVM, NuSVC, NuSVR
)

from sklearn.neighbors import (
    KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors,
    RadiusNeighborsClassifier, RadiusNeighborsRegressor,
    LocalOutlierFactor, NeighborhoodComponentsAnalysis
)

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    HistGradientBoostingClassifier, HistGradientBoostingRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    BaggingClassifier, BaggingRegressor,
    VotingClassifier, VotingRegressor,
    StackingClassifier, StackingRegressor,
    IsolationForest
)

from sklearn.cluster import (
    KMeans, MiniBatchKMeans, DBSCAN, OPTICS, MeanShift,
    AgglomerativeClustering, SpectralClustering,
    AffinityPropagation, Birch
)
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.semi_supervised import LabelPropagation, LabelSpreading, SelfTrainingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score, fbeta_score,
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score,
    silhouette_score, davies_bouldin_score,
    mean_squared_error, mean_absolute_error, r2_score,
    log_loss, brier_score_loss
)
from sklearn.utils import class_weight, resample, shuffle
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN

In [6]:
# Демонстрационный вариант
import numpy as np
import pandas as pd
from sklearn.datasets import (
    load_diabetes,
    load_breast_cancer,
    load_wine,
    load_digits,
)

# 1. Linear regression on the diabetes dataset
diabetes = load_diabetes()
X_d = diabetes.data
y_d = diabetes.target
# Hold out a quarter of the samples; the fixed seed keeps the split repeatable.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d, y_d, random_state=42, test_size=0.25
)

# Fit on the training part only, then score R² on the held-out part.
linreg = LinearRegression().fit(X_train_d, y_train_d)
r2 = r2_score(y_true=y_test_d, y_pred=linreg.predict(X_test_d))
print("1) R² (diabetes) =", round(r2, 3))

# 2. F1-score of a k-nearest-neighbours classifier on the breast cancer dataset
bc = load_breast_cancer()
X_bc = bc.data
y_bc = bc.target
# Stratified 75/25 split with a fixed seed; this split is reused by later steps.
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(
    X_bc, y_bc, stratify=y_bc, random_state=42, test_size=0.25
)

# The features are passed to KNN exactly as loaded (no scaling step here).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_bc, y_train_bc)
f1 = f1_score(y_true=y_test_bc, y_pred=knn.predict(X_test_bc))
print("2) F1 (breast cancer, KNN) =", round(f1, 3))

# 3. ROC AUC of logistic regression on the breast cancer split made above
logreg = LogisticRegression(max_iter=10_000).fit(X_train_bc, y_train_bc)
# ROC AUC needs scores rather than hard labels: take the second probability column.
proba = logreg.predict_proba(X_test_bc)[:, 1]
roc = roc_auc_score(y_true=y_test_bc, y_score=proba)
print("3) ROC AUC (breast cancer, LogReg) =", round(roc, 3))

# 4. SelectKBest on the wine dataset (k = 3)
wine = load_wine()
X_w = wine.data
y_w = wine.target
# Score every feature with f_classif on the full dataset and keep the top three.
skb = SelectKBest(score_func=f_classif, k=3)
skb.fit(X_w, y_w)
# Turn the selected column indices back into human-readable feature names.
sel_idx = skb.get_support(indices=True)
sel_features = np.asarray(wine.feature_names)[sel_idx].tolist()
print("4) Лучшие признаки (wine) =", ", ".join(sel_features))

# 5. Pipeline: StandardScaler → PCA(3) → LogReg on the wine dataset
# Stratified 75/25 split of the wine data loaded in the previous step.
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_w, y_w, stratify=y_w, random_state=42, test_size=0.25
)

# All three steps are fitted on the training part only via the pipeline.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=3)),
        ("logreg", LogisticRegression(max_iter=10_000)),
    ]
).fit(X_train_w, y_train_w)
acc = accuracy_score(y_true=y_test_w, y_pred=pipe.predict(X_test_w))
print("5) Accuracy (wine pipeline) =", round(acc, 3))

# 6. Random forest tuned with GridSearchCV on the breast cancer training split
# 2 x 2 grid: number of trees and maximum tree depth.
param_grid = dict(n_estimators=[10, 100], max_depth=[3, 5])
rf = RandomForestClassifier(random_state=42)
# 5-fold cross-validation over the grid, using all available CPU cores.
gs = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, cv=5)
gs.fit(X_train_bc, y_train_bc)
print("6) Лучший mean CV-score (RF) =", round(gs.best_score_, 3))


# 7. Clustering the digits dataset: PCA(2) → KMeans(10)
digits = load_digits()
X_digits = digits.data

# Project the pixel features onto two principal components before clustering.
pca2 = PCA(n_components=2, random_state=42)
X_digits_pca = pca2.fit_transform(X_digits)

# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — confirm the installed version before comparing counts.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_digits_pca)
labels = kmeans.labels_
# Count how many samples were assigned to the cluster labelled 0.
cluster0 = (labels == 0).sum()
print("7) Число точек в кластере 0 (digits) =", cluster0)

1) R² (diabetes) = 0.485
2) F1 (breast cancer, KNN) = 0.945
3) ROC AUC (breast cancer, LogReg) = 0.996
4) Лучшие признаки (wine) = flavanoids, od280/od315_of_diluted_wines, proline
5) Accuracy (wine pipeline) = 0.956
6) Лучший mean CV-score (RF) = 0.955
7) Число точек в кластере 0 (digits) = 228
