In [1]:
import os, zipfile, warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from skimage.feature import hog

# Seed value; it is also used to seed NumPy's global random generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Read the train/test images and labels while the .npz archive is open.
with np.load("mnist.npz") as archive:
    X_train_full, y_train_full = archive["x_train"], archive["y_train"]
    X_test, y_test = archive["x_test"], archive["y_test"]

# A 2-D image array is reshaped to (n, 28, 28); image arrays are then cast to uint8.
X_train_full = (
    X_train_full.reshape((-1, 28, 28)) if X_train_full.ndim == 2 else X_train_full
).astype(np.uint8)
X_test = (
    X_test.reshape((-1, 28, 28)) if X_test.ndim == 2 else X_test
).astype(np.uint8)

# Labels are cast to integer arrays.
y_train_full = y_train_full.astype(int)
y_test = y_test.astype(int)

In [3]:
def compute_hog_features(images, ppc=(4,4), cpb=(1,1)):
    """Compute one HOG descriptor per image and stack them into an array.

    Parameters
    ----------
    images : iterable
        Images to describe; each item is passed as-is to ``hog``.
    ppc : tuple
        Forwarded to ``hog`` as ``pixels_per_cell``.
    cpb : tuple
        Forwarded to ``hog`` as ``cells_per_block``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-image feature vectors, in input order.
    """
    descriptors = [
        hog(image, pixels_per_cell=ppc, cells_per_block=cpb, feature_vector=True)
        for image in tqdm(images, desc="HOG")  # progress bar labelled "HOG"
    ]
    return np.array(descriptors)

# Extract HOG descriptors for both splits using the function's default
# cell/block settings. Kept as two statements so the training features stay
# available even if the second call is interrupted.
X_train_hog = compute_hog_features(images=X_train_full)
X_test_hog = compute_hog_features(images=X_test)

HOG:  50%|█████     | 30280/60000 [00:11<00:11, 2619.19it/s]


KeyboardInterrupt: 

In [None]:
# Standardise the HOG features: the scaler is fitted on the training matrix
# and the same fitted transform is applied to the test matrix.
# StandardScaler comes from the notebook's import cell, so it is not
# re-imported here.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_hog)
X_test_scaled = scaler.transform(X_test_hog)

# A float n_components with the full SVD solver keeps as many components as
# are needed to explain that fraction (95%) of the variance.
# NOTE(review): scaler and PCA are fitted on every training row, including the
# rows later held out as X_val, so validation scores may be slightly
# optimistic. Fit them on X_tr only if an unbiased validation score matters.
pca = PCA(n_components=0.95, svd_solver="full", random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Hold out 15% of the training rows, stratified by label, for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_pca, y_train_full, test_size=0.15,
    stratify=y_train_full, random_state=RANDOM_STATE
)
# Two shuffled, stratified folds used by the hyper-parameter searches.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

# Hyper-parameters are tuned on a random subset of the training part.
# The size is capped at the number of available rows so sampling without
# replacement cannot fail on a smaller dataset.
subsample = min(8000, len(X_tr))
# A dedicated seeded generator (instead of the global NumPy RNG) makes this
# cell idempotent: re-running it selects the same rows every time.
idx = np.random.default_rng(RANDOM_STATE).choice(len(X_tr), subsample, replace=False)
X_sub, y_sub = X_tr[idx], y_tr[idx]

In [None]:
def run_random_search(pipe, params, X, y, n_iter=4, cv=cv, scoring="accuracy"):
    """Fit a ``RandomizedSearchCV`` for ``pipe`` over ``params`` and return it.

    Parameters
    ----------
    pipe : estimator
        Estimator (here a ``Pipeline``) whose hyper-parameters are searched.
    params : dict
        Parameter candidates handed to ``RandomizedSearchCV``.
    X, y : array-like
        Data the search is fitted on.
    n_iter : int
        Number of sampled parameter settings.
    cv : cross-validation splitter
        Defaults to the module-level ``cv`` object that exists when this
        function is defined (the default is bound at definition time).
    scoring : str
        Metric name passed to the search.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    search_options = dict(
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1,
    )
    search = RandomizedSearchCV(pipe, params, **search_options)
    search.fit(X, y)
    return search

In [None]:
# Fitted search objects, keyed by a short model name.
results = {}

print("RF...")
pipe_rf = Pipeline([("clf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))])
params_rf = {"clf__n_estimators": [100, 200], "clf__max_depth": [20, None]}
results["rf"] = run_random_search(pipe_rf, params_rf, X_sub, y_sub, n_iter=4)

print("GB...")
pipe_gb = Pipeline([("clf", GradientBoostingClassifier(random_state=RANDOM_STATE))])
params_gb = {"clf__n_estimators": [100, 200], "clf__learning_rate": [0.1, 0.05]}
results["gb"] = run_random_search(pipe_gb, params_gb, X_sub, y_sub, n_iter=4)

# XGBoost is an optional dependency. Only the import is guarded: an error
# raised by the search itself should surface instead of being reported as a
# missing package. The handler stays broad on purpose, because a broken
# install may fail at import time with something other than ImportError.
try:
    import xgboost as xgb
except Exception as e:
    print("XGBoost недоступен:", e)
else:
    print("XGB...")
    pipe_xgb = Pipeline([("clf", xgb.XGBClassifier(use_label_encoder=False, eval_metric="mlogloss",
                                                   random_state=RANDOM_STATE, n_jobs=-1))])
    params_xgb = {"clf__n_estimators": [100, 200], "clf__max_depth": [3, 5], "clf__learning_rate": [0.1, 0.05]}
    results["xgb"] = run_random_search(pipe_xgb, params_xgb, X_sub, y_sub, n_iter=4)

RF...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
GB...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


KeyboardInterrupt: 

In [None]:
# Best pipeline found by each completed search.
best_models = {name: rs.best_estimator_ for name, rs in results.items() if rs is not None}
if not best_models:
    # Fail with an explicit message rather than the opaque error max() would
    # raise on an empty dict further down.
    raise ValueError("no fitted searches in `results`; run the model-search cell first")

# Validation accuracy of every tuned model.
accs = {name: float(accuracy_score(y_val, model.predict(X_val))) for name, model in best_models.items()}
for name, acc in accs.items():
    print(f"{name}: val acc = {acc:.4f}")

best_name, best_acc = max(accs.items(), key=lambda kv: kv[1])
print("Лучший на валидации:", best_name, best_acc)

# Stack all tuned models under a logistic-regression meta-learner.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multi-class target
# is already fitted with the multinomial loss by default.
estimators = list(best_models.items())
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=2000, solver="lbfgs"),
    n_jobs=-1
)
stack.fit(X_tr, y_tr)
print("Stack val acc:", accuracy_score(y_val, stack.predict(X_val)))

In [None]:
# Score the stacked model on the test features.
y_test_pred = stack.predict(X_test_pca)
print("TEST acc:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

# Submission table: positional index of each test sample and its predicted label.
submission = pd.DataFrame({"index": np.arange(len(y_test_pred)), "label": y_test_pred})
submission.to_csv("submission.csv", index=False)

# Persist the fitted preprocessing steps together with the models.
os.makedirs("models", exist_ok=True)
artifacts = {"scaler": scaler, "pca": pca, "stack": stack, "best_models": best_models}
joblib.dump(artifacts, "models/pipeline.pkl")

# Bundle the submission and everything under models/ into one deflated zip.
with zipfile.ZipFile("seminar01.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as bundle:
    bundle.write("submission.csv")
    for folder, _, filenames in os.walk("models"):
        for filename in filenames:
            bundle.write(os.path.join(folder, filename))

print("seminar01.zip готов ✅ (отправь боту с caption=seminar01)")