In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("forest_covertype.csv")
data = data.drop_duplicates()

num_cols = data.select_dtypes(include=[np.number]).columns

for col in num_cols:
    if col == 'Cover_Type':
        continue
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    data = data[(data[col] >= lower) & (data[col] <= upper)]

corr = data.corr()

upper_triangle = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

to_drop = [
    column for column in upper_triangle.columns
    if any(abs(upper_triangle[column]) > 0.9)
]

data = data.drop(columns=to_drop)
obj_cols = data.select_dtypes(include=['object']).columns

for col in obj_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    
X = data.drop("Cover_Type", axis=1)
y = data["Cover_Type"]
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=7)),
    ('nb', GaussianNB())
]

meta_model = LogisticRegression(max_iter=500)

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_model,
    passthrough=True
)

stack_model.fit(X_train, y_train)
y_prob = stack_model.predict_proba(X_test)
rocauc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("Использованные удалённые коррелированные признаки:", to_drop)
print("ROC-AUC:", rocauc)

Использованные удалённые коррелированные признаки: ['Wilderness_Area4']
ROC-AUC: 0.951227905399138
