In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Load the training set from disk.
data = pd.read_csv('data/train.csv')

# Separate the target column (y) from the feature matrix (X).
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model: a seeded random forest (n_jobs=-1 lets scikit-learn use all cores).
model = RandomForestClassifier(random_state=42, n_jobs=-1)

In [6]:
def _decode_one_hot(frame, prefix, n_levels):
    """Collapse the one-hot columns ``<prefix>1 .. <prefix>n_levels`` into one label column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the indicator columns.
    prefix : str
        Common column-name prefix; it is also used as the name of the new column.
    n_levels : int
        Number of indicator columns (they are numbered from 1).

    Returns
    -------
    pandas.DataFrame
        A new frame without the indicator columns and with a trailing ``prefix``
        column holding the zero-based index of the active indicator in each row.

    Raises
    ------
    ValueError
        If any row does not have exactly one non-zero indicator. Appending row by
        row would either fail later with a length mismatch or, worse, silently
        shift labels onto the wrong rows when a row with no flag and a row with
        two flags cancel each other out.
    """
    columns = [f"{prefix}{level}" for level in range(1, n_levels + 1)]
    flags = frame[columns].to_numpy() != 0

    active_per_row = flags.sum(axis=1)
    invalid_rows = int((active_per_row != 1).sum())
    if invalid_rows:
        raise ValueError(
            f"{invalid_rows} row(s) do not have exactly one active '{prefix}' indicator"
        )

    decoded = frame.drop(columns=columns)
    # With exactly one True per row, argmax is the position of the active indicator.
    decoded[prefix] = flags.argmax(axis=1)
    return decoded


# Label-encoded copy of the data: 4 wilderness-area flags and 40 soil-type flags
# are each replaced by a single integer column (appended in that order).
data_ = _decode_one_hot(data, "Wilderness_Area", 4)
data_ = _decode_one_hot(data_, "Soil_Type", 40)

In [7]:
# Feature matrix built from the label-encoded frame (target column removed).
X_new = data_.drop('Cover_Type', axis=1)

# Split the encoded features. This call previously received `X`, so the
# "new" train/test sets silently contained the original one-hot features.
# Test size and seed match the split of `X`; y_train / y_test are rebound here.
X_new_train, X_new_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42
)


In [18]:

# Fit the model on the training split, then record its accuracy on the
# held-out split (fit() returns the estimator itself, so the calls chain).
score = model.fit(X_train, y_train).score(X_test, y_test)


In [19]:

# 5-fold cross-validated accuracy of the model on the training split;
# n_jobs=-1 lets the folds be evaluated in parallel.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)



In [20]:

# Accuracy of the fitted model on the held-out split.
print(f'Test accuracy: {score:.4f}')

# Cross-validation summary: per-fold scores, then their mean / max / min.
fold_mean, fold_max, fold_min = cv_scores.mean(), cv_scores.max(), cv_scores.min()
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {fold_mean:.4f}')
print(f'Max CV accuracy: {fold_max:.4f}')
print(f'Min CV accuracy: {fold_min:.4f}')



Test accuracy: 0.8753
Cross-validation scores: [0.85289256 0.86771393 0.86027284 0.85407193 0.86068623]
Mean CV accuracy: 0.8591
Max CV accuracy: 0.8677
Min CV accuracy: 0.8529


In [21]:

# Candidate hyperparameter values for the random forest.
# NOTE(review): min_samples_leaf starts at 2, so scikit-learn's default of 1
# (used by the untuned baseline) is never tried - confirm this is intended.
param_grid = {
    'n_estimators': range(50, 201, 50),
    'max_depth': [10, 35, 60, 85, 110],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [2, 3, 4, 5],
}

# Randomized search: 10 sampled combinations, each scored by 5-fold CV accuracy.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)

# Best hyperparameter combination found.
print("Лучшие гиперпараметры:", random_search.best_params_)

# Evaluate the refit best estimator on the held-out split.
best_model = random_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Тестовая точность:", test_accuracy)


Лучшие гиперпараметры: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 60}
Тестовая точность: 0.8703703703703703


In [22]:
# Refit a fresh forest with the tuned hyperparameters.
# best_params_ only contains the searched keys, so the seed and parallelism
# must be passed explicitly; without random_state the refit (and every score
# derived from it) changes from run to run, unlike the other models in this
# notebook, which are seeded with 42.
best_model = RandomForestClassifier(
    random_state=42, n_jobs=-1, **random_search.best_params_
)
best_model.fit(X_train, y_train)

# Accuracy of the refit model on the held-out split.
test_accuracy = best_model.score(X_test, y_test)
print("Тестовая точность:", test_accuracy)

Тестовая точность: 0.8707010582010583


In [23]:
# 5-fold cross-validated accuracy of the tuned model on the training split.
cv_scores_best = cross_val_score(
    best_model, X_train, y_train, scoring='accuracy', cv=5
)

# Cross-validation summary: per-fold scores, then their mean / max / min.
tuned_mean, tuned_max, tuned_min = (
    cv_scores_best.mean(),
    cv_scores_best.max(),
    cv_scores_best.min(),
)
print(f'Cross-validation scores: {cv_scores_best}')
print(f'Mean CV accuracy: {tuned_mean:.4f}')
print(f'Max CV accuracy: {tuned_max:.4f}')
print(f'Min CV accuracy: {tuned_min:.4f}')


Cross-validation scores: [0.84338843 0.85985945 0.85241835 0.84125672 0.8495246 ]
Mean CV accuracy: 0.8493
Max CV accuracy: 0.8599
Min CV accuracy: 0.8413


In [24]:
# Score the current `best_model` on the held-out split and print it.
# cross_val_score fits clones, so `best_model` itself is still the estimator
# fitted earlier.
test_accuracy = best_model.score(X_test, y_test)
print("Тестовая точность:", test_accuracy)

Тестовая точность: 0.8707010582010583


In [30]:
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

# Hyperparameter search for a histogram-based gradient boosting classifier.
# Note: this cell rebinds `param_grid`, `model`, `random_search`, `best_model`
# and `test_accuracy`, replacing the random-forest objects of the same names.
param_grid = {
    'learning_rate': list(np.arange(0.01, 0.11, 0.02)),
    'max_iter': range(50, 301, 50),
    'max_leaf_nodes': range(16, 41, 4),
    'max_depth': [None] + list(range(10, 31, 5)),
}

# Seeded base estimator whose hyperparameters are searched.
model = HistGradientBoostingClassifier(random_state=42)

# Randomized search: 10 sampled combinations, each scored by 5-fold CV accuracy.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best hyperparameter combination found.
print("Лучшие гиперпараметры:", random_search.best_params_)

# Evaluate the refit best estimator on the held-out split.
best_model = random_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Тестовая точность:", test_accuracy)



Лучшие гиперпараметры: {'max_leaf_nodes': 40, 'max_iter': 250, 'max_depth': 15, 'learning_rate': np.float64(0.08999999999999998)}
Тестовая точность: 0.8839285714285714


In [13]:
from sklearn.svm import SVC

# Default-configuration baselines compared by 5-fold CV accuracy on the training split.
# The two tree ensembles are seeded (42, as elsewhere in this notebook) so the
# comparison is repeatable. SVC() without probability estimates does not use
# random_state, so it is left as is.
# NOTE(review): SVC receives the raw, unscaled features here; kernel SVMs are
# usually sensitive to feature scale - consider a StandardScaler pipeline.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC()
}

# The loop variable is deliberately not called `model`, so the notebook-level
# `model` used by other cells is not overwritten.
for name, candidate in models.items():
    scores = cross_val_score(candidate, X_train, y_train, cv=5)
    print(f"{name}: Средняя точность = {scores.mean()}")


RandomForest: Средняя точность = 0.8592102808687423
GradientBoosting: Средняя точность = 0.8073742991947359
SVC: Средняя точность = 0.5470402358737133


In [14]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble: averages the class probabilities of the three members.
# Every member is seeded (42, as elsewhere in this notebook); SVC needs it too,
# because probability=True fits its probability estimates with an internal,
# randomised cross-validation.
ensemble_model = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
], voting='soft')

# 5-fold cross-validated accuracy of the ensemble on the training split.
scores = cross_val_score(ensemble_model, X_train, y_train, cv=5)
print("Средняя точность ансамбля:", scores.mean())

Средняя точность ансамбля: 0.8278778882059727
