In [83]:
import pandas as pd
import optuna
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load training dataset
train = pd.read_csv('/kaggle/input/forest-type-classification-spai/train.csv')

# Features and target
X = train.drop('nforest_type', axis=1)
y = train['nforest_type']

In [86]:
# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=40)

# Define the objective function for Optuna
def objective(trial):
    param = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
    }

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

In [87]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

[I 2024-06-03 13:42:07,118] A new study created in memory with name: no-name-1b62de6c-5ec3-4828-b227-7df1ccaa16da
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
[I 2024-06-03 13:42:08,396] Trial 0 finished with value: 0.6648793565683646 and parameters: {'max_depth': 15, 'learning_rate': 0.004563167694848605, 'n_estimators': 52, 'gamma': 1.4697287634363547e-07, 'min_child_weight': 10, 'subsample': 0.5322738270143645, 'colsample_bytree': 0.8025648630815294, 'alpha': 0.0012390461213570058, 'lambda': 6.0950422259769865e-05}. Best is trial 0 with value: 0.6648793565683646.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
  'gamma': tr

Best hyperparameters:  {'max_depth': 9, 'learning_rate': 0.053315078266350784, 'n_estimators': 391, 'gamma': 0.0017250476222867605, 'min_child_weight': 4, 'subsample': 0.8973067573902883, 'colsample_bytree': 0.984662961762766, 'alpha': 1.7769265331564559, 'lambda': 2.7743277275383336e-08}


In [89]:
study.best_params

{'max_depth': 9,
 'learning_rate': 0.053315078266350784,
 'n_estimators': 391,
 'gamma': 0.0017250476222867605,
 'min_child_weight': 4,
 'subsample': 0.8973067573902883,
 'colsample_bytree': 0.984662961762766,
 'alpha': 1.7769265331564559,
 'lambda': 2.7743277275383336e-08}

In [90]:
# Train the final model using the best hyperparameters
best_params = study.best_params
best_params['objective'] = 'multi:softmax'
best_params['num_class'] = 3

final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Load test dataset
test = pd.read_csv('/kaggle/input/forest-type-classification-spai/test.csv')

# Drop the 'id' column to match training data features
# X_test_final = test.drop('id', axis=1)

# Make predictions on the test set
test_predictions = final_model.predict(test)

# Convert numeric predictions back to original string labels
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Load sample submission file
sample_submission = pd.read_csv('/kaggle/input/forest-type-classification-spai/sample_submission.csv')

# Create submission DataFrame
submission = pd.DataFrame({'id': test['id'], 'nforest_type': test_predictions_labels})

In [91]:
submission

Unnamed: 0,id,nforest_type
0,13467,DEF
1,12719,MDF
2,1054,MDF
3,13747,DDF
4,9453,DEF
...,...,...
3995,115,MDF
3996,10654,MDF
3997,5718,DDF
3998,13054,MDF


In [92]:
# # Save the submission DataFrame to a CSV file
submission.to_csv('submission-XGBoost+Hparameter.csv', index=False)