In [1]:
import optuna
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
train = pd.read_csv('train.csv')

In [3]:
# def NDVI(df):
#     NDVI_series = (df['b8'] - df['b4']) / (df['b8'] + df['b4'])
#     return NDVI_series
    
# def Moisture_index(df):
#     Moist = (df['b8_a'] - df['b11']) / (df['b8_a'] + df['b11'])
#     return Moist

def NDWI(df):
    NDWI_series = (df['b3'] - df['b8']) / (df['b3'] + df['b8'])
    return NDWI_series

def NDSI(df):
    NDSI_series = (df['b3'] - df['b11']) / (df['b3'] + df['b11'])
    return NDSI_series

def BSI(df):
    bsi_index = ((df['b11'] + df['b4']) - (df['b8'] + df['b2'])) / ((df['b11'] + df['b4']) + (df['b8'] + df['b2']))
    return bsi_index
    
def NBR1(df):
    return (df['b8'] - df['b11']) / (df['b8'] + df['b11'])

def NBR3(df):
    return (df['b8'] - df['b12']) / (df['b8'] + df['b12'])

def NBR4(df):
    return (df['b8_a'] - df['b12']) / (df['b8_a'] + df['b12'])

def AFRI1(df):
    return (df['b8'] - (0.66 * df['b11'])) / (df['b8'] + (0.66 * df['b11']))

def AFRI2(df):
    return (df['b8_a'] - (0.66 * df['b11'])) / (df['b8_a'] + (0.66 * df['b11']))

def AFRI3(df):
    return (df['b8'] - (0.66 * df['b12'])) / (df['b8'] + (0.66 * df['b12']))

def AFRI4(df):
    return (df['b8_a'] - (0.66 * df['b12'])) / (df['b8_a'] + (0.66 * df['b12']))

def BNDVI1(df):
    return (df['b8'] - df['b2']) / (df['b8'] + df['b2'])

def BNDVI2(df):
    return (df['b8_a'] - df['b2']) / (df['b8_a'] + df['b2'])

def BWDRVI1(df):
    return ((0.1 * df['b8']) - df['b2']) / ((0.1 * df['b8']) + df['b2'])

def BWDRVI2(df):
    return ((0.1 * df['b8_a']) - df['b2']) / ((0.1 * df['b8_a']) + df['b2'])

def NDVI1(df):
    return (df['b8'] - df['b4']) / (df['b8'] + df['b4'])

def NDVI2(df):
    return (df['b8_a'] - df['b4']) / (df['b8_a'] + df['b4'])

def WDRVI1(df):
    return ((0.1 * df['b8']) - df['b4']) / ((0.1 * df['b8']) + df['b4'])

def WDRVI2(df):
    return ((0.1 * df['b8_a']) - df['b4']) / ((0.1 * df['b8_a']) + df['b4'])

def SAVI1(df):
    return ((1 + 0.5) * (df['b8'] - df['b4'])) / (df['b8'] + df['b4'] + 0.5)

def SAVI2(df):
    return ((1 + 0.5) * (df['b8_a'] - df['b4'])) / (df['b8_a'] + df['b4'] + 0.5)

def GNDVI1(df):
    return (df['b8'] - df['b3']) / (df['b8'] + df['b3'])

def GNDVI2(df):
    return (df['b8_a'] - df['b3']) / (df['b8_a'] + df['b3'])

def NDRE1(df):
    return (df['b8'] - df['b5']) / (df['b8'] + df['b5'])

def NDRE2(df):
    return (df['b8'] - df['b6']) / (df['b8'] + df['b6'])

def NDRE3(df):
    return (df['b8'] - df['b7']) / (df['b8'] + df['b7'])

def NDRE4(df):
    return (df['b8_a'] - df['b5']) / (df['b8_a'] + df['b5'])

def NDRE5(df):
    return (df['b8_a'] - df['b6']) / (df['b8_a'] + df['b6'])

def NDRE6(df):
    return (df['b8_a'] - df['b7']) / (df['b8_a'] + df['b7'])

def VIgreen(df):
    return (df['b3'] - df['b4']) / (df['b3'] + df['b4'])

def CIgreen1(df):
    return (df['b8'] / df['b3']) - 1

def CIgreen2(df):
    return (df['b8_a'] / df['b3']) - 1

def CIrededge1(df):
    return (df['b8'] / df['b5']) - 1

def CIrededge2(df):
    return (df['b8'] / df['b6']) - 1

def CIrededge3(df):
    return (df['b8'] / df['b7']) - 1

def CIrededge4(df):
    return (df['b8_a'] / df['b5']) - 1

def CIrededge5(df):
    return (df['b8_a'] / df['b6']) - 1

def CIrededge6(df):
    return (df['b8_a'] / df['b7']) - 1

def CI(df):
    return (df['b4'] - df['b2']) / df['b4']

def CVI1(df):
    return df['b8'] * (df['b4'] / (df['b3']^2))

def CVI2(df):
    return df['b8_a'] * (df['b4'] / (df['b3']^2))

def CCCI1(df):
    return ((df['b8'] - df['b5']) / (df['b8'] + df['b5'])) / ((df['b8'] - df['b4']) / (df['b8'] + df['b4']))

def CCCI2(df):
    return ((df['b8'] - df['b6']) / (df['b8'] + df['b6'])) / ((df['b8'] - df['b4']) / (df['b8'] + df['b4']))

def CCCI3(df):
    return ((df['b8'] - df['b7']) / (df['b8'] + df['b7'])) / ((df['b8'] - df['b4']) / (df['b8'] + df['b4']))

def CCCI4(df):
    return ((df['b8_a'] - df['b5']) / (df['b8_a'] + df['b5'])) / ((df['b8_a'] - df['b4']) / (df['b8_a'] + df['b4']))

def CCCI5(df):
    return ((df['b8_a'] - df['b6']) / (df['b8_a'] + df['b6'])) / ((df['b8_a'] - df['b4']) / (df['b8_a'] + df['b4']))

def CCCI6(df):
    return ((df['b8_a'] - df['b7']) / (df['b8_a'] + df['b7'])) / ((df['b8_a'] - df['b4']) / (df['b8_a'] + df['b4']))

def EVI1(df):
    return 2.5 * ((df['b8'] - df['b4']) / (df['b8'] + (6 * df['b4']) - (7.5 * df['b2']) + 1))

def EVI2(df):
    return 2.5 * ((df['b8_a'] - df['b4']) / (df['b8_a'] + (6 * df['b4']) - (7.5 * df['b2']) + 1))

def GARI1(df):
    return (df['b8'] - (df['b3'] - (df['b2'] - df['b4']))) / (df['b8'] - (df['b3'] + (df['b2'] - df['b4'])))

def GARI2(df):
    return (df['b8_a'] - (df['b3'] - (df['b2'] - df['b4']))) / (df['b8_a'] - (df['b3'] + (df['b2'] - df['b4'])))

def GLI(df):
    return ((2 * df['b3']) - (df['b4'] + df['b2'])) / ((2 * df['b3']) + (df['b4'] + df['b2']))

def GBNDVI1(df):
    return (df['b8'] - (df['b3'] + df['b2'])) / (df['b8'] + (df['b3'] + df['b2']))

def GBNDVI1(df):
    return (df['b8_a'] - (df['b3'] + df['b2'])) / (df['b8_a'] + (df['b3'] + df['b2']))

def GRNDVI1(df):
    return (df['b8'] - (df['b3'] + df['b4'])) / (df['b8'] + (df['b3'] + df['b4']))

def GRNDVI2(df):
    return (df['b8_a'] - (df['b3'] + df['b4'])) / (df['b8_a'] + (df['b3'] + df['b4']))

def SLAVI1(df):
    return df['b8'] / (df['b4'] + df['b11'])

def SLAVI2(df):
    return df['b8_a'] / (df['b4'] + df['b11'])

def SLAVI3(df):
    return df['b8'] / (df['b4'] + df['b12'])

def SLAVI4(df):
    return df['b8_a'] / (df['b4'] + df['b12'])

In [4]:
# List of function names and function references
functions_name_list = [
    "NDWI", "NDSI", "BSI", 
    "NBR1", "NBR3", "NBR4", 
    "AFRI1", "AFRI2", "AFRI3", "AFRI4", 
    "BNDVI1", "BNDVI2", "BWDRVI1", "BWDRVI2", 
    "NDVI1", "NDVI2", "WDRVI1", "WDRVI2", "SAVI1", "SAVI2", "GNDVI1", 
    "GNDVI2", 
    "NDRE1", "NDRE2", "NDRE3", "NDRE4", "NDRE5", "NDRE6", "VIgreen", "CIgreen1", 
    "CIgreen2", "CIrededge1", "CIrededge2", "CIrededge3", "CIrededge4", "CIrededge5", 
    "CIrededge6", "CI", "CVI1", "CVI2", "CCCI1", "CCCI2", "CCCI3", "CCCI4", 
    "CCCI5", "CCCI6", "EVI1", "EVI2", "GARI1", "GARI2", 
    "GLI", "GBNDVI1", 
    "GBNDVI1", "GRNDVI1", "GRNDVI2", "SLAVI1", "SLAVI2", "SLAVI3", "SLAVI4"
]

functions_list = [
    NDWI, NDSI, BSI, 
    NBR1, NBR3, NBR4, 
    AFRI1, AFRI2, AFRI3, AFRI4, 
    BNDVI1, BNDVI2, BWDRVI1, BWDRVI2, 
    NDVI1, NDVI2, WDRVI1, WDRVI2, SAVI1, SAVI2, GNDVI1, 
    GNDVI2, 
    NDRE1, NDRE2, NDRE3, NDRE4, NDRE5, NDRE6, VIgreen, CIgreen1, 
    CIgreen2, CIrededge1, CIrededge2, CIrededge3, CIrededge4, CIrededge5, 
    CIrededge6, CI, CVI1, CVI2, CCCI1, CCCI2, CCCI3, CCCI4, 
    CCCI5, CCCI6, EVI1, EVI2, GARI1, GARI2, 
    GLI, GBNDVI1, 
    GBNDVI1, GRNDVI1, GRNDVI2, SLAVI1, SLAVI2, SLAVI3, SLAVI4
]

# Perform feature engineering
train_featureEng = train.copy()
for i, func in enumerate(functions_list):
    train_featureEng[functions_name_list[i]] = func(train_featureEng)

In [5]:
# Prepare data for training
X = train_featureEng.drop('nforest_type', axis=1)
y = train_featureEng['nforest_type']

# Encode target labels if necessary
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=40)

In [6]:
# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'multi:softmax',
        'num_class': 3,  # Number of classes
        'booster': 'gbtree',
        # 'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # 'gamma': trial.suggest_loguniform('gamma', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }
    xgb_clf = XGBClassifier(**params)
    
    # Cross-validation
    cv_scores = cross_val_score(xgb_clf, X_train, y_train, cv=3, scoring='accuracy')
    
    return np.mean(cv_scores)

In [None]:
"""
Accuracy: 0.71
Best hyperparameters: {'max_depth': 10, 
                       'alpha': 2.4693579223725726, 
                       'learning_rate': 0.04105650186992039, 
                       'n_estimators': 478, 
                       'subsample': 0.8487446825233986, 
                       'colsample_bytree': 0.9498799391621067}
"""

In [8]:
# Create the Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params

# Train the final model with the best hyperparameters
xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Best hyperparameters:', best_params)

[I 2024-06-06 02:46:01,323] A new study created in memory with name: no-name-9acfa3e7-bb4f-4f90-9500-4092410ec0a5
[I 2024-06-06 02:46:21,839] Trial 0 finished with value: 0.6773614234250299 and parameters: {'max_depth': 5, 'alpha': 6.349277526713798, 'learning_rate': 0.038814153258011366, 'n_estimators': 543, 'subsample': 0.8259102949972708, 'colsample_bytree': 0.6280710894270878, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.6773614234250299.
[I 2024-06-06 02:46:45,170] Trial 1 finished with value: 0.6805217926323633 and parameters: {'max_depth': 7, 'alpha': 3.061296118411001, 'learning_rate': 0.09618859944610933, 'n_estimators': 559, 'subsample': 0.6030658064829355, 'colsample_bytree': 0.8188016201230148, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.6805217926323633.
[I 2024-06-06 02:46:54,618] Trial 2 finished with value: 0.6432679702952316 and parameters: {'max_depth': 3, 'alpha': 8.780353559629228, 'learning_rate': 0.005155805350868427, 'n_estimators': 7

Number of completed trials: 100
Accuracy: 0.71
Best hyperparameters: {'max_depth': 9, 'alpha': 0.4683674661936345, 'learning_rate': 0.012906764390309008, 'n_estimators': 977, 'subsample': 0.7599833624472752, 'colsample_bytree': 0.9592127445770873, 'grow_policy': 'depthwise'}


In [16]:
best_params =  {'max_depth': 9, 
                'alpha': 0.4683674661936345, 
                'learning_rate': 0.012906764390309008, 
                'n_estimators': 977, 
                'subsample': 0.7599833624472752, 
                'colsample_bytree': 0.9592127445770873, 
                'grow_policy': 'depthwise'}

# Train the final model with the best hyperparameters
xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)

In [17]:
# Ensure the model has been fitted
if hasattr(xgb_clf, 'get_booster'):
    print("Model is fitted.")
else:
    print("Model is not fitted.")

Model is fitted.


In [None]:
"""
Accuracy: 0.71
Best hyperparameters: {'max_depth': 9, 
                       'alpha': 0.4683674661936345, 
                       'learning_rate': 0.012906764390309008, 
                       'n_estimators': 977, 
                       'subsample': 0.7599833624472752, 
                       'colsample_bytree': 0.9592127445770873, 
                       'grow_policy': 'depthwise'}
"""

In [19]:
# Load test dataset
test_df = pd.read_csv('test.csv')

# Apply feature engineering to the test set
test_featureEng = test_df.copy()
for i, func in enumerate(functions_list):
    test_featureEng[functions_name_list[i]] = func(test_featureEng)

# Make predictions on the test set
test_predictions = xgb_clf.predict(test_featureEng)

# Convert numeric predictions back to original string labels
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Create a DataFrame for the predictions
output = pd.DataFrame({'id': test_df.index, 'nforest_type': test_predictions_labels})
output

Unnamed: 0,id,nforest_type
0,0,DEF
1,1,MDF
2,2,MDF
3,3,DDF
4,4,MDF
...,...,...
3995,3995,DDF
3996,3996,MDF
3997,3997,DEF
3998,3998,MDF


In [12]:
# Load sample submission file
sample_submission = pd.read_csv('/kaggle/input/forest-type-classification-spai/sample_submission.csv')

# Create submission DataFrame
submission = pd.DataFrame({'id': sample_submission['id'], 'nforest_type': test_predictions_labels})
submission

# # Save the submission DataFrame to a CSV file
submission.to_csv('feature(an)-hyperparameters-cv-2.csv', index=False)
submission

Unnamed: 0,id,nforest_type
0,13467,DEF
1,12719,MDF
2,1054,MDF
3,13747,DDF
4,9453,MDF
...,...,...
3995,115,MDF
3996,10654,MDF
3997,5718,DEF
3998,13054,MDF


In [20]:
# Load sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Create submission DataFrame
submission = pd.DataFrame({'id': sample_submission['id'], 'nforest_type': test_predictions_labels})
submission

# # Save the submission DataFrame to a CSV file
submission.to_csv('feature(an)-hyperparameters-cv-2.csv', index=False)
submission

Unnamed: 0,id,nforest_type
0,13467,DEF
1,12719,MDF
2,1054,MDF
3,13747,DDF
4,9453,MDF
...,...,...
3995,115,DDF
3996,10654,MDF
3997,5718,DEF
3998,13054,MDF
