In [7]:
import kagglehub
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from joblib import dump
from utils import format_param_grid

In [8]:
# Fetch the latest version of the dataset through kagglehub and report the
# local folder that the download call returned.
dataset_handle = "fmena14/volcanoesvenus"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /home/alessio/.cache/kagglehub/datasets/fmena14/volcanoesvenus/versions/1


In [9]:
# Build the paths of the two training CSVs inside the downloaded dataset folder.
train_dir = os.path.join(path, 'volcanoes_train')
train_images_path, train_labels_path = (
    os.path.join(train_dir, csv_name)
    for csv_name in ('train_images.csv', 'train_labels.csv')
)

# The labels file is read with its header row; the images file is read
# without one, so every row of it is treated as data.
train_labels = pd.read_csv(train_labels_path)
train_images = pd.read_csv(train_images_path, header=None)

# Divide the raw values by 255.
# NOTE(review): this maps to [0, 1] only if pixels are 8-bit (0-255) — confirm.
train_images_scaled = train_images.values / 255

In [10]:
# Features are the scaled image matrix; the target is the 'Volcano?' label column.
X_train = train_images_scaled
y_train = train_labels['Volcano?']

In [11]:
# Decision Tree: broad sweep over max_depth.
# The CV table is written to the "lineplot" CSV below; no model is saved here.

# Candidate depths to evaluate (None = no depth limit in scikit-learn).
param_grid = {
    'max_depth': [3, 5, 8, 9, 10, 11, 12, 13, 14, 15, 20, 30, 35, 40, 60, 80, 160, None]
}

# Base estimator with a fixed seed.
dt = DecisionTreeClassifier(random_state=2024)

# 5-fold cross-validated grid search, scored on accuracy.
dt_grid_search = GridSearchCV(estimator=dt, param_grid=param_grid,
                              scoring='accuracy', cv=5, verbose=3)

# Run the search.
dt_grid_search.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the long search finished.
results_dt = pd.DataFrame(dt_grid_search.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_dt.to_csv("../grid_search_data/dt_grid_search_lineplot.csv", index=False)

# Report the winning configuration.
print("Best hyperparameters:", dt_grid_search.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_dt_model = dt_grid_search.best_estimator_
best_accuracy_dt = dt_grid_search.best_score_
print(best_accuracy_dt)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .......................max_depth=3;, score=0.884 total time=   7.5s
[CV 2/5] END .......................max_depth=3;, score=0.899 total time=   7.8s
[CV 3/5] END .......................max_depth=3;, score=0.907 total time=   7.3s
[CV 4/5] END .......................max_depth=3;, score=0.903 total time=   7.7s
[CV 5/5] END .......................max_depth=3;, score=0.908 total time=   7.4s
[CV 1/5] END .......................max_depth=5;, score=0.900 total time=  12.4s
[CV 2/5] END .......................max_depth=5;, score=0.894 total time=  12.0s
[CV 3/5] END .......................max_depth=5;, score=0.898 total time=  12.2s
[CV 4/5] END .......................max_depth=5;, score=0.901 total time=  13.0s
[CV 5/5] END .......................max_depth=5;, score=0.903 total time=  12.2s
[CV 1/5] END .......................max_depth=8;, score=0.899 total time=  19.2s
[CV 2/5] END .......................max_depth=8;

In [9]:
# Decision Tree (zoomed): narrow sweep over a small range of max_depth values.

# Candidate depths to evaluate.
param_grid = {
    'max_depth': [12, 13, 14, 15]
}

# Base estimator with a fixed seed.
dt = DecisionTreeClassifier(random_state=2024)

# 5-fold cross-validated grid search, scored on accuracy.
dt_grid_search = GridSearchCV(estimator=dt, param_grid=param_grid,
                              scoring='accuracy', cv=5, verbose=3)

# Run the search.
dt_grid_search.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the search finished.
results_dt = pd.DataFrame(dt_grid_search.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_dt.to_csv(f"../grid_search_data/dt_zoomed_grid_search_{format_param_grid(param_grid)}.csv", index=False)

# Report the winning configuration.
print("Best hyperparameters:", dt_grid_search.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_dt_model = dt_grid_search.best_estimator_
best_accuracy_dt = dt_grid_search.best_score_
print(best_accuracy_dt)

# Save the best model; the CV accuracy (as a percentage) is part of the filename.
os.makedirs("../models", exist_ok=True)
dump(best_dt_model, f'../models/best_decision_tree_model_val_acc_{best_accuracy_dt* 100:.2f}.joblib')

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ......................max_depth=12;, score=0.904 total time=  26.4s
[CV 2/5] END ......................max_depth=12;, score=0.909 total time=  25.6s
[CV 3/5] END ......................max_depth=12;, score=0.927 total time=  25.5s
[CV 4/5] END ......................max_depth=12;, score=0.912 total time=  25.8s
[CV 5/5] END ......................max_depth=12;, score=0.916 total time=  26.1s
[CV 1/5] END ......................max_depth=13;, score=0.895 total time=  29.0s
[CV 2/5] END ......................max_depth=13;, score=0.911 total time=  28.1s
[CV 3/5] END ......................max_depth=13;, score=0.922 total time=  27.9s
[CV 4/5] END ......................max_depth=13;, score=0.916 total time=  28.2s
[CV 5/5] END ......................max_depth=13;, score=0.919 total time=  28.4s
[CV 1/5] END ......................max_depth=14;, score=0.896 total time=  30.3s
[CV 2/5] END ......................max_depth=14;,

['../models/best_decision_tree_model_val_acc_91.37.joblib']

In [13]:
# Bagging: grid over the ensemble size and the depth of the base decision tree.

# 'estimator__max_depth' is routed to the wrapped DecisionTreeClassifier.
param_grid = {
    'n_estimators': [50, 100, 200, 400],
    'estimator__max_depth': [3, 5, 10, 20, 40, 60]
}

# Base learner with a fixed seed.
dt_model = DecisionTreeClassifier(random_state=2024)

# Bagging ensemble around the base tree; n_jobs=-1 is passed to the ensemble itself.
bagging_model = BaggingClassifier(estimator=dt_model, random_state=2024, n_jobs=-1)

# Cross-validated grid search, scored on accuracy.
cv = 5
grid_search_bagging = GridSearchCV(estimator=bagging_model, param_grid=param_grid,
                                   scoring='accuracy', cv=cv, verbose=3)

# Run the search.
grid_search_bagging.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the long search finished.
results_bagging = pd.DataFrame(grid_search_bagging.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_bagging.to_csv(f"../grid_search_data/bagging_grid_search_cv{cv}_{format_param_grid(param_grid)}.csv", index=False)

# Report the winning configuration (printed once; the duplicate print was removed).
print("Best hyperparameters:", grid_search_bagging.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_bagging_model = grid_search_bagging.best_estimator_
best_accuracy_bagging = grid_search_bagging.best_score_
print(best_accuracy_bagging)

# Save the best model; the CV accuracy (as a percentage) is part of the filename.
os.makedirs("../models", exist_ok=True)
dump(best_bagging_model, f'../models/best_bagging_model_val_acc_{best_accuracy_bagging* 100:.2f}.joblib')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END estimator__max_depth=3, n_estimators=50;, score=0.897 total time=  29.6s
[CV 2/5] END estimator__max_depth=3, n_estimators=50;, score=0.906 total time=  29.0s
[CV 3/5] END estimator__max_depth=3, n_estimators=50;, score=0.910 total time=  28.3s
[CV 4/5] END estimator__max_depth=3, n_estimators=50;, score=0.906 total time=  29.1s
[CV 5/5] END estimator__max_depth=3, n_estimators=50;, score=0.905 total time=  28.3s
[CV 1/5] END estimator__max_depth=3, n_estimators=100;, score=0.899 total time=  51.7s
[CV 2/5] END estimator__max_depth=3, n_estimators=100;, score=0.906 total time=  51.4s
[CV 3/5] END estimator__max_depth=3, n_estimators=100;, score=0.911 total time=  52.1s
[CV 4/5] END estimator__max_depth=3, n_estimators=100;, score=0.906 total time=  51.6s
[CV 5/5] END estimator__max_depth=3, n_estimators=100;, score=0.906 total time=  52.2s
[CV 1/5] END estimator__max_depth=3, n_estimators=200;, score=0.897 total

['../models/best_bagging_model_val_acc_92.96.joblib']

In [14]:
# Bagging (zoomed): smaller grid over the ensemble size and base-tree depth.

# 'estimator__max_depth' is routed to the wrapped DecisionTreeClassifier.
param_grid = {
    'n_estimators': [300, 500],
    'estimator__max_depth': [10, 20, 40]
}

# Base learner with a fixed seed.
dt_model = DecisionTreeClassifier(random_state=2024)

# Bagging ensemble around the base tree; n_jobs=-1 is passed to the ensemble itself.
bagging_model = BaggingClassifier(estimator=dt_model, random_state=2024, n_jobs=-1)

# Cross-validated grid search, scored on accuracy.
cv = 5
grid_search_bagging = GridSearchCV(estimator=bagging_model, param_grid=param_grid,
                                   scoring='accuracy', cv=cv, verbose=3)

# Run the search.
grid_search_bagging.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the long search finished.
results_bagging = pd.DataFrame(grid_search_bagging.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_bagging.to_csv(f"../grid_search_data/bagging_zoomed_grid_search_cv{cv}_{format_param_grid(param_grid)}.csv", index=False)

# Report the winning configuration (printed once; the duplicate print was removed).
print("Best hyperparameters:", grid_search_bagging.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_bagging_model = grid_search_bagging.best_estimator_
best_accuracy_bagging = grid_search_bagging.best_score_
print(best_accuracy_bagging)

# Save the best model; the CV accuracy (as a percentage) is part of the filename.
os.makedirs("../models", exist_ok=True)
dump(best_bagging_model, f'../models/best_bagging_model_val_acc_{best_accuracy_bagging* 100:.2f}.joblib')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END estimator__max_depth=10, n_estimators=300;, score=0.917 total time= 6.8min
[CV 2/5] END estimator__max_depth=10, n_estimators=300;, score=0.924 total time= 6.8min
[CV 3/5] END estimator__max_depth=10, n_estimators=300;, score=0.929 total time= 6.8min
[CV 4/5] END estimator__max_depth=10, n_estimators=300;, score=0.923 total time= 6.8min
[CV 5/5] END estimator__max_depth=10, n_estimators=300;, score=0.924 total time= 6.7min
[CV 1/5] END estimator__max_depth=10, n_estimators=500;, score=0.916 total time=11.2min
[CV 2/5] END estimator__max_depth=10, n_estimators=500;, score=0.926 total time=11.1min
[CV 3/5] END estimator__max_depth=10, n_estimators=500;, score=0.928 total time=11.2min
[CV 4/5] END estimator__max_depth=10, n_estimators=500;, score=0.924 total time=11.2min
[CV 5/5] END estimator__max_depth=10, n_estimators=500;, score=0.923 total time=11.2min
[CV 1/5] END estimator__max_depth=20, n_estimators=300;, sco

['../models/best_bagging_model_val_acc_92.90.joblib']

In [12]:
# Random Forest: broad sweep over max_depth at a single, fixed forest size.
# The CV table is written to the "lineplot" CSV below; no model is saved here.

# Candidate settings (None = no depth limit in scikit-learn).
param_grid = {
    'n_estimators': [175],
    'max_depth': [3, 5, 8, 9, 10, 11, 12, 13, 14, 15, 20, 30, 35, 40, 60, 80, 160, None]
}

# Forest with a fixed seed; n_jobs=-1 is passed to the forest itself.
rf_model = RandomForestClassifier(random_state=2024, n_jobs=-1)

# Cross-validated grid search, scored on accuracy.
cv = 5
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                              scoring='accuracy', cv=cv, verbose=3)

# Run the search.
grid_search_rf.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the search finished.
# NOTE(review): the trailing underscore in this filename differs from the
# decision-tree "lineplot" CSV name; it is kept unchanged in case another
# notebook reads this exact path — confirm and rename if it is a typo.
results_rf = pd.DataFrame(grid_search_rf.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_rf.to_csv("../grid_search_data/rf_grid_search_lineplot_.csv", index=False)

# Report the winning configuration (printed once; the duplicate print was removed).
print("Best hyperparameters:", grid_search_rf.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_rf_model = grid_search_rf.best_estimator_
best_accuracy_rf = grid_search_rf.best_score_
print(best_accuracy_rf)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .....max_depth=3, n_estimators=175;, score=0.859 total time=   0.9s
[CV 2/5] END .....max_depth=3, n_estimators=175;, score=0.857 total time=   0.9s
[CV 3/5] END .....max_depth=3, n_estimators=175;, score=0.858 total time=   0.9s
[CV 4/5] END .....max_depth=3, n_estimators=175;, score=0.861 total time=   0.9s
[CV 5/5] END .....max_depth=3, n_estimators=175;, score=0.860 total time=   0.9s
[CV 1/5] END .....max_depth=5, n_estimators=175;, score=0.877 total time=   1.3s
[CV 2/5] END .....max_depth=5, n_estimators=175;, score=0.879 total time=   1.3s
[CV 3/5] END .....max_depth=5, n_estimators=175;, score=0.876 total time=   1.3s
[CV 4/5] END .....max_depth=5, n_estimators=175;, score=0.877 total time=   1.3s
[CV 5/5] END .....max_depth=5, n_estimators=175;, score=0.876 total time=   1.3s
[CV 1/5] END .....max_depth=8, n_estimators=175;, score=0.900 total time=   1.9s
[CV 2/5] END .....max_depth=8, n_estimators=175;

In [None]:
# Random Forest (zoom): finer grid over the forest size and max_depth.

# Candidate settings to evaluate.
param_grid = {
    'n_estimators': [150, 175, 200, 225, 250, 275, 300],
    'max_depth': [25, 30, 35, 40, 45, 50, 55, 60]
}

# Forest with a fixed seed; n_jobs=-1 is passed to the forest itself.
rf_model = RandomForestClassifier(random_state=2024, n_jobs=-1)

# Cross-validated grid search, scored on accuracy.
cv = 5
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                              scoring='accuracy', cv=cv, verbose=3)

# Run the search.
grid_search_rf.fit(X_train, y_train)

# Persist the full CV table. The output folder is created first so that a
# missing directory cannot make the cell fail after the long search finished.
results_rf = pd.DataFrame(grid_search_rf.cv_results_)
os.makedirs("../grid_search_data", exist_ok=True)
results_rf.to_csv(f"../grid_search_data/rf_zoomed_grid_search_cv{cv}_{format_param_grid(param_grid)}.csv", index=False)

# Report the winning configuration (printed once; the duplicate print was removed).
print("Best hyperparameters:", grid_search_rf.best_params_)

# Keep the best estimator and its cross-validated score for later cells.
best_rf_model = grid_search_rf.best_estimator_
best_accuracy_rf = grid_search_rf.best_score_
print(best_accuracy_rf)

# Save the best model; the CV accuracy (as a percentage) is part of the filename.
os.makedirs("../models", exist_ok=True)
dump(best_rf_model, f'../models/best_rf_model_val_acc_{best_accuracy_rf* 100:.2f}.joblib')

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV 1/5] END ....max_depth=25, n_estimators=150;, score=0.920 total time=   3.8s
[CV 2/5] END ....max_depth=25, n_estimators=150;, score=0.919 total time=   4.0s
[CV 3/5] END ....max_depth=25, n_estimators=150;, score=0.921 total time=   4.1s
[CV 4/5] END ....max_depth=25, n_estimators=150;, score=0.916 total time=   4.0s
[CV 5/5] END ....max_depth=25, n_estimators=150;, score=0.913 total time=   4.1s
[CV 1/5] END ....max_depth=25, n_estimators=175;, score=0.918 total time=   4.7s
[CV 2/5] END ....max_depth=25, n_estimators=175;, score=0.918 total time=   4.7s
[CV 3/5] END ....max_depth=25, n_estimators=175;, score=0.921 total time=   4.7s
[CV 4/5] END ....max_depth=25, n_estimators=175;, score=0.916 total time=   4.5s
[CV 5/5] END ....max_depth=25, n_estimators=175;, score=0.916 total time=   4.5s
[CV 1/5] END ....max_depth=25, n_estimators=200;, score=0.917 total time=   5.2s
[CV 2/5] END ....max_depth=25, n_estimators=200

['../models/best_rf_model_val_acc_91.99.joblib']