In [5]:
!pip install scikit-learn
!pip install pandas




In [6]:

from sklearn.preprocessing import LabelEncoder
import pandas as pd #import pandas library for reading CSV

# Load the training and test splits from the local Datasets directory
training_data = pd.read_csv('./Datasets/wildfires_training.csv')
test_data = pd.read_csv('./Datasets/wildfires_test.csv')
# Check to see if csv is loaded correctly
print(training_data.head())
print(test_data.head())

# Check that the training and test data have the same feature set:
# once the 'fire' target column is removed, the remaining column names
# must be identical and in the same order.
training_features = list(training_data.drop('fire', axis=1).columns)
test_features = list(test_data.drop('fire', axis=1).columns)
assert training_features == test_features, (
    f"Feature mismatch between training and test data: {training_features} != {test_features}"
)


  fire  year  temp  humidity  rainfall  drought_code  buildup_index  day  \
0   no  2015    28        59       0.0          8.06           3.47    1   
1   no  2010    30        61       1.3          8.17           4.03    2   
2   no  2009    26        83      13.1          8.08           3.59    3   
3   no  2017    25        87       2.5          7.18           2.42    4   
4   no  2014    28        77       0.0         14.98           4.63    5   

   month  wind_speed  
0      6          19  
1      6          13  
2      6          22  
3      6          15  
4      6          18  
  fire  year  temp  humidity  rainfall  drought_code  buildup_index  day  \
0   no  2015    33        68       4.5          9.12           5.09   19   
1  yes  2009    28        56       0.0         38.17          21.21   12   
2   no  2017    30        64       0.6         15.38           6.24   24   
3   no  2007    23        74       8.3          7.36           2.27   14   
4   no  2017    31       

In [7]:
# Split the training frame into predictors and label.
# Features: every column except "fire".
X_training = training_data.drop(columns='fire')
# Target: the "fire" column, which is what the model should predict.
y_training = training_data['fire']

# Preview both pieces to confirm the split
print(X_training.head())
print(y_training.head())

   year  temp  humidity  rainfall  drought_code  buildup_index  day  month  \
0  2015    28        59       0.0          8.06           3.47    1      6   
1  2010    30        61       1.3          8.17           4.03    2      6   
2  2009    26        83      13.1          8.08           3.59    3      6   
3  2017    25        87       2.5          7.18           2.42    4      6   
4  2014    28        77       0.0         14.98           4.63    5      6   

   wind_speed  
0          19  
1          13  
2          22  
3          15  
4          18  
0    no
1    no
2    no
3    no
4    no
Name: fire, dtype: object


In [8]:

# Split the test frame into predictors and label.
# Features: every column except "fire".
X_test = test_data.drop(columns='fire')
# Target: the "fire" column, which is what the model should predict.
y_test = test_data['fire']

# Preview both pieces to confirm the split
print(X_test.head())
print(y_test.head())

   year  temp  humidity  rainfall  drought_code  buildup_index  day  month  \
0  2015    33        68       4.5          9.12           5.09   19      6   
1  2009    28        56       0.0         38.17          21.21   12      6   
2  2017    30        64       0.6         15.38           6.24   24      9   
3  2007    23        74       8.3          7.36           2.27   14      9   
4  2017    31        72       0.3         30.47           5.63    7      9   

   wind_speed  
0          16  
1          18  
2          19  
3          28  
4          17  
0     no
1    yes
2     no
3     no
4     no
Name: fire, dtype: object


In [9]:
# Class balance of the target variable ("fire") in the training data,
# and the entropy of that class distribution in bits (base 2).
from scipy.stats import entropy
class_proportions = y_training.value_counts(normalize=True)
print(class_proportions)  # Training data
training_entropy = entropy(class_proportions, base=2)
print(f"Entropy of training data: {training_entropy}")

fire
yes    0.512987
no     0.487013
Name: proportion, dtype: float64
Entropy of training data: 0.9995132881417702


In [None]:
# Model fitting and accuracy evaluation
from sklearn.ensemble import HistGradientBoostingClassifier

# Model with default settings
hgbc_model = HistGradientBoostingClassifier()
hgbc_model.fit(X_training, y_training)

# Compute accuracy on the training data.
# `accuracy` has to be assigned in this cell: without it the print below
# raises NameError on a fresh kernel, or silently reports a stale value
# left over from a previously executed cell.
accuracy = hgbc_model.score(X_training, y_training)

print("Accuracy on training dataset provided:", accuracy)

Accuracy on training dataset provided: 1.0


In [None]:
# Compute accuracy of the default (untuned) model on the held-out test data.
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy = hgbc_model.score(X_test, y_test)

print("Accuracy on test dataset provided:", accuracy)

Accuracy on test dataset provided: 0.84


# Model Hyperparameter Fine Tuning

In [26]:
# Refit with hand-picked hyperparameters and check the fit on the training data.
# NOTE(review): min_samples_leaf=23 appears to come from the
# find_best_min_samples_leaf search in this notebook, which selects on test-set
# accuracy - confirm, and treat the resulting test score as optimistic.
hgbc_model = HistGradientBoostingClassifier(learning_rate=1.0, min_samples_leaf=23)
hgbc_model.fit(X_training, y_training)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy = hgbc_model.score(X_training, y_training)
print("Accuracy on training dataset provided:", accuracy)

Accuracy on training dataset provided: 1.0


In [27]:
# Accuracy of the currently fitted `hgbc_model` on the test data.
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy = hgbc_model.score(X_test, y_test)

print("Accuracy on test dataset with learning_rate provided:", accuracy)

Accuracy on test dataset with learning_rate provided: 0.9


In [14]:
# Attempting to find the best min_samples_leaf value
def find_best_min_samples_leaf(X_training, y_training, X_test, y_test, learning_rate=1.0, min_samples_range=range(1, 51)):
    """
    Finds the value of min_samples_leaf that gives the highest accuracy on the test dataset.

    One HistGradientBoostingClassifier is fitted per candidate value and scored on
    (X_test, y_test); the accuracy of every candidate is printed as it is evaluated.
    When several candidates tie for the highest accuracy, the first one encountered
    in min_samples_range is kept.

    Note:
        The candidates are compared on the same data passed as X_test / y_test, so
        the returned accuracy is an optimistic estimate if that data is also used
        for the final evaluation. Pass a separate validation split to avoid this.

    Parameters:
        X_training (DataFrame): Training features.
        y_training (Series): Training target.
        X_test (DataFrame): Test features.
        y_test (Series): Test target.
        learning_rate (float): Learning rate for the model.
        min_samples_range (range): Range of values for min_samples_leaf to test.

    Returns:
        best_min_samples_leaf (int): The value of min_samples_leaf that gives the highest
            accuracy, or None if min_samples_range is empty.
        best_accuracy (float): The highest accuracy achieved (0.0 if min_samples_range is empty).
    """
    best_min_samples_leaf = None
    best_accuracy = 0.0

    for min_samples_leaf in min_samples_range:
        # Create and train a fresh model for this candidate value
        candidate_model = HistGradientBoostingClassifier(learning_rate=learning_rate, min_samples_leaf=min_samples_leaf)
        candidate_model.fit(X_training, y_training)

        # Evaluate the model
        accuracy = candidate_model.score(X_test, y_test)
        print(f"min_samples_leaf: {min_samples_leaf}, Accuracy: {accuracy}")

        # Always accept the first candidate so a valid parameter is returned even
        # when every accuracy is 0.0; afterwards only a strictly higher accuracy wins.
        if best_min_samples_leaf is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_min_samples_leaf = min_samples_leaf

    print(f"\nBest min_samples_leaf: {best_min_samples_leaf}, Best Accuracy: {best_accuracy}")
    return best_min_samples_leaf, best_accuracy

# Run the search with the function's default learning rate and candidate range
best_min_samples_leaf, best_accuracy = find_best_min_samples_leaf(
    X_training=X_training,
    y_training=y_training,
    X_test=X_test,
    y_test=y_test,
)

min_samples_leaf: 1, Accuracy: 0.86
min_samples_leaf: 2, Accuracy: 0.84
min_samples_leaf: 3, Accuracy: 0.88
min_samples_leaf: 4, Accuracy: 0.84
min_samples_leaf: 5, Accuracy: 0.86
min_samples_leaf: 6, Accuracy: 0.86
min_samples_leaf: 7, Accuracy: 0.86
min_samples_leaf: 8, Accuracy: 0.88
min_samples_leaf: 9, Accuracy: 0.86
min_samples_leaf: 10, Accuracy: 0.86
min_samples_leaf: 11, Accuracy: 0.88
min_samples_leaf: 12, Accuracy: 0.88
min_samples_leaf: 13, Accuracy: 0.86
min_samples_leaf: 14, Accuracy: 0.86
min_samples_leaf: 15, Accuracy: 0.86
min_samples_leaf: 16, Accuracy: 0.88
min_samples_leaf: 17, Accuracy: 0.84
min_samples_leaf: 18, Accuracy: 0.86
min_samples_leaf: 19, Accuracy: 0.88
min_samples_leaf: 20, Accuracy: 0.88
min_samples_leaf: 21, Accuracy: 0.86
min_samples_leaf: 22, Accuracy: 0.88
min_samples_leaf: 23, Accuracy: 0.9
min_samples_leaf: 24, Accuracy: 0.86
min_samples_leaf: 25, Accuracy: 0.9
min_samples_leaf: 26, Accuracy: 0.86
min_samples_leaf: 27, Accuracy: 0.88
min_samples_

In [15]:
# Accuracy of the module-level `hgbc_model` on the test data.
# NOTE(review): the models fitted inside find_best_min_samples_leaf are local to
# that function, so this scores whichever `hgbc_model` was last fitted in a
# top-level cell, not the winner of the search - confirm this is intended.
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy = hgbc_model.score(X_test, y_test)

print("Accuracy on test dataset provided:", accuracy)

Accuracy on test dataset provided: 0.9
