### Modules Import

- We have used numpy and pandas for data handling and scikit-learn for preprocessing and training the machine learning algorithm.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import cdist

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import  XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, auc

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pickle

### Datasets Import from MODIS (FIRMS - NASA Dataset)

- We have used data from FIRMS (Fire Information for Resource Management System) from NASA: https://firms.modaps.eosdis.nasa.gov/download/

- We have used the data to get the location (longitude and latitude) and the date-time of fires in Spain during the last 3 years.

In [4]:
df_2020 = pd.read_csv('modis_2020_Spain.csv')
df_2020.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,43.1717,-6.3778,322.3,1.1,1.0,2020-01-01,242,Aqua,MODIS,100,6.03,276.9,28.9,N,0
1,43.2003,-4.2303,316.5,1.7,1.3,2020-01-01,1348,Aqua,MODIS,77,6.03,273.1,42.1,D,0
2,43.1186,-3.8166,315.3,1.0,1.0,2020-01-02,1111,Terra,MODIS,76,6.03,281.4,17.0,D,0
3,43.128,-3.8204,301.3,1.0,1.0,2020-01-02,1111,Terra,MODIS,33,6.03,281.0,6.2,D,0
4,39.1699,-2.1818,302.4,1.1,1.0,2020-01-02,1112,Terra,MODIS,49,6.03,282.4,7.9,D,0


In [5]:
df_2021 = pd.read_csv('modis_2021_Spain.csv')
df_2021.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,37.0886,-5.8802,300.4,1.1,1.0,2021-01-02,1125,Terra,MODIS,37,6.03,287.8,5.1,D,0
1,36.6599,-4.8777,305.5,1.8,1.3,2021-01-02,1304,Aqua,MODIS,62,6.03,282.8,21.3,D,0
2,36.8729,-2.4486,304.8,1.2,1.1,2021-01-02,2228,Terra,MODIS,62,6.03,277.7,13.9,N,0
3,36.8745,-2.4347,334.7,1.2,1.1,2021-01-02,2228,Terra,MODIS,100,6.03,279.1,53.3,N,2
4,40.2352,-0.8699,320.1,1.1,1.1,2021-01-04,1112,Terra,MODIS,80,6.03,276.0,26.4,D,0


In [6]:
df_2022 = pd.read_csv('modis_2022_Spain.csv')
df_2022.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,43.2475,-3.7776,303.2,1.0,1.0,2022-01-01,224,Aqua,MODIS,54,6.03,283.1,7.9,N,0
1,43.2201,-3.6591,302.6,1.0,1.0,2022-01-01,224,Aqua,MODIS,50,6.03,281.4,8.2,N,0
2,43.4104,-5.01,312.9,1.0,1.0,2022-01-01,224,Aqua,MODIS,86,6.03,285.6,14.6,N,0
3,43.4086,-4.9979,309.4,1.0,1.0,2022-01-01,224,Aqua,MODIS,77,6.03,283.9,11.9,N,0
4,43.2009,-3.7151,309.2,1.0,1.0,2022-01-01,224,Aqua,MODIS,77,6.03,282.5,12.1,N,0


In [7]:
print(df_2020.info())
print(df_2021.info())
print(df_2022.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2602 entries, 0 to 2601
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   latitude    2602 non-null   float64
 1   longitude   2602 non-null   float64
 2   brightness  2602 non-null   float64
 3   scan        2602 non-null   float64
 4   track       2602 non-null   float64
 5   acq_date    2602 non-null   object 
 6   acq_time    2602 non-null   int64  
 7   satellite   2602 non-null   object 
 8   instrument  2602 non-null   object 
 9   confidence  2602 non-null   int64  
 10  version     2602 non-null   float64
 11  bright_t31  2602 non-null   float64
 12  frp         2602 non-null   float64
 13  daynight    2602 non-null   object 
 14  type        2602 non-null   int64  
dtypes: float64(8), int64(3), object(4)
memory usage: 305.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4725 entries, 0 to 4724
Data columns (total 15 columns):
 #   Column      Non

- We have performed some data cleaning (formatting) to cross this information with the Meteomatics database.

In [8]:
def convertir_formato(valor):
    valor_str = str(valor)

    while len(valor_str) < 4:
        valor_str = '0' + valor_str

    horas = int(valor_str[:2])
    return '{:02}'.format(horas)

df_2020["horas_nuevas"] = df_2020['acq_time'].apply(convertir_formato)
df_2020['horas_nuevas'] = df_2020['horas_nuevas'].astype('int64')
df_2020["horas_nuevas"].head()

0     2
1    13
2    11
3    11
4    11
Name: horas_nuevas, dtype: int64

In [9]:
df_2020 = df_2020.drop('acq_time', axis=1)
df_2020['acq_date'] = df_2020['acq_date'].str.replace('-', '').astype(int)
print(df_2020.info())
df_2020.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2602 entries, 0 to 2601
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   latitude      2602 non-null   float64
 1   longitude     2602 non-null   float64
 2   brightness    2602 non-null   float64
 3   scan          2602 non-null   float64
 4   track         2602 non-null   float64
 5   acq_date      2602 non-null   int64  
 6   satellite     2602 non-null   object 
 7   instrument    2602 non-null   object 
 8   confidence    2602 non-null   int64  
 9   version       2602 non-null   float64
 10  bright_t31    2602 non-null   float64
 11  frp           2602 non-null   float64
 12  daynight      2602 non-null   object 
 13  type          2602 non-null   int64  
 14  horas_nuevas  2602 non-null   int64  
dtypes: float64(8), int64(4), object(3)
memory usage: 305.0+ KB
None


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,horas_nuevas
0,43.1717,-6.3778,322.3,1.1,1.0,20200101,Aqua,MODIS,100,6.03,276.9,28.9,N,0,2
1,43.2003,-4.2303,316.5,1.7,1.3,20200101,Aqua,MODIS,77,6.03,273.1,42.1,D,0,13
2,43.1186,-3.8166,315.3,1.0,1.0,20200102,Terra,MODIS,76,6.03,281.4,17.0,D,0,11
3,43.128,-3.8204,301.3,1.0,1.0,20200102,Terra,MODIS,33,6.03,281.0,6.2,D,0,11
4,39.1699,-2.1818,302.4,1.1,1.0,20200102,Terra,MODIS,49,6.03,282.4,7.9,D,0,11


### Datasets Import from Meteomatic

- We created a python script (also in GitHub) to make a Request of the information to the API of Meteomatics https://www.meteomatics.com/en/weather-api/

- We downloaded the information as csv files and did some data cleaning.

In [10]:
# import datasets 
df_data_Sevilla = pd.read_csv("data_Sevilla.csv")
df_data_Asturias = pd.read_csv("data_Asturias.csv")
df_data_Cantabria = pd.read_csv("data_Cantabria.csv")
df_data_Galicia = pd.read_csv("data_galicia.csv")
df_data_Murcia = pd.read_csv("data_murcia.csv")
df_data_Navarra =pd.read_csv("data_Navarra.csv")

In [11]:
df_data_meteo = pd.concat([df_data_Sevilla, df_data_Asturias, df_data_Cantabria, df_data_Galicia, df_data_Murcia,
df_data_Navarra], axis=0)

df_data_meteo['Fecha'] = df_data_meteo['Fecha'].str.replace('-', '').astype(int)

df_data_meteo.shape

(52704, 8)

### Merge of two databases

- We have merged the information of the two databases to cross the meteorological conditions in a specific date-time and location where a fire was detected. 

In [12]:
# Inicializar una lista para almacenar los resultados
merged_data = []

# Iterar sobre cada fila en df_2020
for _, row_df_2020 in df_2020.iterrows():
    # Filtrar filas en df_data_meteo que coinciden en Fecha y Hora
    filtered_data_meteo = df_data_meteo[
        (df_data_meteo['Fecha'] == row_df_2020['acq_date']) &
        (df_data_meteo['Hora'] == row_df_2020['horas_nuevas'])]
    
    # Calcular la distancia euclidiana entre las coordenadas de ambos conjuntos de datos filtrados
    distances = cdist([[row_df_2020['latitude'], row_df_2020['longitude']]], 
                      filtered_data_meteo[['Latitud', 'Longitud']])
    
    # Si hay filas coincidentes en df_data_meteo, encontrar el índice de la fila más cercana
    if not filtered_data_meteo.empty:
        closest_row_index = distances.argmin(axis=1)[0]
        closest_row_meteo = filtered_data_meteo.iloc[closest_row_index]
        
        # Almacenar los resultados en merged_data
        merged_data.append(pd.concat([row_df_2020, closest_row_meteo]))
    else:
        # Si no hay filas coincidentes en df_data_meteo, almacenar la fila de df_2020 con NaN para las columnas de df_data_meteo
        merged_data.append(pd.concat([row_df_2020, pd.Series(index=df_data_meteo.columns, dtype='object')]))

# Crear un DataFrame a partir de la lista de resultados
merged_data_df = pd.DataFrame(merged_data)

# merged_data_df ahora contiene las filas unidas de ambos conjuntos de datos
print(merged_data_df.head())

print(merged_data_df.shape)

   latitude  longitude  brightness  scan  track  acq_date satellite  \
0   43.1717    -6.3778       322.3   1.1    1.0  20200101      Aqua   
1   43.2003    -4.2303       316.5   1.7    1.3  20200101      Aqua   
2   43.1186    -3.8166       315.3   1.0    1.0  20200102     Terra   
3   43.1280    -3.8204       301.3   1.0    1.0  20200102     Terra   
4   39.1699    -2.1818       302.4   1.1    1.0  20200102     Terra   

  instrument  confidence  version  ...  type  horas_nuevas Unnamed: 0  t_2m:C  \
0      MODIS         100     6.03  ...     0             2        2.0     6.5   
1      MODIS          77     6.03  ...     0            13       13.0     9.4   
2      MODIS          76     6.03  ...     0            11       35.0    10.9   
3      MODIS          33     6.03  ...     0            11       35.0    10.9   
4      MODIS          49     6.03  ...     0            11       35.0    10.0   

   precip_1h:mm  wind_speed_10m:ms    Latitud  Longitud       Fecha  Hora  
0         

### Column to predict (y label)

- We added a column y label to train the ML model with it.

In [13]:
merged_data_df['fire'] = 1
print(merged_data_df.shape)
print(merged_data_df.head())

(2602, 24)
   latitude  longitude  brightness  scan  track  acq_date satellite  \
0   43.1717    -6.3778       322.3   1.1    1.0  20200101      Aqua   
1   43.2003    -4.2303       316.5   1.7    1.3  20200101      Aqua   
2   43.1186    -3.8166       315.3   1.0    1.0  20200102     Terra   
3   43.1280    -3.8204       301.3   1.0    1.0  20200102     Terra   
4   39.1699    -2.1818       302.4   1.1    1.0  20200102     Terra   

  instrument  confidence  version  ...  horas_nuevas  Unnamed: 0 t_2m:C  \
0      MODIS         100     6.03  ...             2         2.0    6.5   
1      MODIS          77     6.03  ...            13        13.0    9.4   
2      MODIS          76     6.03  ...            11        35.0   10.9   
3      MODIS          33     6.03  ...            11        35.0   10.9   
4      MODIS          49     6.03  ...            11        35.0   10.0   

   precip_1h:mm  wind_speed_10m:ms    Latitud  Longitud       Fecha  Hora  \
0           0.0                0.9

- We created some control samples (negative examples: where and when a fire did not occur) to balance the dataset.

In [14]:
number_samples = 2602

df_control = df_data_meteo.sample(n=number_samples, random_state=42)

print(df_control.shape)
df_control.head()

(2602, 8)


Unnamed: 0.1,Unnamed: 0,t_2m:C,precip_1h:mm,wind_speed_10m:ms,Latitud,Longitud,Fecha,Hora
3665,3665,17.7,1.72,0.5,42.61946,-7.863112,20200601,17
1605,1605,4.2,0.0,2.5,42.61946,-7.863112,20200307,21
6737,6737,16.4,0.0,1.9,43.313387,-5.94192,20201007,17
4266,4266,24.2,0.0,2.9,42.612549,-1.830788,20200626,18
5748,5748,38.1,0.0,2.2,37.38863,-5.99534,20200827,12


In [15]:
df_control['fire'] = 0
print(df_control.shape)
print(df_control.head())

(2602, 9)
      Unnamed: 0  t_2m:C  precip_1h:mm  wind_speed_10m:ms    Latitud  \
3665        3665    17.7          1.72                0.5  42.619460   
1605        1605     4.2          0.00                2.5  42.619460   
6737        6737    16.4          0.00                1.9  43.313387   
4266        4266    24.2          0.00                2.9  42.612549   
5748        5748    38.1          0.00                2.2  37.388630   

      Longitud     Fecha  Hora  fire  
3665 -7.863112  20200601    17     0  
1605 -7.863112  20200307    21     0  
6737 -5.941920  20201007    17     0  
4266 -1.830788  20200626    18     0  
5748 -5.995340  20200827    12     0  


In [16]:
df_final_1 = merged_data_df[['latitude', 'longitude', 'acq_date', 'horas_nuevas', 't_2m:C',
'precip_1h:mm', 'wind_speed_10m:ms', 'fire']]

print(df_final_1.fire.value_counts())

df_final_2 = df_control[['Latitud', 'Longitud', 'Fecha', 'Hora', 't_2m:C',
'precip_1h:mm', 'wind_speed_10m:ms', 'fire']]

print(df_final_2.fire.value_counts())

1    2602
Name: fire, dtype: int64
0    2602
Name: fire, dtype: int64


In [17]:
column_names = {'Latitud': 'latitude', 'Longitud': 'longitude', 'Fecha': 'acq_date', 'Hora': 'horas_nuevas',
't_2m:C': 't_2m:C', 'precip_1h:mm': 'precip_1h:mm', 'wind_speed_10m:ms': 'wind_speed_10m:ms', 'fire': 'fire'}

df_final_2.rename(columns=column_names, inplace=True)

df_final_2.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Index(['latitude', 'longitude', 'acq_date', 'horas_nuevas', 't_2m:C',
       'precip_1h:mm', 'wind_speed_10m:ms', 'fire'],
      dtype='object')

In [18]:
df_final = pd.concat([df_final_1, df_final_2], axis=0)

print(df_final.shape)

print(df_final.head())

(5204, 8)
   latitude  longitude  acq_date  horas_nuevas  t_2m:C  precip_1h:mm  \
0   43.1717    -6.3778  20200101             2     6.5           0.0   
1   43.2003    -4.2303  20200101            13     9.4           0.0   
2   43.1186    -3.8166  20200102            11    10.9           0.0   
3   43.1280    -3.8204  20200102            11    10.9           0.0   
4   39.1699    -2.1818  20200102            11    10.0           0.0   

   wind_speed_10m:ms  fire  
0                0.9     1  
1                1.0     1  
2                0.9     1  
3                0.9     1  
4                0.8     1  


In [19]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5204 entries, 0 to 8465
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   latitude           5204 non-null   float64
 1   longitude          5204 non-null   float64
 2   acq_date           5204 non-null   int64  
 3   horas_nuevas       5204 non-null   int64  
 4   t_2m:C             5204 non-null   float64
 5   precip_1h:mm       5204 non-null   float64
 6   wind_speed_10m:ms  5204 non-null   float64
 7   fire               5204 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 365.9 KB


### Preprocessing

In [20]:
X = df_final.drop('fire', axis=1)

y = df_final['fire']

In [21]:
cols = X.columns.to_list()

In [22]:
preprocessor = ColumnTransformer([('minmax', MinMaxScaler(), cols)])

In [23]:
X_transformed = preprocessor.fit_transform(X)

print(X_transformed)

[[9.76176375e-01 5.47190030e-01 0.00000000e+00 ... 2.16335541e-01
  0.00000000e+00 6.10687023e-02]
 [9.77997695e-01 6.48566101e-01 0.00000000e+00 ... 2.80353201e-01
  0.00000000e+00 6.87022901e-02]
 [9.72794834e-01 6.68095452e-01 8.84955753e-04 ... 3.13465784e-01
  0.00000000e+00 6.10687023e-02]
 ...
 [9.85199345e-01 5.67766422e-01 1.12389381e-01 ... 4.70198675e-01
  0.00000000e+00 8.39694656e-02]
 [6.41520706e-01 7.78493653e-01 7.14159292e-01 ... 4.59161148e-01
  0.00000000e+00 9.16030534e-02]
 [6.07896013e-01 5.65244634e-01 9.88495575e-01 ... 3.55408389e-01
  0.00000000e+00 1.06870229e-01]]


### Functions for Model Testing

In [24]:
def prediction(model, data_to_predict):
    '''
    This function receives the model to be applied and the matrix X (predictor variables).
    It returns the y that it predicts.
    '''
    y_predicted = model.predict(data_to_predict)
    
    probability = model.predict_proba(data_to_predict)
    
    #tomamos como clase positiva (1)
    y_probs = probability[:, model.classes_.tolist().index(1)]
    
    return y_predicted, y_probs

In [25]:
def test(model, data_to_predict, y_predicted, y_true, y_probs):
    '''
    This function calculates the model metrics and prints them.
    It takes as positive class (pos_label) a 1
    Cross validation is applied to obtain all the metrics.
    '''
    eval_metrics = ['accuracy', 'precision', 'recall', 'f1']
    model_scores = cross_validate(model, data_to_predict, y_true, cv=3, scoring = eval_metrics, return_train_score = True)
    
    acc = np.mean(model_scores['test_accuracy'])
    precision = np.mean(model_scores['test_precision'])
    recall = np.mean(model_scores['test_recall'])
    f1 = np.mean(model_scores['test_f1'])
    
    overfit_acc = (np.mean(model_scores['train_accuracy']) - acc) * 100
    overfit_f1 = (np.mean(model_scores['train_f1']) - f1) * 100
    
    conf_matrix = confusion_matrix(y_true, y_predicted)
   
    print('Model: {} || Accuracy: {} || Precision: {} || Recall: {} || F1: {}'.format(model, acc, precision, recall, f1))
    print()
    print('Model: {} || Overfitting Accuracy: {}, || Overfitting F1: {}'.format(model, overfit_acc, overfit_f1))
    print()
    print("Matriz de Confusión:")
    print(conf_matrix)
    print()

In [26]:
def train_predict_test(model, data_to_predict, y_true):
    '''
    This function groups all the previous ones to make a single call.
    '''
    print(f"Métricas de {model}:")

    model.fit(data_to_predict, y_true)

    y_predicted, y_probs = prediction(model, data_to_predict)

    test(model, data_to_predict, y_predicted, y_true, y_probs)

### List of Models

In [27]:
models = [
    RandomForestClassifier(), 
    LogisticRegression(), 
    AdaBoostClassifier(n_estimators=50),
    GradientBoostingClassifier(learning_rate=0.3,n_estimators=50),
    LGBMClassifier(),
    XGBClassifier(),
    KNeighborsClassifier()
]

### Model Testing

In [28]:
for model in models:
    train_predict_test(model, X_transformed, y)

Métricas de RandomForestClassifier():
Model: RandomForestClassifier() || Accuracy: 0.6481273551405079 || Precision: 0.9991181657848324 || Recall: 0.2971871860698739 || F1: 0.3241191609868372

Model: RandomForestClassifier() || Overfitting Accuracy: 35.18726448594921, || Overfitting F1: 67.58808390131628

Matriz de Confusión:
[[2602    0]
 [   0 2602]]

Métricas de LogisticRegression():
Model: LogisticRegression() || Accuracy: 0.5276593905912933 || Precision: 0.44874064099306854 || Recall: 0.3798202392911623 || F1: 0.38983253611509977

Model: LogisticRegression() || Overfitting Accuracy: 19.476466504184444, || Overfitting F1: 33.87640768860129

Matriz de Confusión:
[[1832  770]
 [ 908 1694]]

Métricas de AdaBoostClassifier():
Model: AdaBoostClassifier() || Accuracy: 0.8287233795026742 || Precision: 1.0 || Recall: 0.6574522914795266 || F1: 0.667082312371901

Model: AdaBoostClassifier() || Overfitting Accuracy: 17.127662049732585, || Overfitting F1: 33.2917687628099

Matriz de Confusión:


### Best Model from the ones Tested

In [29]:
gboost = GradientBoostingClassifier(learning_rate=0.3,n_estimators=50)

train_predict_test(gboost, X_transformed, y)

Métricas de GradientBoostingClassifier(learning_rate=0.3, n_estimators=50):
Model: GradientBoostingClassifier(learning_rate=0.3, n_estimators=50) || Accuracy: 0.9850137444365777 || Precision: 1.0 || Recall: 0.9700416536000865 || F1: 0.984482750986109

Model: GradientBoostingClassifier(learning_rate=0.3, n_estimators=50) || Overfitting Accuracy: 1.4986255563422257, || Overfitting F1: 1.5517249013890955

Matriz de Confusión:
[[2602    0]
 [   0 2602]]



In [30]:
with open('gboost', 'wb') as file_pkl:
    pickle.dump(gboost, file_pkl)
    print('Model file successfully created')

Model file successfully created


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4207af36-f281-4be2-893b-6189098b9a73' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>