In [134]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import math

import statsmodels.formula.api as smf
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers

# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

In [135]:
# Load the merged dataset from disk and engineer the model features

data_dir = '../data'
df = pd.read_csv(os.path.join(data_dir, 'all_merged.csv'))

# 'Unnamed: 0' is presumably the index written by an earlier to_csv (TODO confirm);
# 'dry_days' is not used as a feature in this notebook.
df = df.drop(columns=['Unnamed: 0', 'dry_days'])

# Cast land_cover to str so it ends up with object dtype and is handled as a
# categorical feature downstream instead of as a number.
df = df.astype({'land_cover': 'str'})

# Cyclical encoding of the month, computed on the whole column at once
# (no per-row Python lambda): month 1 -> -1, month 7 -> +1.
df['season'] = np.cos(2 * math.pi * (df['month'] + 5) / 12)

# Temperature / (precipitation + 1) ratio for the current values ('') and for
# each of the numbered companion columns 1..4; the +1 avoids dividing by zero
# when precipitation is 0.
for suffix in ('', '1', '2', '3', '4'):
    df[f'temp_prec{suffix}'] = (
        df[f'temperature{suffix}'] / (df[f'precipitation{suffix}'] + 1)
    )

# Drop incomplete rows BEFORE removing 'month' and 'burnable_frac', so that
# missing values in those two columns still exclude the row (original order).
df = df.dropna()
df = df.drop(columns=['month', 'burnable_frac'])
df = df.reset_index(drop=True)

# Optional experiments (disabled). land_cover is a string at this point, so a
# filter on it has to compare against '11', not the integer 11.
# df = df[df.land_cover != '11']
# df = df.sample(n=100000)

# Create X and y: the target is 'burned_area'; coordinates and year are
# excluded from the feature matrix.
X = df.drop(columns=['burned_area', 'lon', 'lat', 'year'])
y = df['burned_area']
X

Unnamed: 0,temperature,precipitation,temperature1,precipitation1,temperature2,precipitation2,temperature3,precipitation3,temperature4,precipitation4,land_cover,country,season,temp_prec,temp_prec1,temp_prec2,temp_prec3,temp_prec4
0,13.50,48.70,14.70,105.20,15.90,20.00,18.80,31.90,23.10,7.80,11,Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب,-1.00,0.27,0.14,0.76,0.57,2.62
1,14.20,8.90,13.50,48.70,14.70,105.20,15.90,20.00,18.80,31.90,11,Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب,-0.87,1.43,0.27,0.14,0.76,0.57
2,17.20,25.30,14.20,8.90,13.50,48.70,14.70,105.20,15.90,20.00,11,Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب,-0.50,0.65,1.43,0.27,0.14,0.76
3,18.00,1.80,17.20,25.30,14.20,8.90,13.50,48.70,14.70,105.20,11,Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب,-0.00,6.43,0.65,1.43,0.27,0.14
4,19.20,9.30,18.00,1.80,17.20,25.30,14.20,8.90,13.50,48.70,11,Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب,0.50,1.86,6.43,0.65,1.43,0.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2190493,22.40,40.50,22.40,22.80,18.80,29.50,8.90,19.20,0.00,52.00,11,Россия,1.00,0.54,0.94,0.62,0.44,0.00
2190494,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50,8.90,19.20,11,Россия,0.87,0.63,0.54,0.94,0.62,0.44
2190495,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80,18.80,29.50,11,Россия,0.50,0.49,0.63,0.54,0.94,0.62
2190496,9.50,24.20,12.90,25.50,19.60,30.20,22.40,40.50,22.40,22.80,11,Россия,0.00,0.38,0.49,0.63,0.54,0.94


In [136]:
# Show the dtype of every feature column in X.
X.dtypes

temperature       float64
precipitation     float64
temperature1      float64
precipitation1    float64
temperature2      float64
precipitation2    float64
temperature3      float64
precipitation3    float64
temperature4      float64
precipitation4    float64
land_cover         object
country            object
season            float64
temp_prec         float64
temp_prec1        float64
temp_prec2        float64
temp_prec3        float64
temp_prec4        float64
dtype: object

In [137]:
# Split the data, then balance the training split only

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

# Split data into train and test sets (no separate validation set is created here)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42  # TEST = 30%
)

# Class counts before resampling
print(y_train.value_counts())

# SMOTENC needs the positions of the categorical columns. Derive them from the
# object-dtype columns instead of hard-coding [10, 11], so the indices stay
# correct if features are added, removed or reordered.
categorical_idx = [
    X_train.columns.get_loc(col)
    for col in X_train.select_dtypes(include=['object']).columns
]

# Oversample the minority class on the training split only; X_test / y_test
# are left untouched so evaluation reflects the original class balance.
smote_nc = SMOTENC(categorical_features=categorical_idx, random_state=0)
X_train_sm, y_train_sm = smote_nc.fit_resample(X_train, y_train)

# Class counts after resampling
print(y_train_sm.value_counts())


0    1414258
1     119090
Name: burned_area, dtype: int64


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


0    1414258
1    1414258
Name: burned_area, dtype: int64


In [138]:
# Build pipeline: preprocessing (impute + scale numerics, one-hot categoricals) + XGBoost

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Preprocessor
# Numeric columns: fill missing values (SimpleImputer default strategy), then standardise.
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
# Categorical columns: one-hot encode. handle_unknown='ignore' encodes a category
# that was not seen during fit as an all-zero row instead of raising at
# transform time, so the fitted pipeline can still score data that contains a
# new category value.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64','int64'])),
    (cat_transformer, make_column_selector(dtype_include=['object'])),
    remainder='passthrough',            # columns matched by neither selector pass through unchanged
    verbose_feature_names_out=False     # keep output feature names free of transformer prefixes
)

# Add estimator
xgb_clas = XGBClassifier(max_depth=10, n_estimators=100, learning_rate=0.1)
pipeline = Pipeline(steps=[('preprocess', preproc),('model', xgb_clas)])
pipeline

In [139]:
# Fit the pipeline and score it on the held-out test split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Train on the resampled training data
pipeline.fit(X_train_sm, y_train_sm)

# Predict on the test split, which was not resampled
y_pred = pipeline.predict(X_test)

# Compute the four scores in one pass, in the order they are printed
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print(accuracy, precision, recall, f1)

# Optional SHAP explanation of the fitted model (disabled):
#import shap

#explainer = shap.TreeExplainer(pipeline[1])
#shap_values = explainer.shap_values(preproc.transform(X_test))
#print(shap_values.shape)

#X_test_trans = preproc.transform(X_test).toarray()
#shap.summary_plot(shap_values, X_test_trans)

0.8617865023206269 0.3385345666991237 0.8175939171843462 0.47881149257756694


In [140]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to the working directory.
# joblib files are pickle-based: only load them back from trusted sources.
joblib.dump(value=pipeline, filename='fitted_model_2.joblib')

['fitted_model_2.joblib']