In [29]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col , when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


spark = SparkSession.builder.appName("PredictionSession").config("spark.local.dir", "C:/temp").config("spark.executor.memory", "4g").config("spark.driver.memory", "4g").getOrCreate()

dataset_path = "../data/Transformed_GlobalFireBurnedArea_pandas.csv"
df = spark.read.csv(dataset_path, header=True, inferSchema=True)

# Afficher les 5 premières lignes des données initiales
print("===== Données Initiales =====")
df.show(5)

===== Données Initiales =====
+--------+-----------+---------+-----------+-------------+-----------+--------------------+----------+-------------------+------+-------------+
|      ID|Initialdate|Finaldate|    Area_ha|      Area_m2|   Area_Km2|         CountryName| Continent|             Region|Season|Duration_days|
+--------+-----------+---------+-----------+-------------+-----------+--------------------+----------+-------------------+------+-------------+
|25078590|       NULL|     NULL|50232.10763|5.023210763E8|502.3210763|               Ghana|    Africa|     Western Africa|Autumn|         NULL|
|25079092|       NULL|     NULL|82380.29538|8.238029538E8|823.8029538|             Nigeria|    Africa|     Western Africa|Autumn|         NULL|
|25079113|       NULL|     NULL|36851.12748|3.685112748E8|368.5112748|             Nigeria|    Africa|     Western Africa|Autumn|         NULL|
|25083241|       NULL|     NULL|43303.63519|4.330363519E8|433.0363519|             Nigeria|    Africa|    

In [30]:
df = df.withColumn(
    "severity",
    when(col("Area_Km2") < 50, "Low")
    .when((col("Area_Km2") >= 50) & (col("Area_Km2") < 1000), "Medium")
    .otherwise("High")
)

# Compter le nombre d'incendies par catégorie
df.groupBy("severity").count().show()


+--------+------+
|severity| count|
+--------+------+
|    High|   121|
|     Low|312489|
|  Medium|  6668|
+--------+------+



In [31]:
indexer = StringIndexer(
    inputCols=["severity", "Season", "CountryName", "Region", "Continent"],
    outputCols=["severity_index", "Season_index", "CountryName_index", "Region_index", "Continent_index"]
)
df = indexer.fit(df).transform(df)
print("===== Après encodage des colonnes catégoriques =====")
df.select("severity", "severity_index", "Season", "Season_index").show(5)

===== Après encodage des colonnes catégoriques =====
+--------+--------------+------+------------+
|severity|severity_index|Season|Season_index|
+--------+--------------+------+------------+
|  Medium|           1.0|Autumn|         0.0|
|  Medium|           1.0|Autumn|         0.0|
|  Medium|           1.0|Autumn|         0.0|
|  Medium|           1.0|Autumn|         0.0|
|  Medium|           1.0|Autumn|         0.0|
+--------+--------------+------+------------+
only showing top 5 rows



In [32]:
df.groupBy("Season").count().show()

+------+------+
|Season| count|
+------+------+
|Autumn|319278|
+------+------+



In [33]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, when

# Vérifier et gérer les valeurs nulles dans les colonnes
df = df.withColumn("Season_index", when(col("Season_index").isNull(), 0).otherwise(col("Season_index")))
df = df.withColumn("Duration_days", when(col("Duration_days").isNull(), 0).otherwise(col("Duration_days").cast("double")))
df = df.withColumn("CountryName_index", when(col("CountryName_index").isNull(), 0).otherwise(col("CountryName_index")))
df = df.withColumn("Region_index", when(col("Region_index").isNull(), 0).otherwise(col("Region_index")))
df = df.withColumn("Continent_index", when(col("Continent_index").isNull(), 0).otherwise(col("Continent_index")))

# Vérifier le schéma après gestion des valeurs nulles
print("Schéma après gestion des valeurs nulles :")
df.printSchema()

# Définir les colonnes d'entrée pour VectorAssembler
feature_cols = ["Season_index", "Duration_days", "CountryName_index", "Region_index", "Continent_index"]

# Initialiser VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Appliquer la transformation
df = assembler.transform(df)

# Afficher les résultats
print("===== Après assemblage des caractéristiques =====")
df.select("features", "severity_index", "Area_Km2").show(5, truncate=False)


Schéma après gestion des valeurs nulles :
root
 |-- ID: integer (nullable = true)
 |-- Initialdate: string (nullable = true)
 |-- Finaldate: string (nullable = true)
 |-- Area_ha: double (nullable = true)
 |-- Area_m2: double (nullable = true)
 |-- Area_Km2: double (nullable = true)
 |-- CountryName: string (nullable = true)
 |-- Continent: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Duration_days: double (nullable = true)
 |-- severity: string (nullable = false)
 |-- severity_index: double (nullable = false)
 |-- Season_index: double (nullable = false)
 |-- CountryName_index: double (nullable = false)
 |-- Region_index: double (nullable = false)
 |-- Continent_index: double (nullable = false)

===== Après assemblage des caractéristiques =====
+--------------------+--------------+-----------+
|features            |severity_index|Area_Km2   |
+--------------------+--------------+-----------+
|(5,[2,3],[17.0,2.0])|1.0         

In [34]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)
print("===== Données d'entraînement =====")
train_data.show(5, truncate=False)
print("===== Données de test =====")
test_data.show(5, truncate=False)



===== Données d'entraînement =====
+--------+-----------+---------+-----------+-------------+-----------+---------------------------+------------+---------------------+------+-------------+--------+--------------+------------+-----------------+------------+---------------+-----------------------+
|ID      |Initialdate|Finaldate|Area_ha    |Area_m2      |Area_Km2   |CountryName                |Continent   |Region               |Season|Duration_days|severity|severity_index|Season_index|CountryName_index|Region_index|Continent_index|features               |
+--------+-----------+---------+-----------+-------------+-----------+---------------------------+------------+---------------------+------+-------------+--------+--------------+------------+-----------------+------------+---------------+-----------------------+
|25036465|NULL       |NULL     |214.8744509|2148744.509  |2.148744509|   United States of America|    Americas|     Northern America|Autumn|0.0          |Low     |0.0          

In [35]:


# Entraîner le modèle de classification
print("===== Objectif 1 : Prédiction de la gravité =====")
rf_classifier = RandomForestClassifier(labelCol="severity_index", featuresCol="features",maxBins=150)
rf_model = rf_classifier.fit(train_data)

# Faire des prédictions
predictions_classifier = rf_model.transform(test_data)
print("===== Prédictions pour la classification =====")
predictions_classifier.select("severity", "severity_index", "prediction", "probability").show(5)

# Évaluer les performances
classification_evaluator = MulticlassClassificationEvaluator(
    labelCol="severity_index", predictionCol="prediction", metricName="accuracy"
)
accuracy = classification_evaluator.evaluate(predictions_classifier)
print(f"Précision du modèle de classification : {accuracy:.2f}")

===== Objectif 1 : Prédiction de la gravité =====
===== Prédictions pour la classification =====
+--------+--------------+----------+--------------------+
|severity|severity_index|prediction|         probability|
+--------+--------------+----------+--------------------+
|     Low|           0.0|       0.0|[0.97800894014079...|
|     Low|           0.0|       0.0|[0.97800894014079...|
|     Low|           0.0|       0.0|[0.97800894014079...|
|     Low|           0.0|       0.0|[0.97800894014079...|
|     Low|           0.0|       0.0|[0.97800894014079...|
+--------+--------------+----------+--------------------+
only showing top 5 rows

Précision du modèle de classification : 0.98


In [36]:
test_data.groupBy("severity_index").count().show()


+--------------+-----+
|severity_index|count|
+--------------+-----+
|           0.0|62239|
|           1.0| 1314|
|           2.0|   24|
+--------------+-----+



In [37]:
# ---------------------
# 1. Entraîner le modèle de régression
# ---------------------
print("===== Objectif 2 : Prédiction de la taille =====")

# Modèle de RandomForestRegressor avec paramètres optimisés
rf_regressor = RandomForestRegressor(
    labelCol="Area_Km2",    # Colonne cible
    featuresCol="features", # Colonne des caractéristiques
    maxBins=150,            # Nombre maximum de bins (important pour les variables catégoriques)
    numTrees=100,           # Nombre d'arbres
    maxDepth=10             # Profondeur maximale des arbres
)

# Entraîner le modèle sur les données d'entraînement
rf_model_regressor = rf_regressor.fit(train_data)

# ---------------------
# 2. Faire des prédictions
# ---------------------
predictions_regressor = rf_model_regressor.transform(test_data)

# Afficher un échantillon des prédictions
print("===== Prédictions pour la régression =====")
predictions_regressor.select("Area_Km2", "prediction").show(10, truncate=False)

# ---------------------
# 3. Évaluer les performances
# ---------------------
# Initialiser l'évaluateur pour RMSE et MAE
regression_evaluator_rmse = RegressionEvaluator(
    labelCol="Area_Km2", predictionCol="prediction", metricName="rmse"
)
regression_evaluator_mae = RegressionEvaluator(
    labelCol="Area_Km2", predictionCol="prediction", metricName="mae"
)

# Calculer RMSE et MAE
rmse = regression_evaluator_rmse.evaluate(predictions_regressor)
mae = regression_evaluator_mae.evaluate(predictions_regressor)

# Afficher les métriques d'évaluation
print("===== Évaluation des performances =====")
print(f"RMSE (Root Mean Squared Error) : {rmse:.2f}")
print(f"MAE (Mean Absolute Error) : {mae:.2f}")

# ---------------------
# 4. Importance des caractéristiques
# ---------------------
print("===== Importance des caractéristiques =====")
feature_importances = rf_model_regressor.featureImportances
for col, importance in zip(feature_cols, feature_importances):
    print(f"{col}: {importance:.2f}")

===== Objectif 2 : Prédiction de la taille =====
===== Prédictions pour la régression =====
+-----------+-----------------+
|Area_Km2   |prediction       |
+-----------+-----------------+
|1.073939977|8.248445972048096|
|3.221117395|8.248445972048096|
|1.931822245|8.248445972048096|
|1.07226946 |7.042890727341005|
|1.28670946 |7.042890727341005|
|2.78783539 |7.042890727341005|
|7.290904512|7.042890727341005|
|3.429793055|7.042890727341005|
|3.643775961|7.042890727341005|
|2.572134754|7.042890727341005|
+-----------+-----------------+
only showing top 10 rows

===== Évaluation des performances =====
RMSE (Root Mean Squared Error) : 83.49
MAE (Mean Absolute Error) : 9.90
===== Importance des caractéristiques =====
Season_index: 0.00
Duration_days: 0.00
CountryName_index: 0.49
Region_index: 0.33
Continent_index: 0.18


In [38]:
predictions_regressor.select("Area_Km2", "prediction").show(20)


+-----------+-----------------+
|   Area_Km2|       prediction|
+-----------+-----------------+
|1.073939977|8.248445972048096|
|3.221117395|8.248445972048096|
|1.931822245|8.248445972048096|
| 1.07226946|7.042890727341005|
| 1.28670946|7.042890727341005|
| 2.78783539|7.042890727341005|
|7.290904512|7.042890727341005|
|3.429793055|7.042890727341005|
|3.643775961|7.042890727341005|
|2.572134754|7.042890727341005|
|2.357561481|7.042890727341005|
|1.500145118|7.042890727341005|
|1.928687191|7.042890727341005|
|3.213587668|7.042890727341005|
|1.284796405|7.042890727341005|
| 2.35506137|7.042890727341005|
|1.283970857|7.042890727341005|
|1.069900176|7.042890727341005|
|3.867095267|8.248445972048096|
|1.502286694|8.248445972048096|
+-----------+-----------------+
only showing top 20 rows

