In [1]:
# Chargement des librairies
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("Mushroom Classification with Metrics")
    .getOrCreate()
)

# Load the CSV: first row supplies the column names, column types are inferred by Spark
file_path = "mushrooms.csv"
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

In [3]:
# Sanity check: print the first 5 rows of the loaded DataFrame
data.show(5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [4]:
# Encode the categorical columns with StringIndexer and assemble the feature vector.
#
# This cell reassigns `data`, so first strip any columns left behind by a previous
# execution of it. Without this, re-running the cell fails because StringIndexer and
# VectorAssembler refuse to write to an output column that already exists.
INDEX_SUFFIX = "_indexed"
existing_columns = set(data.columns)
stale_columns = [
    name
    for name in data.columns
    if name in ("label", "features")
    # Only treat "<col>_indexed" as generated when its source column "<col>" is present.
    or (name.endswith(INDEX_SUFFIX) and name[: -len(INDEX_SUFFIX)] in existing_columns)
]
if stale_columns:
    data = data.drop(*stale_columns)

# Target column: "class" -> numeric "label"
label_indexer = StringIndexer(inputCol="class", outputCol="label")
data = label_indexer.fit(data).transform(data)

# Feature columns: every column except the raw target and its indexed form.
# The list is computed once, before the loop starts appending "*_indexed" columns.
categorical_columns = [name for name in data.columns if name not in ("class", "label")]

# NOTE(review): each indexer is fit on the whole of `data`; the train/test split
# happens in a later cell, so the category -> index mapping is learned from all rows.
indexed_columns = []
for column in categorical_columns:
    indexer = StringIndexer(inputCol=column, outputCol=column + INDEX_SUFFIX)
    data = indexer.fit(data).transform(data)
    indexed_columns.append(column + INDEX_SUFFIX)

# Combine all indexed columns into the single "features" vector column
assembler = VectorAssembler(inputCols=indexed_columns, outputCol="features")
data = assembler.transform(data)

In [5]:
# Split the rows 80/20 into training and test sets, using a fixed seed
splits = data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = splits

# Decision-tree classifier configured to read the "features" and "label" columns
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

In [6]:
# Fit the decision-tree classifier on the training split
model = dt.fit(train_data)

In [7]:
# Apply the fitted model to the held-out test split
predictions = model.transform(test_data)

# Score the predictions against the true labels using accuracy
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy du modèle : {accuracy * 100:.2f}%")

Accuracy du modèle : 99.48%


In [8]:
# Additional metrics: one evaluator per metric, all reading the same label/prediction columns
precision_evaluator, recall_evaluator, f1_evaluator = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("weightedPrecision", "weightedRecall", "f1")
)

# Evaluate each metric on the test-set predictions
precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
f1_score = f1_evaluator.evaluate(predictions)

# Report each metric as a percentage
print(f"Precision : {precision* 100:.2f}%")
print(f"Recall : {recall* 100:.2f}%")
print(f"F1 Score : {f1_score* 100:.2f}%")

Precision : 99.49%
Recall : 99.48%
F1 Score : 99.48%


In [9]:
# Confusion counts: number of test rows per (actual label, predicted label) pair.
# Pairs that never occur produce no row in this long-format table.
pair_counts = predictions.groupBy("label", "prediction").count()
confusion_df = pair_counts.orderBy("label", "prediction")
confusion_df.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|  788|
|  1.0|       0.0|    8|
|  1.0|       1.0|  755|
+-----+----------+-----+



In [10]:
# Shut down the Spark session now that the analysis is finished
spark.stop()