## Entraînement

In [12]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline



## Indexation

In [13]:
# Display the DataFrame's column names before the categorical columns are indexed
df.columns

['Airlines',
 'Total_Stops',
 'Price',
 'Duration',
 'Destination',
 'Date',
 'Airlines_list',
 'Pays_Destination',
 'Depart']

In [14]:

# Convert the categorical variables to numeric indices: fit one StringIndexer
# per column, each of them on the same, not-yet-transformed DataFrame.
indexers = list(map(
    lambda name: StringIndexer(inputCol=name, outputCol=name + "_index").fit(df),
    ["Airlines", "Date", "Destination"],
))

# Apply every fitted indexer in turn; each one appends its "<column>_index" column
for indexer in indexers:
    df = indexer.transform(df)


In [15]:
# Display the column names again to check that the "*_index" columns were added
df.columns

['Airlines',
 'Total_Stops',
 'Price',
 'Duration',
 'Destination',
 'Date',
 'Airlines_list',
 'Pays_Destination',
 'Depart',
 'Airlines_index',
 'Date_index',
 'Destination_index']

In [16]:



# Gather the indexed categorical columns and the numeric columns
# into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "Airlines_index",
        "Date_index",
        "Destination_index",
        "Total_Stops",
        "Duration",
    ],
)
df = assembler.transform(df)




In [17]:
# Spark DataFrames are immutable: na.drop() returns a NEW DataFrame without the
# rows that contain nulls. The result has to be assigned back, otherwise the
# call has no effect and the rows with missing values stay in `df`.
df = df.na.drop()
df.columns



['Airlines',
 'Total_Stops',
 'Price',
 'Duration',
 'Destination',
 'Date',
 'Airlines_list',
 'Pays_Destination',
 'Depart',
 'Airlines_index',
 'Date_index',
 'Destination_index',
 'features']

In [18]:
# Keep only what training needs: the feature vector and the price, exposed as "label"
df = df.select("features", df["Price"].alias("label"))

## Modèles

In [None]:



# 1. Train/test split (80% / 20%), seeded so the split is reproducible
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# 2. Random forest definition
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=100,   # number of trees
    maxDepth=10,    # maximum depth of each tree
    maxBins=210,    # per the original author's note: number of unique values
    seed=42,        # reproducibility
)

# 3. Fit on the training split, then 4. predict on the held-out split
model = rf.fit(train_data)
predictions = model.transform(test_data)

# 5. Evaluation: one evaluator per metric, both comparing "label" with "prediction"
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "r2")
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")



24/12/01 15:00:50 WARN DAGScheduler: Broadcasting large task binary with size 1052.6 KiB
24/12/01 15:00:50 WARN DAGScheduler: Broadcasting large task binary with size 1744.1 KiB
24/12/01 15:00:51 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
24/12/01 15:00:54 WARN DAGScheduler: Broadcasting large task binary with size 4.8 MiB
24/12/01 15:00:56 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
24/12/01 15:00:57 WARN DAGScheduler: Broadcasting large task binary with size 1293.9 KiB
24/12/01 15:01:00 WARN DAGScheduler: Broadcasting large task binary with size 11.4 MiB
24/12/01 15:01:00 WARN DAGScheduler: Broadcasting large task binary with size 1718.8 KiB
                                                                                

RMSE: 81.8292703240851
R2: 0.8481208313507199


In [20]:

# Build a one-row DataFrame describing the new flight whose price will be predicted.
# The values in the tuple follow the order of the column names given in `schema`.
new_data = spark.createDataFrame(
    data=[
        ("Vueling, Iberia", 1, "Madrid", 5.3333335, "2025-06-29"),
    ],
    schema=["Airlines", "Total_Stops", "Destination", "Duration", "Date"],
)


In [21]:
# Transform the new rows with the same fitted StringIndexers and assembler that
# were used for training.
# The intermediate result is bound to its own name instead of overwriting
# `new_data`: the raw input stays available and this cell can be re-run
# (overwriting `new_data` with the features-only frame made a second run fail
# because the raw input columns were gone).
new_features = new_data
for indexer in indexers:
    new_features = indexer.transform(new_features)

new_features = assembler.transform(new_features).select("features")

# Predict the price for the new rows
predicted_price = model.transform(new_features)
predicted_price.select("prediction").show()

                                                                                

+------------------+
|        prediction|
+------------------+
|145.12609545696793|
+------------------+



In [23]:
# Save the trained model.
# write().overwrite() replaces a previously saved copy at this path; a plain
# model.save(path) fails when the path already exists, which breaks
# re-running the notebook.
model.write().overwrite().save("./model/rf_model")

24/12/01 15:04:18 WARN TaskSetManager: Stage 86 contains a task of very large size (1011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                