# Analyse de données et modèle de prédiction

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

### Initialize Spark session

In [2]:
spark = SparkSession.builder \
    .appName("Vehicle Category Prediction") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/09 07:39:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
def print_df(df):
    print("Size", df.count())
    df.printSchema()
    df.show(5)

def load_data_in_csv_file(file_path):
    df = spark.read.options(delimiter=",", header=True, inferSchema=True).csv(file_path)

    # Display schema and first five rows of the DataFrame
    print("Schema and first rows in", file_path)
    print_df(df)
    
    return df

### Load the dataset

In [4]:
# File path in Hadoop FS
hdfs_path = "hdfs:///tpa_groupe_14/data/fusion/fusion.csv"

# Load CSV data into a DataFrame
vehicles_df = load_data_in_csv_file(hdfs_path)

                                                                                

Schema and first rows in hdfs:///tpa_groupe_14/data/fusion/fusion.csv
Size 100306
root
 |-- client_view.id: integer (nullable = true)
 |-- client_view.age: integer (nullable = true)
 |-- client_view.sexe: string (nullable = true)
 |-- client_view.taux: integer (nullable = true)
 |-- client_view.situationfamiliale: string (nullable = true)
 |-- client_view.nbenfantsacharge: integer (nullable = true)
 |-- client_view.deuxiemevoiture: integer (nullable = true)
 |-- immatriculation_co2_view.categorie: string (nullable = true)

+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+----------------------------------+
|client_view.id|client_view.age|client_view.sexe|client_view.taux|client_view.situationfamiliale|client_view.nbenfantsacharge|client_view.deuxiemevoiture|immatriculation_co2_view.categorie|
+--------------+---------------+----------------+----------------+-------------------------

### Data formatting

In [5]:
column_names = vehicles_df.schema.names

for column_name in column_names:
    vehicles_df = vehicles_df.withColumnRenamed(column_name, column_name.replace(".", "_"))

print_df(vehicles_df)

Size 100306
root
 |-- client_view_id: integer (nullable = true)
 |-- client_view_age: integer (nullable = true)
 |-- client_view_sexe: string (nullable = true)
 |-- client_view_taux: integer (nullable = true)
 |-- client_view_situationfamiliale: string (nullable = true)
 |-- client_view_nbenfantsacharge: integer (nullable = true)
 |-- client_view_deuxiemevoiture: integer (nullable = true)
 |-- immatriculation_co2_view_categorie: string (nullable = true)

+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+----------------------------------+
|client_view_id|client_view_age|client_view_sexe|client_view_taux|client_view_situationfamiliale|client_view_nbenfantsacharge|client_view_deuxiemevoiture|immatriculation_co2_view_categorie|
+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+-------

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer

### Optional

In [7]:
# Optional: Sub-sample the DataFrame
vehicles_df = vehicles_df.sample(withReplacement=False, fraction=0.3, seed=42)

In [8]:
# Prepare features
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(vehicles_df)
    for column in ["client_view_sexe", "client_view_situationfamiliale", "immatriculation_co2_view_categorie"]
]

assembler = VectorAssembler(
    inputCols=[
        "client_view_age", "client_view_taux", "client_view_nbenfantsacharge", "client_view_deuxiemevoiture",
        "client_view_sexe_index", "client_view_situationfamiliale_index"
    ],
    outputCol="features"
)

# Split the DataFrame into training and test sets (80% training, 20% test)
train_df, test_df = vehicles_df.randomSplit([0.8, 0.2], seed=42)

pipeline = Pipeline(stages=indexers + [assembler])
pipeline_model = pipeline.fit(train_df)

train_df = pipeline_model.transform(train_df)
test_df = pipeline_model.transform(test_df)

                                                                                

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

# La précision & Matrice de confusion

In [27]:
# Evaluation function
def evaluate_classifier(classifier, param_grid, train_df, test_df):
    evaluator = MulticlassClassificationEvaluator(labelCol="immatriculation_co2_view_categorie_index", predictionCol="prediction", metricName="accuracy")
    
    # Create the cross-validator
    cross_validator = CrossValidator(
        estimator=classifier,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3, 
        seed=42
    )
    
    # Train the model with the best hyperparameters
    cv_model = cross_validator.fit(train_df)
    
    # Make predictions on the test data
    predictions = cv_model.transform(test_df)
    
    # Evaluate the model
    accuracy = evaluator.evaluate(predictions)
    
    # Calcul de la matrice de confusion
    predictions_and_labels = predictions.select("prediction", "immatriculation_co2_view_categorie_index")
    prediction_rdd = predictions_and_labels.rdd.map(tuple)
    
    metrics = MulticlassMetrics(prediction_rdd)
    confusion_matrix = metrics.confusionMatrix().toArray()
    
    return cv_model, accuracy, confusion_matrix

In [11]:
from pyspark.ml.classification import (
    DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, 
    LinearSVC, LogisticRegression, MultilayerPerceptronClassifier, NaiveBayes, OneVsRest
)

1. Random Forest

In [28]:
rf = RandomForestClassifier(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

# Define the hyperparameter grid
rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

rf_model, rf_accuracy, rf_confusion_matrix = evaluate_classifier(
    classifier=rf,
    param_grid=rf_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("RandomForestClassifier Accuracy = {:.2f}".format(rf_accuracy))
print("RandomForestClassifier Confusion Matrix:\n", rf_confusion_matrix)

24/06/09 08:05:40 WARN DAGScheduler: Broadcasting large task binary with size 1142.4 KiB
24/06/09 08:05:40 WARN DAGScheduler: Broadcasting large task binary with size 1696.6 KiB
24/06/09 08:05:41 WARN DAGScheduler: Broadcasting large task binary with size 1061.5 KiB
24/06/09 08:05:48 WARN DAGScheduler: Broadcasting large task binary with size 1156.0 KiB
24/06/09 08:05:48 WARN DAGScheduler: Broadcasting large task binary with size 1721.9 KiB
24/06/09 08:05:49 WARN DAGScheduler: Broadcasting large task binary with size 1082.8 KiB
24/06/09 08:05:55 WARN DAGScheduler: Broadcasting large task binary with size 1118.4 KiB
24/06/09 08:05:56 WARN DAGScheduler: Broadcasting large task binary with size 1664.2 KiB
24/06/09 08:05:57 WARN DAGScheduler: Broadcasting large task binary with size 1036.7 KiB
24/06/09 08:06:00 WARN DAGScheduler: Broadcasting large task binary with size 1179.6 KiB
24/06/09 08:06:01 WARN DAGScheduler: Broadcasting large task binary with size 1781.8 KiB
24/06/09 08:06:01 WAR

RandomForestClassifier Accuracy = 0.71
RandomForestClassifier Confusion Matrix:
 [[2937.  314.    6.   31.]
 [ 520. 1020.    0.    4.]
 [ 466.    3.  296.    0.]
 [ 353.   22.    0.   39.]]


2. Decision Tree

In [29]:
dt = DecisionTreeClassifier(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

dt_param_grid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [5, 10])\
    .addGrid(dt.impurity, ["gini", "entropy"])\
    .build()

dt_model, dt_accuracy, dt_confusion_matrix = evaluate_classifier(
    classifier=dt,
    param_grid=dt_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("DecisionTreeClassifier Accuracy = {:.2f}".format(dt_accuracy))
print("DecisionTreeClassifier Confusion Matrix:\n", dt_confusion_matrix)

DecisionTreeClassifier Accuracy = 0.72
DecisionTreeClassifier Confusion Matrix:
 [[2940.  306.    5.   37.]
 [ 515. 1029.    0.    0.]
 [ 465.    3.  297.    0.]
 [ 346.   22.    0.   46.]]


3. Gradient Boosted Trees

In [30]:
gbt = GBTClassifier(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")
ovr = OneVsRest(classifier=gbt, labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

gbt_param_grid = ParamGridBuilder()\
    .addGrid(gbt.maxIter, [10, 20])\
    .addGrid(gbt.maxDepth, [5, 10])\
    .build()

gbt_model, gbt_accuracy, gbt_confusion_matrix = evaluate_classifier(
    classifier=ovr,
    param_grid=gbt_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("GBTClassifier Accuracy = {:.2f}".format(gbt_accuracy))
print("GBTClassifier Confusion Matrix:\n", gbt_confusion_matrix)

24/06/09 08:07:15 WARN DAGScheduler: Broadcasting large task binary with size 1000.3 KiB
24/06/09 08:07:33 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
24/06/09 08:08:08 WARN DAGScheduler: Broadcasting large task binary with size 1000.3 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1001.8 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1006.0 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1014.2 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1027.9 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1050.5 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1089.0 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1087.7 KiB
24/06/09 08:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1088.2 KiB
24/06/09 08:08:09 WARN D

GBTClassifier Accuracy = 0.72
GBTClassifier Confusion Matrix:
 [[2932.  326.    4.   26.]
 [ 498. 1046.    0.    0.]
 [ 465.    3.  297.    0.]
 [ 357.   22.    0.   35.]]


                                                                                

4. Support Vector Machine

In [31]:
svm = LinearSVC(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")
ovr = OneVsRest(classifier=svm, labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

svm_param_grid = ParamGridBuilder()\
    .addGrid(svm.maxIter, [10, 20])\
    .addGrid(svm.regParam, [0.01, 0.1])\
    .build()

svm_model, svm_accuracy, svm_confusion_matrix = evaluate_classifier(
    classifier=ovr,
    param_grid=svm_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("LinearSVC Accuracy = {:.2f}".format(svm_accuracy))
print("LinearSVC Confusion Matrix:\n", svm_confusion_matrix)

                                                                                

LinearSVC Accuracy = 0.62
LinearSVC Confusion Matrix:
 [[2.888e+03 4.000e+02 0.000e+00 0.000e+00]
 [7.570e+02 7.860e+02 1.000e+00 0.000e+00]
 [6.070e+02 9.300e+01 6.500e+01 0.000e+00]
 [3.930e+02 2.100e+01 0.000e+00 0.000e+00]]


                                                                                

5. Logistic Regression

In [32]:
lr = LogisticRegression(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

lr_param_grid = ParamGridBuilder()\
    .addGrid(lr.maxIter, [10, 20])\
    .addGrid(lr.regParam, [0.01, 0.1])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

lr_model, lr_accuracy, lr_confusion_matrix = evaluate_classifier(
    classifier=lr,
    param_grid=lr_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("LogisticRegression Accuracy = {:.2f}".format(lr_accuracy))
print("LogisticRegression Confusion Matrix:\n", lr_confusion_matrix)

LogisticRegression Accuracy = 0.65
LogisticRegression Confusion Matrix:
 [[2.877e+03 3.330e+02 6.800e+01 1.000e+01]
 [7.870e+02 7.200e+02 3.600e+01 1.000e+00]
 [4.380e+02 4.000e+01 2.870e+02 0.000e+00]
 [3.660e+02 2.600e+01 0.000e+00 2.200e+01]]


6. Neural Networks (Multilayer Perceptron Classifier)

In [33]:
num_classes = train_df.select("immatriculation_co2_view_categorie_index").distinct().count()

mlp = MultilayerPerceptronClassifier(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features", layers=[6, 5, 4, num_classes])

mlp_param_grid = ParamGridBuilder()\
    .addGrid(mlp.maxIter, [50, 100])\
    .build()

mlp_model, mlp_accuracy, mlp_confusion_matrix = evaluate_classifier(
    classifier=mlp,
    param_grid=mlp_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("MultilayerPerceptronClassifier Accuracy = {:.2f}".format(mlp_accuracy))
print("MultilayerPerceptronClassifier Confusion Matrix:\n", lr_confusion_matrix)

MultilayerPerceptronClassifier Accuracy = 0.55
MultilayerPerceptronClassifier Confusion Matrix:
 [[2.877e+03 3.330e+02 6.800e+01 1.000e+01]
 [7.870e+02 7.200e+02 3.600e+01 1.000e+00]
 [4.380e+02 4.000e+01 2.870e+02 0.000e+00]
 [3.660e+02 2.600e+01 0.000e+00 2.200e+01]]


7. Naive Bayes

In [34]:
nb = NaiveBayes(labelCol="immatriculation_co2_view_categorie_index", featuresCol="features")

nb_param_grid = ParamGridBuilder()\
    .addGrid(nb.smoothing, [0.5, 1.0, 1.5])\
    .build()

nb_model, nb_accuracy, nb_confusion_matrix = evaluate_classifier(
    classifier=nb,
    param_grid=nb_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("NaiveBayes Accuracy = {:.2f}".format(nb_accuracy))
print("NaiveBayes Confusion Matrix:\n", nb_confusion_matrix)

NaiveBayes Accuracy = 0.45
NaiveBayes Confusion Matrix:
 [[1135.  912.  986.  255.]
 [ 390.  970.   84.  100.]
 [ 209.  103.  453.    0.]
 [ 151.  101.   14.  148.]]


# Prediction

- Charger Marketing

In [None]:
# Chemin vers le fichier Marketing dans Hadoop FS
marketing_hdfs_path = "hdfs:///tpa_groupe_14/data/marketing/marketing.csv"

# Charger les données Marketing dans un DataFrame
marketing_df = spark.read.csv(marketing_hdfs_path, header=True, inferSchema=True)


Prétraiter les Données Marketing

In [None]:
# Renommer les colonnes pour éviter les conflits
for column_name in marketing_df.columns:
    marketing_df = marketing_df.withColumnRenamed(column_name, column_name.replace(".", "_"))

# Utiliser le pipeline formé précédemment pour transformer les données
marketing_df = pipeline_model.transform(marketing_df)


Appliquer le Modèle de Prédiction

In [None]:
# Utiliser le modèle formé pour faire des prédictions sur les données Marketing
predictions = rf_model.transform(marketing_df)

# Sélectionner les colonnes pertinentes pour l'affichage
results = predictions.select("client_view_id", "prediction")


Exporter les Résultats

In [None]:
# Chemin de sortie pour les résultats
results_hdfs_path = "hdfs:///tpa_groupe_14/results/marketing_predictions.csv"

# Sauvegarder les résultats dans HDFS
results.write.csv(results_hdfs_path, header=True)
