# Analyse de données et modèle de prédiction

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

### Initialize Spark session

In [2]:
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Vehicle Category Prediction")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def print_df(df):
    """Print a quick overview of a DataFrame.

    Writes, in this order: the row count (prefixed with "Size"), the schema,
    and the first five rows.

    Args:
        df: DataFrame-like object exposing ``count()``, ``printSchema()``
            and ``show(n)``.
    """
    row_total = df.count()
    print("Size", row_total)
    df.printSchema()
    df.show(5)

def load_data_in_csv_file(file_path):
    """Read a comma-delimited CSV file into a DataFrame and print a preview.

    The file is read through the notebook-level ``spark`` session with the
    first line used as header and column types inferred from the data.
    A banner naming the file is printed, followed by the ``print_df``
    overview (row count, schema, first five rows).

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The DataFrame read from ``file_path``.
    """
    csv_reader = (
        spark.read
        .option("delimiter", ",")
        .option("header", True)
        .option("inferSchema", True)
    )
    loaded_df = csv_reader.csv(file_path)

    # Show what was just loaded: row count, schema and the first five rows
    print("Schema and first rows in", file_path)
    print_df(loaded_df)

    return loaded_df

### Load the dataset

In [8]:
# Location of the input CSV on the Hadoop file system
hdfs_path = "hdfs:///tpa_groupe_14/data/fusion/fusion.csv"

# Read it into a DataFrame (the loader also prints the row count, schema and a preview)
vehicles_df = load_data_in_csv_file(file_path=hdfs_path)

Schema and first rows in hdfs:///tpa_groupe_14/data/fusion/fusion.csv
Size 100306
root
 |-- clients_ext.id: integer (nullable = true)
 |-- clients_ext.age: integer (nullable = true)
 |-- clients_ext.sexe: string (nullable = true)
 |-- clients_ext.taux: integer (nullable = true)
 |-- clients_ext.situationfamiliale: string (nullable = true)
 |-- clients_ext.nbenfantsacharge: integer (nullable = true)
 |-- clients_ext.deuxiemevoiture: boolean (nullable = true)
 |-- clients_ext.immatriculation: string (nullable = true)
 |-- immatriculation_ext.immatriculation: string (nullable = true)
 |-- immatriculation_ext.marque: string (nullable = true)
 |-- immatriculation_ext.nom: string (nullable = true)
 |-- immatriculation_ext.puissance: integer (nullable = true)
 |-- immatriculation_ext.longueur: string (nullable = true)
 |-- immatriculation_ext.nbplaces: integer (nullable = true)
 |-- immatriculation_ext.nbportes: integer (nullable = true)
 |-- immatriculation_ext.couleur: string (nullable = tr

### Data formatting

In [9]:
# Columns that are not used downstream: vehicle attributes, CO2 catalogue
# fields and the registration keys. What remains are the client attributes
# plus the vehicle category.
columns_to_drop = ["immatriculation_ext.immatriculation", "immatriculation_ext.marque", "immatriculation_ext.nom",
                   "immatriculation_ext.puissance", "immatriculation_ext.longueur", "immatriculation_ext.nbplaces",
                   "immatriculation_ext.nbportes", "immatriculation_ext.couleur", "immatriculation_ext.occasion",
                   "immatriculation_ext.prix", "immatriculation_ext.cluster", "catalogue_co2_ext.rejetco2",
                   "catalogue_co2_ext.marque", "clients_ext.immatriculation"]
vehicles_df = vehicles_df.drop(*columns_to_drop)

# Replace the dots in the remaining column names with underscores so the
# columns can be referenced with a plain col("...") below. All columns are
# renamed in one projection instead of one withColumnRenamed per column.
column_names = vehicles_df.schema.names
vehicles_df = vehicles_df.toDF(*[column_name.replace(".", "_") for column_name in column_names])

# Normalise the free-text marital status to a small set of codes; anything
# unrecognised (including null) becomes "*". The canonical codes are accepted
# as input too, so re-running this cell does not turn every value into "*".
situation_col = col("clients_ext_situationfamiliale")
vehicles_df = vehicles_df.withColumn(
    "clients_ext_situationfamiliale",
    when(situation_col.isin("Seul", "Seule", "Célibataire", "CELIBATAIRE"), "CELIBATAIRE")
    .when(situation_col.isin("En Couple", "EN_COUPLE"), "EN_COUPLE")
    .when(situation_col.isin("Marié(e)", "MARIE"), "MARIE")
    .when(situation_col.isin("Divorcée", "DIVORCE"), "DIVORCE")
    .otherwise("*"),
)

# Normalise the gender spellings to "M" / "F"; anything else becomes "*".
sexe_col = col("clients_ext_sexe")
vehicles_df = vehicles_df.withColumn(
    "clients_ext_sexe",
    when(sexe_col.isin("Homme", "Masculin", "M"), "M")
    .when(sexe_col.isin("Femme", "Féminin", "F"), "F")
    .otherwise("*"),
)

# Encode the second-car flag as an integer: 1 when true, 0 otherwise
# (false and null both end up as 0).
vehicles_df = vehicles_df.withColumn("clients_ext_deuxiemevoiture", when(col("clients_ext_deuxiemevoiture") == True, 1).otherwise(0))

# Replace negative values with zero in the numeric client attributes
for numeric_column in ("clients_ext_age", "clients_ext_taux", "clients_ext_nbenfantsacharge"):
    vehicles_df = vehicles_df.withColumn(
        numeric_column,
        when(col(numeric_column) < 0, 0).otherwise(col(numeric_column)),
    )

# Print the row count, schema and first rows to verify the result
print_df(vehicles_df)

Size 100306
root
 |-- clients_ext_id: integer (nullable = true)
 |-- clients_ext_age: integer (nullable = true)
 |-- clients_ext_sexe: string (nullable = false)
 |-- clients_ext_taux: integer (nullable = true)
 |-- clients_ext_situationfamiliale: string (nullable = false)
 |-- clients_ext_nbenfantsacharge: integer (nullable = true)
 |-- clients_ext_deuxiemevoiture: integer (nullable = false)
 |-- immatriculation_ext_categorie: string (nullable = true)

+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+-----------------------------+
|clients_ext_id|clients_ext_age|clients_ext_sexe|clients_ext_taux|clients_ext_situationfamiliale|clients_ext_nbenfantsacharge|clients_ext_deuxiemevoiture|immatriculation_ext_categorie|
+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+-------------------

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer

### Optional: sub-sample the dataset (skip this cell to train on all rows)

In [None]:
# Optional: keep a random 30% of the rows, drawn without replacement with a fixed seed
sample_fraction = 0.3
sample_seed = 42
vehicles_df = vehicles_df.sample(withReplacement=False, fraction=sample_fraction, seed=sample_seed)

In [11]:
# Prepare features
#
# The train/test split is done first and every fitted stage is learned from
# the training rows only: the string indexers are passed to the pipeline
# unfitted, so pipeline.fit(train_df) is what fits them.

# Split the DataFrame into training and test sets (80% training, 20% test)
train_df, test_df = vehicles_df.randomSplit([0.8, 0.2], seed=42)

categorical_feature_columns = ["clients_ext_sexe", "clients_ext_situationfamiliale"]
label_column = "immatriculation_ext_categorie"

# Feature indexers use handleInvalid="keep" so that a category present in the
# test split but absent from the training split gets its own index instead of
# making transform() fail.
indexer_stages = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_feature_columns
]
# The label indexer keeps the default behaviour (raise on unseen/null labels)
# so that unexpected target values surface as an error rather than as an
# extra class.
indexer_stages.append(StringIndexer(inputCol=label_column, outputCol=label_column + "_index"))

assembler = VectorAssembler(
    inputCols=[
        "clients_ext_age", "clients_ext_taux", "clients_ext_nbenfantsacharge", "clients_ext_deuxiemevoiture",
        "clients_ext_sexe_index", "clients_ext_situationfamiliale_index"
    ],
    outputCol="features"
)

pipeline = Pipeline(stages=indexer_stages + [assembler])
pipeline_model = pipeline.fit(train_df)

# Fitted indexer models, in the same order as before:
# sexe, situationfamiliale, categorie (label)
indexers = pipeline_model.stages[:len(indexer_stages)]

train_df = pipeline_model.transform(train_df)
test_df = pipeline_model.transform(test_df)

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [13]:
# Evaluation function
def evaluate_classifier(
    classifier,
    param_grid,
    train_df,
    test_df,
    label_col="immatriculation_ext_categorie_index",
    metric_name="accuracy",
    num_folds=3,
    seed=42,
):
    """Tune a classifier by cross-validation and score it on held-out data.

    A ``CrossValidator`` is fitted on ``train_df`` over ``param_grid``; the
    resulting model is then applied to ``test_df`` and its predictions are
    scored with a ``MulticlassClassificationEvaluator``. The same evaluator
    (and therefore the same metric) is used both for model selection and for
    the final score.

    Args:
        classifier: Estimator to tune. It must write its predictions to a
            column named "prediction".
        param_grid: Parameter maps to search, e.g. built with
            ``ParamGridBuilder``.
        train_df: DataFrame used for cross-validation and fitting.
        test_df: DataFrame used only for the final score.
        label_col: Name of the numeric label column.
        metric_name: Metric passed to the evaluator.
        num_folds: Number of cross-validation folds.
        seed: Seed passed to the cross-validator.

    Returns:
        A ``(cv_model, score)`` tuple: the fitted cross-validation model and
        the metric value obtained on ``test_df``.
    """
    evaluator = MulticlassClassificationEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName=metric_name,
    )

    # Create the cross-validator
    cross_validator = CrossValidator(
        estimator=classifier,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=num_folds,
        seed=seed,
    )

    # Search the grid on the training data
    cv_model = cross_validator.fit(train_df)

    # Make predictions on the test data
    predictions = cv_model.transform(test_df)

    # Score the predictions on rows that were never used during tuning
    score = evaluator.evaluate(predictions)

    return cv_model, score

In [14]:
from pyspark.ml.classification import (
    DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, 
    LinearSVC, LogisticRegression, MultilayerPerceptronClassifier, NaiveBayes, OneVsRest
)

1. Random Forest

In [15]:
# Random forest on the assembled feature vector. The seed is set explicitly
# so that the forest's random sampling is pinned; the seed used inside
# evaluate_classifier is passed to the cross-validator, not to the classifier.
rf = RandomForestClassifier(
    labelCol="immatriculation_ext_categorie_index",
    featuresCol="features",
    seed=42,
)

# Define the hyperparameter grid: 2 forest sizes x 2 depths = 4 candidates
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

rf_model, rf_accuracy = evaluate_classifier(
    classifier=rf,
    param_grid=rf_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("RandomForestClassifier Accuracy = {:.2f}".format(rf_accuracy))

24/05/29 14:03:36 WARN DAGScheduler: Broadcasting large task binary with size 1018.5 KiB
24/05/29 14:03:46 WARN DAGScheduler: Broadcasting large task binary with size 1198.7 KiB
24/05/29 14:03:48 WARN DAGScheduler: Broadcasting large task binary with size 1856.0 KiB
24/05/29 14:03:49 WARN DAGScheduler: Broadcasting large task binary with size 1063.4 KiB
24/05/29 14:03:59 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB
24/05/29 14:04:08 WARN DAGScheduler: Broadcasting large task binary with size 1236.5 KiB
24/05/29 14:04:09 WARN DAGScheduler: Broadcasting large task binary with size 1906.2 KiB
24/05/29 14:04:10 WARN DAGScheduler: Broadcasting large task binary with size 1095.7 KiB
24/05/29 14:04:19 WARN DAGScheduler: Broadcasting large task binary with size 1079.2 KiB
24/05/29 14:04:28 WARN DAGScheduler: Broadcasting large task binary with size 1301.0 KiB
24/05/29 14:04:29 WARN DAGScheduler: Broadcasting large task binary with size 2025.8 KiB
24/05/29 14:04:31 WAR

RandomForestClassifier Accuracy = 0.72


                                                                                

2. Decision Tree

In [16]:
# Single decision tree trained on the assembled feature vector
dt = DecisionTreeClassifier(featuresCol="features", labelCol="immatriculation_ext_categorie_index")

# Hyperparameter grid: 2 depths x 2 impurity measures = 4 candidates
dt_grid_builder = ParamGridBuilder()
dt_grid_builder.addGrid(dt.maxDepth, [5, 10])
dt_grid_builder.addGrid(dt.impurity, ["gini", "entropy"])
dt_param_grid = dt_grid_builder.build()

# Cross-validate on the training split, then score on the test split
dt_model, dt_accuracy = evaluate_classifier(dt, dt_param_grid, train_df, test_df)

print("DecisionTreeClassifier Accuracy = {:.2f}".format(dt_accuracy))

                                                                                

DecisionTreeClassifier Accuracy = 0.72


3. Gradient Boosted Trees

In [18]:
# Gradient-boosted trees as the base learner, wrapped in OneVsRest.
# NOTE(review): presumably needed because Spark's GBTClassifier only handles
# binary labels — confirm for the Spark version in use.
gbt = GBTClassifier(featuresCol="features", labelCol="immatriculation_ext_categorie_index")
ovr = OneVsRest(classifier=gbt, featuresCol="features", labelCol="immatriculation_ext_categorie_index")

# The grid tunes parameters of the wrapped GBT (not of the OneVsRest wrapper):
# 2 iteration counts x 2 depths = 4 candidates
gbt_grid_builder = ParamGridBuilder()
gbt_grid_builder.addGrid(gbt.maxIter, [10, 20])
gbt_grid_builder.addGrid(gbt.maxDepth, [5, 10])
gbt_param_grid = gbt_grid_builder.build()

# The OneVsRest wrapper is the estimator that gets cross-validated
gbt_model, gbt_accuracy = evaluate_classifier(ovr, gbt_param_grid, train_df, test_df)

print("GBTClassifier Accuracy = {:.2f}".format(gbt_accuracy))

24/05/29 14:07:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1011.9 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1058.0 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1054.4 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1054.9 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1055.4 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1056.6 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1059.1 KiB
24/05/29 14:08:01 WARN DAGScheduler: Broadcasting large task binary with size 1063.5 KiB
24/05/29 14:08:02 WARN DAGScheduler: Broadcasting large task binary with size 1072.1 KiB
24/05/29 14:08:02 WARN DAGScheduler: Broadcasting large task binary with size 1089.0 KiB
24/

GBTClassifier Accuracy = 0.72


                                                                                

4. Support Vector Machine

In [19]:
# Linear SVM as the base learner, wrapped in OneVsRest for the multi-class label
svm = LinearSVC(featuresCol="features", labelCol="immatriculation_ext_categorie_index")
ovr = OneVsRest(classifier=svm, featuresCol="features", labelCol="immatriculation_ext_categorie_index")

# The grid tunes parameters of the wrapped SVM:
# 2 iteration caps x 2 regularisation strengths = 4 candidates
svm_param_grid = (
    ParamGridBuilder()
    .addGrid(svm.maxIter, [10, 20])
    .addGrid(svm.regParam, [0.01, 0.1])
    .build()
)

# The OneVsRest wrapper is the estimator that gets cross-validated
svm_model, svm_accuracy = evaluate_classifier(ovr, svm_param_grid, train_df, test_df)

print("LinearSVC Accuracy = {:.2f}".format(svm_accuracy))



LinearSVC Accuracy = 0.63


                                                                                

5. Logistic Regression

In [20]:
# Logistic regression on the assembled feature vector
lr = LogisticRegression(featuresCol="features", labelCol="immatriculation_ext_categorie_index")

# Hyperparameter grid: 2 iteration caps x 2 regularisation strengths
# x 3 elastic-net mixing values = 12 candidates
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20])
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Cross-validate on the training split, then score on the test split
lr_model, lr_accuracy = evaluate_classifier(lr, lr_param_grid, train_df, test_df)

print("LogisticRegression Accuracy = {:.2f}".format(lr_accuracy))

                                                                                

LogisticRegression Accuracy = 0.65


6. Neural Networks (Multilayer Perceptron Classifier)

In [21]:
mlp_label_col = "immatriculation_ext_categorie_index"

# Input layer width: length of the assembled feature vector, read from the
# data rather than hard-coded, so it stays correct if the assembler's input
# columns change.
num_features = len(train_df.select("features").first()["features"])

# Output layer width: highest label index + 1. Unlike a distinct count, this
# is still large enough when some index below the maximum does not occur in
# the training split.
num_classes = int(train_df.agg({mlp_label_col: "max"}).first()[0]) + 1

# Two hidden layers (5 and 4 units). The seed is set explicitly so the random
# weight initialisation is reproducible.
mlp = MultilayerPerceptronClassifier(
    labelCol=mlp_label_col,
    featuresCol="features",
    layers=[num_features, 5, 4, num_classes],
    seed=42,
)

# Only the iteration cap is tuned
mlp_param_grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [50, 100])
    .build()
)

mlp_model, mlp_accuracy = evaluate_classifier(
    classifier=mlp,
    param_grid=mlp_param_grid,
    train_df=train_df,
    test_df=test_df
)

print("MultilayerPerceptronClassifier Accuracy = {:.2f}".format(mlp_accuracy))

24/05/29 14:26:09 ERROR LBFGS: Failure! Resetting history: breeze.optimize.FirstOrderException: Line search zoom failed
24/05/29 14:26:10 ERROR LBFGS: Failure! Resetting history: breeze.optimize.FirstOrderException: Line search failed
                                                                                

MultilayerPerceptronClassifier Accuracy = 0.56


7. Naive Bayes

In [22]:
# Naive Bayes on the assembled feature vector
nb = NaiveBayes(featuresCol="features", labelCol="immatriculation_ext_categorie_index")

# Hyperparameter grid: three smoothing values
nb_grid_builder = ParamGridBuilder()
nb_grid_builder.addGrid(nb.smoothing, [0.5, 1.0, 1.5])
nb_param_grid = nb_grid_builder.build()

# Cross-validate on the training split, then score on the test split
nb_model, nb_accuracy = evaluate_classifier(nb, nb_param_grid, train_df, test_df)

print("NaiveBayes Accuracy = {:.2f}".format(nb_accuracy))

                                                                                

NaiveBayes Accuracy = 0.46
