# Analyse de données et modèle de prédiction

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

### Initialize Spark session

In [2]:
# Obtain the SparkSession used by every cell below (created on first call).
spark = (
    SparkSession.builder
    .appName("Vehicle Category Prediction")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [34]:
def print_df(df):
    """Print a short overview of a DataFrame.

    Emits, in order: the row count (prefixed with "Size"), the schema, and
    the first five rows.

    Args:
        df: Object exposing ``count()``, ``printSchema()`` and ``show(n)``
            (a Spark DataFrame in this notebook).
    """
    row_count = df.count()
    print("Size", row_count)
    df.printSchema()
    df.show(5)

def load_data_in_csv_file(file_path, delimit = ",", head=True):
    """Read a CSV file into a Spark DataFrame and print a preview of it.

    Uses the notebook-level ``spark`` session and the ``print_df`` helper.

    Args:
        file_path: Path of the CSV file to read.
        delimit: Field delimiter passed to the CSV reader (default ",").
        head: Whether the first line of the file holds the column names
            (default True).

    Returns:
        The loaded DataFrame, with column types inferred from the data.
    """
    # Honour the caller's delimiter/header choice instead of hard-coded values
    df = spark.read.options(delimiter=delimit, header=head, inferSchema=True).csv(file_path)

    # Display schema and first five rows of the DataFrame
    print("Schema and first rows in", file_path)
    print_df(df)

    return df

### Load the dataset

In [111]:
# HDFS location of the fusion dataset (CSV)
hdfs_path = "hdfs:///tpa_groupe_14/data/fusion/fusion.csv"

# Read it into a Spark DataFrame; the loader also prints the schema and a preview
vehicles_df = load_data_in_csv_file(file_path=hdfs_path)

Schema and first rows in hdfs:///tpa_groupe_14/data/fusion/fusion.csv
Size 100306
root
 |-- client_view.id: integer (nullable = true)
 |-- client_view.age: integer (nullable = true)
 |-- client_view.sexe: string (nullable = true)
 |-- client_view.taux: integer (nullable = true)
 |-- client_view.situationfamiliale: string (nullable = true)
 |-- client_view.nbenfantsacharge: integer (nullable = true)
 |-- client_view.deuxiemevoiture: integer (nullable = true)
 |-- immatriculation_co2_view.categorie: string (nullable = true)

+--------------+---------------+----------------+----------------+------------------------------+----------------------------+---------------------------+----------------------------------+
|client_view.id|client_view.age|client_view.sexe|client_view.taux|client_view.situationfamiliale|client_view.nbenfantsacharge|client_view.deuxiemevoiture|immatriculation_co2_view.categorie|
+--------------+---------------+----------------+----------------+-------------------------

### Data formatting

In [112]:
# Keep only the part after the last "." of every column name
# (e.g. "client_view.age" becomes "age").
for qualified_name in vehicles_df.columns:
    short_name = qualified_name.split(".")[-1]
    vehicles_df = vehicles_df.withColumnRenamed(qualified_name, short_name)

print_df(vehicles_df)

Size 100306
root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: integer (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantsacharge: integer (nullable = true)
 |-- deuxiemevoiture: integer (nullable = true)
 |-- categorie: string (nullable = true)

+---+---+----+----+------------------+----------------+---------------+---------+
| id|age|sexe|taux|situationfamiliale|nbenfantsacharge|deuxiemevoiture|categorie|
+---+---+----+----+------------------+----------------+---------------+---------+
| 34| 18|   M| 461|         EN_COUPLE|               4|              0| Compacte|
| 43| 30|   M| 243|         EN_COUPLE|               2|              0|     Luxe|
| 59| 79|   M| 404|         EN_COUPLE|               2|              0|Familiale|
| 63| 44|   M| 925|         EN_COUPLE|               0|              0|     Luxe|
| 75| 70|   M| 458|       CELIBATAIRE|               0|              0|     

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer

### Optional: sub-sample the dataset

In [7]:
# Optional step: keep a seeded 30% random sample, drawn without replacement.
# The result is assigned back to vehicles_df, so running this cell again
# samples the already-reduced DataFrame.
vehicles_df = vehicles_df.sample(
    withReplacement=False,
    fraction=0.3,
    seed=42,
)

In [130]:
# Preview the first five rows of the training DataFrame.
# NOTE(review): in file order this cell comes before the cell that assigns
# `train_df` (the randomSplit / pipeline cell further down), so it fails on a
# fresh top-to-bottom run; consider moving it after that cell.
train_df.show(5)

+---+---+----+----+------------------+----------------+---------------+---------+----------+------------------------+---------------+--------------------+
| id|age|sexe|taux|situationfamiliale|nbenfantsacharge|deuxiemevoiture|categorie|sexe_index|situationfamiliale_index|categorie_index|            features|
+---+---+----+----+------------------+----------------+---------------+---------+----------+------------------------+---------------+--------------------+
|  1| 36|   M|1168|       CELIBATAIRE|               0|              0|     Luxe|       0.0|                     1.0|            0.0|[36.0,1168.0,0.0,...|
|  2| 77|   M| 971|         EN_COUPLE|               2|              0|Familiale|       0.0|                     0.0|            1.0|[77.0,971.0,2.0,0...|
|  4| 35|   F| 404|         EN_COUPLE|               2|              0| Citadine|       1.0|                     0.0|            3.0|[35.0,404.0,2.0,0...|
|  5| 56|   M|1146|         EN_COUPLE|               1|              1

In [113]:
# Feature preparation.
# One StringIndexer is fitted per categorical column; note that each is fitted
# on the whole vehicles_df, i.e. before the train/test split below.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index").fit(vehicles_df)
    for name in ("sexe", "situationfamiliale", "categorie")
]

# The label index ("categorie_index") is deliberately not part of the features.
assembler = VectorAssembler(
    inputCols=[
        "age",
        "taux",
        "nbenfantsacharge",
        "deuxiemevoiture",
        "sexe_index",
        "situationfamiliale_index",
    ],
    outputCol="features",
)

# Seeded 80% / 20% train/test split
train_df, test_df = vehicles_df.randomSplit([0.8, 0.2], seed=42)

# Chain the fitted indexers and the assembler, then fit the chain on the training split
pipeline = Pipeline(stages=[*indexers, assembler])
pipeline_model = pipeline.fit(train_df)

# Add the index and feature columns to both splits
train_df, test_df = (pipeline_model.transform(split) for split in (train_df, test_df))

In [154]:
# Collect the distinct (category name, category index) pairs seen in the test split
label_predictions = (
    test_df
    .select("categorie", "categorie_index")
    .distinct()
    .collect()
)

# Lookup table: numeric label index -> category name
categorie_dict = dict(
    (row["categorie_index"], row["categorie"]) for row in label_predictions
)

print(categorie_dict)

{0.0: 'Luxe', 1.0: 'Familiale', 3.0: 'Citadine', 2.0: 'Compacte'}


In [114]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

# La précision & Matrice de confusion

In [197]:
# Evaluation function
def evaluate_classifier(classifier, param_grid, train_df, test_df, title):
    """Tune a classifier with 3-fold cross-validation and score it on the test set.

    The label column is "categorie_index" and the prediction column is
    "prediction". Model selection inside the cross-validator uses accuracy.
    A short report (accuracy and weighted metrics) is printed.

    Args:
        classifier: Estimator handed to ``CrossValidator``.
        param_grid: Parameter maps explored by the cross-validator.
        train_df: DataFrame the cross-validator is fitted on.
        test_df: DataFrame the selected model is evaluated on.
        title: Display name used in the printed report and in the result.

    Returns:
        dict with the keys "classifier" (the title), "cv_model", "accuracy",
        "confusion_matrix", "per_class_metrics" (label -> precision / recall /
        f1_score) and "weighted_metrics" (precision / recall / f1_score /
        accuracy).
    """
    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="categorie_index",
        predictionCol="prediction",
        metricName="accuracy",
    )

    # Cross-validated search over param_grid, fitted on the training data
    cv_model = CrossValidator(
        estimator=classifier,
        estimatorParamMaps=param_grid,
        evaluator=accuracy_evaluator,
        numFolds=3,
        seed=42,
    ).fit(train_df)

    # Score the held-out data with the selected model
    predictions = cv_model.transform(test_df)
    accuracy = accuracy_evaluator.evaluate(predictions)

    # Confusion matrix, computed from (prediction, label) tuples
    pair_rdd = predictions.select("prediction", "categorie_index").rdd.map(tuple)
    metrics = MulticlassMetrics(pair_rdd)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Per-class precision / recall / F1 for every label present in the predictions
    distinct_labels = (
        predictions
        .select("categorie_index")
        .distinct()
        .rdd
        .flatMap(lambda row: row)
        .collect()
    )
    per_class_metrics = {
        label: {
            "precision": metrics.precision(label),
            "recall": metrics.recall(label),
            "f1_score": metrics.fMeasure(label),
        }
        for label in distinct_labels
    }

    # Global weighted metrics
    weighted_metrics = dict(
        precision=metrics.weightedPrecision,
        recall=metrics.weightedRecall,
        f1_score=metrics.weightedFMeasure(),
        accuracy=accuracy,
    )

    print(title + " Accuracy = {:.2f}".format(accuracy))
    print("\nWeighted Metrics:")
    report_lines = (
        ("Weighted Precision", "precision"),
        ("Weighted Recall", "recall"),
        ("Weighted F1 Score", "f1_score"),
        ("Accuracy", "accuracy"),
    )
    for caption, metric_key in report_lines:
        print(f"  {caption}: {weighted_metrics[metric_key]:.2f}")

    return dict(
        classifier=title,
        cv_model=cv_model,
        accuracy=accuracy,
        confusion_matrix=confusion_matrix,
        per_class_metrics=per_class_metrics,
        weighted_metrics=weighted_metrics,
    )

In [190]:
from pyspark.ml.classification import (
    DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, 
    LinearSVC, LogisticRegression, MultilayerPerceptronClassifier, NaiveBayes, OneVsRest
)

In [198]:
def calculate_composite_score(metrics, weights):
    """Return the weighted sum of the given metrics.

    Args:
        metrics: Mapping of metric name -> metric value.
        weights: Mapping of metric name -> weight; a metric without an entry
            here contributes with weight 0.

    Returns:
        Sum over all metrics of ``weight * value`` (0 when ``metrics`` is empty).
    """
    return sum(weights.get(name, 0) * value for name, value in metrics.items())

In [199]:
def compare_classifiers(classifiers, param_grids, train_df, test_df, titles, weights=None):
    """Evaluate several classifiers and return the best cross-validated model.

    Each classifier is tuned and scored with ``evaluate_classifier``; its
    weighted metrics are folded into one composite score with
    ``calculate_composite_score``; the model with the highest score wins.

    Args:
        classifiers: Estimators to evaluate.
        param_grids: Parameter grids, one per classifier (same order).
        train_df: Training DataFrame passed to ``evaluate_classifier``.
        test_df: Test DataFrame passed to ``evaluate_classifier``.
        titles: Display names, one per classifier (same order).
        weights: Optional mapping of metric name -> weight for the composite
            score. When omitted, each metric is weighted by its own value
            (the historical behaviour of this function).

    Returns:
        The "cv_model" entry of the best-scoring classifier's result.

    Raises:
        ValueError: If there is no classifier to evaluate.
    """
    results = {}

    for classifier, param_grid, title in zip(classifiers, param_grids, titles):
        print(f"\nEvaluating {title}...\n")
        result = evaluate_classifier(classifier, param_grid, train_df, test_df, title)
        weighted_metrics = result['weighted_metrics']
        # NOTE(review): with no explicit weights the metrics weight themselves,
        # so the score is a sum of squares -- pass `weights` for a real weighting.
        metric_weights = weighted_metrics if weights is None else weights
        composite_score = calculate_composite_score(weighted_metrics, metric_weights)
        results[title] = { "composite_score": composite_score, "cv_model": result["cv_model"], "classifier": result["classifier"] }

    if not results:
        raise ValueError("compare_classifiers needs at least one classifier to evaluate")

    # Rank the models based on composite scores (highest first)
    ranked_results = sorted(results.items(), key=lambda item: item[1]["composite_score"], reverse=True)

    # Each ranked entry is a (title, info) pair, so unpack before reading the fields
    _best_title, best = ranked_results[0]

    print("Model with highest score:", best["classifier"], "[", best["composite_score"], "]")

    return best["cv_model"]

1. Random Forest

In [200]:
# Every candidate below predicts "categorie_index" from the "features" vector.

# Random forest and its hyperparameter grid
rf = RandomForestClassifier(labelCol="categorie_index", featuresCol="features")
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

# Decision tree
dt = DecisionTreeClassifier(labelCol="categorie_index", featuresCol="features")
dt_param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10])
    .addGrid(dt.impurity, ["gini", "entropy"])
    .build()
)

# Gradient-boosted trees, wrapped in a one-vs-rest meta-classifier
gbt = GBTClassifier(labelCol="categorie_index", featuresCol="features")
gbt_ovr = OneVsRest(classifier=gbt, labelCol="categorie_index", featuresCol="features")
gbt_param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 20])
    .addGrid(gbt.maxDepth, [5, 10])
    .build()
)

# Linear SVM, wrapped in a one-vs-rest meta-classifier
svm = LinearSVC(labelCol="categorie_index", featuresCol="features")
svm_ovr = OneVsRest(classifier=svm, labelCol="categorie_index", featuresCol="features")
svm_param_grid = (
    ParamGridBuilder()
    .addGrid(svm.maxIter, [10, 20])
    .addGrid(svm.regParam, [0.01, 0.1])
    .build()
)

# Logistic regression
lr = LogisticRegression(labelCol="categorie_index", featuresCol="features")
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20])
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Multilayer perceptron: the output layer has one unit per distinct label
num_classes = train_df.select("categorie_index").distinct().count()
mlp = MultilayerPerceptronClassifier(
    labelCol="categorie_index",
    featuresCol="features",
    layers=[6, 5, 4, num_classes],
)
mlp_param_grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [50, 100])
    .build()
)

# Naive Bayes
nb = NaiveBayes(labelCol="categorie_index", featuresCol="features")
nb_param_grid = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.5, 1.0, 1.5])
    .build()
)

# Parallel lists: position i of each list describes the same candidate
classifiers = [rf, dt, gbt_ovr, svm_ovr, lr, mlp, nb]
param_grids = [
    rf_param_grid,
    dt_param_grid,
    gbt_param_grid,
    svm_param_grid,
    lr_param_grid,
    mlp_param_grid,
    nb_param_grid,
]
titles = [
    "RandomForestClassifier",
    "DecisionTreeClassifier",
    "GBTClassifier",
    "LinearSVC",
    "LogisticRegression",
    "MultilayerPerceptronClassifier",
    "NaiveBayes",
]

# train_df and test_df come from the feature-preparation cell
model = compare_classifiers(classifiers, param_grids, train_df, test_df, titles)


Evaluating RandomForestClassifier...



24/06/09 16:45:06 WARN DAGScheduler: Broadcasting large task binary with size 1032.0 KiB
24/06/09 16:45:14 WARN DAGScheduler: Broadcasting large task binary with size 1251.8 KiB
24/06/09 16:45:15 WARN DAGScheduler: Broadcasting large task binary with size 1937.0 KiB
24/06/09 16:45:16 WARN DAGScheduler: Broadcasting large task binary with size 1077.0 KiB
24/06/09 16:45:21 WARN DAGScheduler: Broadcasting large task binary with size 1059.6 KiB
24/06/09 16:45:30 WARN DAGScheduler: Broadcasting large task binary with size 1235.2 KiB
24/06/09 16:45:30 WARN DAGScheduler: Broadcasting large task binary with size 1907.2 KiB
24/06/09 16:45:32 WARN DAGScheduler: Broadcasting large task binary with size 1048.6 KiB
24/06/09 16:45:38 WARN DAGScheduler: Broadcasting large task binary with size 1051.9 KiB
24/06/09 16:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
24/06/09 16:45:48 WARN DAGScheduler: Broadcasting large task binary with size 1936.1 KiB
24/06/09 16:45:49 WAR

RandomForestClassifier Accuracy = 0.73

Weighted Metrics:
  Weighted Precision: 0.74
  Weighted Recall: 0.73
  Weighted F1 Score: 0.72
  Accuracy: 0.73

Evaluating DecisionTreeClassifier...



                                                                                

DecisionTreeClassifier Accuracy = 0.73

Weighted Metrics:
  Weighted Precision: 0.76
  Weighted Recall: 0.73
  Weighted F1 Score: 0.72
  Accuracy: 0.73

Evaluating GBTClassifier...



24/06/09 16:46:55 WARN DAGScheduler: Broadcasting large task binary with size 1020.9 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1063.5 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1062.2 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1062.6 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1063.2 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1064.5 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1066.9 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1071.5 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1080.2 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1095.8 KiB
24/06/09 16:46:56 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
24/06/09 16:46:57 WAR

GBTClassifier Accuracy = 0.73

Weighted Metrics:
  Weighted Precision: 0.75
  Weighted Recall: 0.73
  Weighted F1 Score: 0.72
  Accuracy: 0.73

Evaluating LinearSVC...



TypeError: object of type 'ParamGridBuilder' has no len()

In [148]:
rf = RandomForestClassifier(labelCol="categorie_index", featuresCol="features")

# Define the hyperparameter grid
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

# evaluate_classifier returns a result dict, not a tuple: keep the dict and
# pull out the pieces used later in the notebook.
rf_result = evaluate_classifier(
    classifier=rf,
    param_grid=rf_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='RandomForestClassifier'
)

# Fitted cross-validated model (used further down to score the marketing data)
rf_model = rf_result["cv_model"]
rf_weighted_f1_score = rf_result["weighted_metrics"]["f1_score"]


24/06/09 15:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1032.0 KiB
24/06/09 15:27:13 WARN DAGScheduler: Broadcasting large task binary with size 1251.8 KiB
24/06/09 15:27:14 WARN DAGScheduler: Broadcasting large task binary with size 1937.0 KiB
24/06/09 15:27:15 WARN DAGScheduler: Broadcasting large task binary with size 1077.0 KiB
24/06/09 15:27:22 WARN DAGScheduler: Broadcasting large task binary with size 1059.6 KiB
24/06/09 15:27:30 WARN DAGScheduler: Broadcasting large task binary with size 1235.2 KiB
24/06/09 15:27:31 WARN DAGScheduler: Broadcasting large task binary with size 1907.2 KiB
24/06/09 15:27:32 WARN DAGScheduler: Broadcasting large task binary with size 1048.6 KiB
24/06/09 15:27:39 WARN DAGScheduler: Broadcasting large task binary with size 1051.9 KiB
24/06/09 15:27:48 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
24/06/09 15:27:49 WARN DAGScheduler: Broadcasting large task binary with size 1936.1 KiB
24/06/09 15:27:51 WAR

RandomForestClassifier Accuracy = 0.73
RandomForestClassifier Confusion Matrix:
 [[9.112e+03 1.287e+03 5.000e+00 2.400e+01]
 [1.810e+03 3.542e+03 1.000e+00 9.000e+00]
 [9.140e+02 1.190e+02 1.011e+03 5.290e+02]
 [5.260e+02 2.180e+02 2.800e+01 1.015e+03]]

Par Classe Metrics:
 Class 0.0:
  Precision: 0.74
  Recall: 0.87
  F1 Score: 0.80
 Class 1.0:
  Precision: 0.69
  Recall: 0.66
  F1 Score: 0.67
 Class 3.0:
  Precision: 0.64
  Recall: 0.57
  F1 Score: 0.60
 Class 2.0:
  Precision: 0.97
  Recall: 0.39
  F1 Score: 0.56

Weighted Metrics:
  Weighted Precision: 0.74
  Weighted Recall: 0.73
  Weighted F1 Score: 0.72
  Accuracy: 0.73


2. Decision Tree

In [19]:
dt = DecisionTreeClassifier(labelCol="categorie_index", featuresCol="features")

# Hyperparameter grid: tree depth and split criterion
dt_param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10])
    .addGrid(dt.impurity, ["gini", "entropy"])
    .build()
)

# evaluate_classifier returns a result dict, not a tuple: keep the dict and
# extract the fitted model and the weighted F1 score from it.
dt_result = evaluate_classifier(
    classifier=dt,
    param_grid=dt_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='DecisionTreeClassifier'
)

dt_model = dt_result["cv_model"]
# Stored under its own name so the random-forest score is not overwritten
dt_weighted_f1_score = dt_result["weighted_metrics"]["f1_score"]


DecisionTreeClassifier Accuracy = 0.73
DecisionTreeClassifier Confusion Matrix:
 [[2483.  600.    0.   12.]
 [ 283. 1262.    6.    8.]
 [ 217.   99.  330.  157.]
 [  33.  194.    6.  321.]]

Par Classe Metrics:
 Class 0.0:
  Precision: 0.82
  Recall: 0.80
  F1 Score: 0.81
 Class 1.0:
  Precision: 0.59
  Recall: 0.81
  F1 Score: 0.68
 Class 3.0:
  Precision: 0.64
  Recall: 0.58
  F1 Score: 0.61
 Class 2.0:
  Precision: 0.96
  Recall: 0.41
  F1 Score: 0.58

Weighted Metrics:
  Weighted Precision: 0.76
  Weighted Recall: 0.73
  Weighted F1 Score: 0.73
  Accuracy: 0.73


3. Gradient Boosted Trees

In [20]:
gbt = GBTClassifier(labelCol="categorie_index", featuresCol="features")
# The GBT classifier is wrapped in a one-vs-rest meta-classifier
ovr = OneVsRest(classifier=gbt, labelCol="categorie_index", featuresCol="features")

# Hyperparameter grid, defined on the wrapped GBT classifier's parameters
gbt_param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 20])
    .addGrid(gbt.maxDepth, [5, 10])
    .build()
)

# evaluate_classifier returns a result dict; keep it and expose the fitted
# model separately so that `gbt_model` really is a model.
gbt_result = evaluate_classifier(
    classifier=ovr,
    param_grid=gbt_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='GBTClassifier'
)

gbt_model = gbt_result["cv_model"]


24/06/09 13:33:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/09 13:34:25 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1035.0 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1029.3 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1029.8 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1030.3 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1031.6 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1033.9 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1038.2 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1047.0 KiB
24/06/09 13:34:58 WARN DAGScheduler: Broadcasting large task binary with size 1061.0 KiB
24/06/

GBTClassifier Accuracy = 0.73
GBTClassifier Confusion Matrix:
 [[2.778e+03 3.090e+02 0.000e+00 8.000e+00]
 [5.570e+02 9.990e+02 1.000e+00 2.000e+00]
 [3.080e+02 1.400e+01 3.260e+02 1.550e+02]
 [2.210e+02 1.000e+01 8.000e+00 3.150e+02]]

Par Classe Metrics:
 Class 0.0:
  Precision: 0.72
  Recall: 0.90
  F1 Score: 0.80
 Class 1.0:
  Precision: 0.75
  Recall: 0.64
  F1 Score: 0.69
 Class 3.0:
  Precision: 0.66
  Recall: 0.57
  F1 Score: 0.61
 Class 2.0:
  Precision: 0.97
  Recall: 0.41
  F1 Score: 0.57

Weighted Metrics:
  Weighted Precision: 0.76
  Weighted Recall: 0.73
  Weighted F1 Score: 0.72
  Accuracy: 0.73


4. Support Vector Machine

In [21]:
svm = LinearSVC(labelCol="categorie_index", featuresCol="features")
# The linear SVM is wrapped in a one-vs-rest meta-classifier
ovr = OneVsRest(classifier=svm, labelCol="categorie_index", featuresCol="features")

# Hyperparameter grid, defined on the wrapped SVM's parameters
svm_param_grid = (
    ParamGridBuilder()
    .addGrid(svm.maxIter, [10, 20])
    .addGrid(svm.regParam, [0.01, 0.1])
    .build()
)

# evaluate_classifier returns a result dict; keep it and expose the fitted
# model separately so that `svm_model` really is a model.
svm_result = evaluate_classifier(
    classifier=ovr,
    param_grid=svm_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='LinearSVC'
)

svm_model = svm_result["cv_model"]

                                                                                

LinearSVC Accuracy = 0.62
LinearSVC Confusion Matrix:
 [[2.577e+03 3.780e+02 1.180e+02 2.200e+01]
 [7.580e+02 7.860e+02 1.000e+01 5.000e+00]
 [2.980e+02 1.600e+02 3.440e+02 1.000e+00]
 [2.250e+02 1.880e+02 1.410e+02 0.000e+00]]

Par Classe Metrics:
 Class 0.0:
  Precision: 0.67
  Recall: 0.83
  F1 Score: 0.74
 Class 1.0:
  Precision: 0.52
  Recall: 0.50
  F1 Score: 0.51
 Class 3.0:
  Precision: 0.00
  Recall: 0.00
  F1 Score: 0.00
 Class 2.0:
  Precision: 0.56
  Recall: 0.43
  F1 Score: 0.49

Weighted Metrics:
  Weighted Precision: 0.55
  Weighted Recall: 0.62
  Weighted F1 Score: 0.58
  Accuracy: 0.62


5. Logistic Regression

In [None]:
lr = LogisticRegression(labelCol="categorie_index", featuresCol="features")

# Hyperparameter grid: iterations, regularisation strength and elastic-net mixing
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20])
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# evaluate_classifier returns a result dict; keep it and expose the fitted
# model separately so that `lr_model` really is a model.
lr_result = evaluate_classifier(
    classifier=lr,
    param_grid=lr_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='LogisticRegression'
)

lr_model = lr_result["cv_model"]

6. Neural Networks (Multilayer Perceptron Classifier)

In [None]:
# One output unit per distinct label in the training data
num_classes = train_df.select("categorie_index").distinct().count()

# Layer sizes: 6 inputs (one per column assembled into "features"),
# two hidden layers of 5 and 4 units, then the output layer.
mlp = MultilayerPerceptronClassifier(labelCol="categorie_index", featuresCol="features", layers=[6, 5, 4, num_classes])

mlp_param_grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [50, 100])
    .build()
)

# evaluate_classifier returns a result dict; keep it and expose the fitted
# model separately so that `mlp_model` really is a model.
mlp_result = evaluate_classifier(
    classifier=mlp,
    param_grid=mlp_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='MultilayerPerceptronClassifier'
)

mlp_model = mlp_result["cv_model"]


7. Naive Bayes

In [None]:
nb = NaiveBayes(labelCol="categorie_index", featuresCol="features")

# Hyperparameter grid: smoothing values to try
nb_param_grid = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.5, 1.0, 1.5])
    .build()
)

# evaluate_classifier returns a result dict; keep it and expose the fitted
# model separately so that `nb_model` really is a model.
nb_result = evaluate_classifier(
    classifier=nb,
    param_grid=nb_param_grid,
    train_df=train_df,
    test_df=test_df,
    title='NaiveBayes'
)

nb_model = nb_result["cv_model"]

# Prediction

- Charger les données Marketing

In [83]:
from pyhive import hive
import pandas as pd

In [84]:
import os

# Hive connection settings. Each value can be overridden through an
# environment variable so that real credentials never have to be written into
# the notebook; the fallbacks are the notebook's original placeholder values.
hive_host = os.environ.get("HIVE_HOST", 'localhost')
hive_port = int(os.environ.get("HIVE_PORT", "10000"))
hive_username = os.environ.get("HIVE_USERNAME", ' ')
hive_password = os.environ.get("HIVE_PASSWORD", ' ')

In [85]:
try:
    # Open the Hive connection using LDAP authentication
    conn = hive.Connection(
        host=hive_host,
        port=hive_port,
        username=hive_username,
        password=hive_password,
        auth='LDAP'
    )
except Exception as e:
    print(f"Erreur lors de la connexion à Hive: {e}")
    # Re-raise instead of swallowing the error: otherwise `conn` stays
    # undefined and the following cell fails with an unrelated NameError.
    raise
else:
    print("Connecté à Hive avec succès")

Connecté à Hive avec succès


In [86]:
# Create a cursor on the Hive connection; findAll() below runs its queries through it
cursor=conn.cursor()

In [99]:
def findAll(table):
    """Fetch every row of a Hive table and return them as a Spark DataFrame.

    Relies on the notebook-level ``cursor`` (Hive) and ``spark`` (Spark session).

    Args:
        table: Name of the table or view to read. It is concatenated into the
            SQL text as-is, so it must come from trusted code.

    Returns:
        DataFrame created from the fetched rows, with column names taken from
        the first field of each entry of ``cursor.description``.
    """
    # Run "SELECT * FROM <table>" through the shared cursor
    cursor.execute("SELECT * FROM " + table)

    # Pull all rows, then read the column names reported for the result set
    rows = cursor.fetchall()
    column_names = [field[0] for field in cursor.description]

    return spark.createDataFrame(rows, schema=column_names)

In [122]:
# Load the whole "marketing_view" Hive view into a Spark DataFrame
marketing_df = findAll("marketing_view")

Prétraiter les Données Marketing

In [123]:
# Rename each column to the part after its last "." so the names match the
# ones used for the training data (same renaming as applied to vehicles_df).
for qualified_name in marketing_df.columns:
    short_name = qualified_name.split(".")[-1]
    marketing_df = marketing_df.withColumnRenamed(qualified_name, short_name)

In [124]:
# Preview the first five marketing rows after renaming
marketing_df.show(5)

+---+---+----+----+------------------+----------------+---------------+
| id|age|sexe|taux|situationfamiliale|nbenfantsacharge|deuxiemevoiture|
+---+---+----+----+------------------+----------------+---------------+
| 17| 58|   M|1192|         EN_COUPLE|               0|              0|
|  1| 21|   F|1396|       CELIBATAIRE|               0|              0|
| 15| 60|   M| 524|         EN_COUPLE|               0|              1|
|  4| 26|   F| 420|         EN_COUPLE|               3|              1|
|  7| 59|   F| 572|         EN_COUPLE|               2|              0|
+---+---+----+----+------------------+----------------+---------------+
only showing top 5 rows



Appliquer le Modèle de Prédiction

In [131]:
# Build the feature pipeline model that is applied to the marketing data in the next cell.
# NOTE(review): this rebinds `pipeline_model`, replacing the one fitted on
# train_df in the feature-preparation cell. The indexer stages were already
# fitted on vehicles_df when `indexers` was built, so this refit presumably
# leaves the encodings unchanged -- reusing the existing `pipeline_model`
# would make that explicit. Confirm before changing.

pipeline_model = pipeline.fit(marketing_df)

In [133]:
# Add the index columns and the "features" vector to the marketing rows.
# NOTE(review): the result overwrites `marketing_df`, so re-running this cell
# feeds already-transformed data back into the pipeline (likely an error
# because the output columns already exist -- confirm).
marketing_df = pipeline_model.transform(marketing_df)

24/06/09 15:17:21 WARN StringIndexerModel: Input column categorie does not exist during transformation. Skip StringIndexerModel for this column.


In [134]:
# Preview the marketing rows with their index and feature columns
marketing_df.show(5)

+---+---+----+----+------------------+----------------+---------------+----------+------------------------+--------------------+
| id|age|sexe|taux|situationfamiliale|nbenfantsacharge|deuxiemevoiture|sexe_index|situationfamiliale_index|            features|
+---+---+----+----+------------------+----------------+---------------+----------+------------------------+--------------------+
| 17| 58|   M|1192|         EN_COUPLE|               0|              0|       0.0|                     0.0|(6,[0,1],[58.0,11...|
|  1| 21|   F|1396|       CELIBATAIRE|               0|              0|       1.0|                     1.0|[21.0,1396.0,0.0,...|
| 15| 60|   M| 524|         EN_COUPLE|               0|              1|       0.0|                     0.0|[60.0,524.0,0.0,1...|
|  4| 26|   F| 420|         EN_COUPLE|               3|              1|       1.0|                     0.0|[26.0,420.0,3.0,1...|
|  7| 59|   F| 572|         EN_COUPLE|               2|              0|       1.0|               

In [176]:
# Score the marketing rows with the random-forest model.
# NOTE(review): `rf_model` must be a fitted model exposing .transform();
# evaluate_classifier as defined in this notebook returns a dict whose
# "cv_model" entry is that model -- verify how the Random Forest cell binds `rf_model`.
predictions = rf_model.transform(marketing_df)

In [177]:
print(categorie_dict)

# Translate the numeric prediction index into its category name with a single
# CASE expression. Every comparison is made against the original numeric
# column, so nothing depends on implicit string -> double casts (which the
# previous one-withColumn-per-label loop relied on after its first iteration).
prediction_index = col("prediction")
label_expr = None
for key, value in categorie_dict.items():
    matches_key = prediction_index == key
    label_expr = (
        when(matches_key, value)
        if label_expr is None
        else label_expr.when(matches_key, value)
    )

# With an empty mapping the prediction column is left untouched
if label_expr is not None:
    predictions = predictions.withColumn(
        "prediction",
        # Indices without a known label keep their numeric value, rendered as text
        label_expr.otherwise(prediction_index.cast("string")),
    )

# Keep only the columns that are displayed and exported
results = predictions.select("id", "prediction")

results.show()

{0.0: 'Luxe', 1.0: 'Familiale', 3.0: 'Citadine', 2.0: 'Compacte'}


24/06/09 16:03:21 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB
24/06/09 16:03:21 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB


+---+----------+
| id|prediction|
+---+----------+
| 17|      Luxe|
|  1|      Luxe|
| 15|      Luxe|
|  4|  Citadine|
|  7| Familiale|
| 10| Familiale|
| 12|      Luxe|
| 13|      Luxe|
| 14|      Luxe|
| 18|  Citadine|
| 16|  Citadine|
|  2|      Luxe|
|  6| Familiale|
| 19|      Luxe|
| 20|      Luxe|
|  3|      Luxe|
|  5| Familiale|
|  8|      Luxe|
| 11| Familiale|
|  9|      Luxe|
+---+----------+



Exporter les Résultats

In [180]:
# Output directory for the prediction results on HDFS
results_hdfs_path = "hdfs:///tpa_groupe_14/results"

# Save the results to HDFS as CSV with a header row.
# "overwrite" replaces the output of a previous run; with the default save
# mode the write fails as soon as the directory already exists, which breaks
# re-running the notebook.
results.write.mode("overwrite").csv(results_hdfs_path, header=True)

24/06/09 16:06:27 WARN DAGScheduler: Broadcasting large task binary with size 1376.2 KiB
                                                                                

In [181]:
# List the files written to the HDFS results directory
!hadoop fs -ls /tpa_groupe_14/results

Found 3 items
-rw-r--r--   1 vagrant supergroup          0 2024-06-09 16:06 /tpa_groupe_14/results/_SUCCESS
-rw-r--r--   1 vagrant supergroup        109 2024-06-09 16:06 /tpa_groupe_14/results/part-00000-44dd88ee-e137-4e3e-bb89-80e0716ea0b1-c000.csv
-rw-r--r--   1 vagrant supergroup        107 2024-06-09 16:06 /tpa_groupe_14/results/part-00001-44dd88ee-e137-4e3e-bb89-80e0716ea0b1-c000.csv
