In [1]:
# List the HDFS directory that the notebook later reads CO2.csv from.
!hadoop fs -ls /tpa_groupe_14/data/co2

Found 1 items
-rw-r--r--   1 vagrant supergroup      38916 2024-06-09 19:27 /tpa_groupe_14/data/co2/CO2.csv


In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running in local mode with the
# "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TPT-HADOOP_MAP_REDUCE")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def load_data_in_csv_file(filename,separator,head):
    """Read a delimited text file into a Spark DataFrame and print a summary.

    Args:
        filename: Path of the file (or directory) to read.
        separator: Field delimiter passed to the CSV reader.
        head: Value of the reader's "header" option (whether the first
            line holds the column names).

    Returns:
        The loaded DataFrame. As a side effect, the row count, the schema
        and the first five rows are printed.
    """
    reader = spark.read.option("delimiter", separator).option("header", head)
    frame = reader.csv(filename)

    # Summarise what was loaded: row count, schema, then a five-row preview.
    print("Schema and first rows in", filename)
    print("Count : ", frame.count())
    frame.printSchema()
    frame.show(5)

    return frame

## Chargement de données

In [4]:
# Load CO2.csv from HDFS: comma-separated, first line used as header.
co2_hdfs_df = load_data_in_csv_file(
    filename="/tpa_groupe_14/data/co2/CO2.csv",
    separator=",",
    head=True,
)

                                                                                

Schema and first rows in /tpa_groupe_14/data/co2/CO2.csv
Count :  437
root
 |-- _c0: string (nullable = true)
 |-- Marque / Modele: string (nullable = true)
 |-- Bonus / Malus: string (nullable = true)
 |-- Rejets CO2 g/km: string (nullable = true)
 |-- Cout enerie: string (nullable = true)

+---+--------------------+-------------+---------------+-----------+
|_c0|     Marque / Modele|Bonus / Malus|Rejets CO2 g/km|Cout enerie|
+---+--------------------+-------------+---------------+-----------+
|  2|AUDI E-TRON SPORT...|    -6 000€ 1|              0|      319 €|
|  3|AUDI E-TRON SPORT...|    -6 000€ 1|              0|      356 €|
|  4|AUDI E-TRON 55 (4...|    -6 000€ 1|              0|      357 €|
|  5|AUDI E-TRON 50 (3...|    -6 000€ 1|              0|      356 €|
|  6|       BMW i3 120 Ah|    -6 000€ 1|              0|      204 €|
+---+--------------------+-------------+---------------+-----------+
only showing top 5 rows



24/06/09 19:31:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
 Schema: _c0, Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
Expected: _c0 but found: 
CSV file: hdfs://localhost:9000/tpa_groupe_14/data/co2/CO2.csv


In [5]:
# Load the catalogue from the Hive warehouse directory on HDFS. It is read as
# tab-separated text without a header, so columns are named _c0.._c9.
catalogue_hive_df = load_data_in_csv_file("/user/hive/warehouse/catalogue",'\t',False)
# Mark it for caching: it is reused for the brand list and for the final join.
catalogue_hive_df.cache()

Schema and first rows in /user/hive/warehouse/catalogue
Count :  270
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)

+---+-----+------+---+-----------+---+---+-----+-----+-----+
|_c0|  _c1|   _c2|_c3|        _c4|_c5|_c6|  _c7|  _c8|  _c9|
+---+-----+------+---+-----------+---+---+-----+-----+-----+
|  1|Volvo|S80 T6|272|très longue|  5|  5|blanc|false|50500|
|  2|Volvo|S80 T6|272|très longue|  5|  5| noir|false|50500|
|  3|Volvo|S80 T6|272|très longue|  5|  5|rouge|false|50500|
|  4|Volvo|S80 T6|272|très longue|  5|  5| gris| true|35350|
|  5|Volvo|S80 T6|272|très longue|  5|  5| bleu| true|35350|
+---+-----+------+---+-----------+---+---+-----+-----+-----+
only showing top 5 rows



DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string]

### Ajouter une colonne marque dans CO2

In [6]:
from pyspark.sql.functions import col

# Distinct values of catalogue column "_c1" (the brand names).
marque_catalogue_df = catalogue_hive_df.select("_c1").distinct()

In [7]:
# Rename "_c1" to "Marque"
marque_catalogue_df = marque_catalogue_df.withColumnRenamed("_c1", "Marque")
marque_catalogue_df.show(5)
# Mark the brand list for caching; it is reused by the joins in later cells.
marque_catalogue_df.cache()

+----------+
|    Marque|
+----------+
|Volkswagen|
|   Peugeot|
|    Jaguar|
|    Lancia|
|      Mini|
+----------+
only showing top 5 rows



DataFrame[Marque: string]

In [8]:
# Number of distinct brands found in the catalogue.
print("Nombre de marque : " ,marque_catalogue_df.count())

Nombre de marque :  21


In [9]:
from pyspark.sql.functions import lower

# Attach the catalogue brand to each CO2 row: a row matches a brand when its
# "Marque / Modele" text contains the brand name (case-insensitive).
# NOTE(review): this is a substring match, so a row whose text contains
# several brand names would be duplicated; rows matching no catalogue brand
# are dropped by the inner join.
brand_in_model = lower(co2_hdfs_df["Marque / Modele"]).contains(
    lower(marque_catalogue_df["Marque"])
)
join_df = co2_hdfs_df.join(marque_catalogue_df, brand_in_model, "inner")

# The combined brand/model text is dropped once "Marque" is attached.
co2_marque_df = join_df.drop("Marque / Modele")

In [10]:
# Preview the CO2 rows together with their matched catalogue brand.
co2_marque_df.show(5)

+---+-------------+---------------+-----------+------+
|_c0|Bonus / Malus|Rejets CO2 g/km|Cout enerie|Marque|
+---+-------------+---------------+-----------+------+
|  2|    -6 000€ 1|              0|      319 €|  Audi|
|  3|    -6 000€ 1|              0|      356 €|  Audi|
|  4|    -6 000€ 1|              0|      357 €|  Audi|
|  5|    -6 000€ 1|              0|      356 €|  Audi|
|  6|    -6 000€ 1|              0|      204 €|   BMW|
+---+-------------+---------------+-----------+------+
only showing top 5 rows



24/06/09 19:32:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
 Schema: _c0, Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
Expected: _c0 but found: 
CSV file: hdfs://localhost:9000/tpa_groupe_14/data/co2/CO2.csv


### Formatage de valeurs dans CO2

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

def clean_number(value):
    """Convert a raw "Bonus / Malus" or "Cout enerie" cell into an integer.

    Everything from the first "€" onward is discarded (for example the
    trailing marker in "-6 000€ 1"). The digits and sign characters that
    remain are joined and parsed, so "-6 000€ 1" gives -6000 and "319 €"
    gives 319.

    Args:
        value: Raw cell text, or None for a null cell.

    Returns:
        The parsed amount as an int, or 0 when the cell is null, contains a
        dash but no "€", or holds nothing that parses as a number.
    """
    # Null cells reach a Python UDF as None; the substring tests below would
    # raise TypeError on it.
    if value is None:
        return 0
    # A dash without any euro sign is treated as "no amount".
    if '-' in value and '€' not in value:
        return 0
    amount_text = value.split("€", 1)[0] if '€' in value else value
    number = ''.join(ch for ch in amount_text if ch.isdigit() or ch in '-+')
    try:
        return int(number)
    except ValueError:
        # Empty text, a lone sign or a misplaced sign: nothing usable.
        return 0

# clean_number returns Python ints, so the UDF is declared as IntegerType.
# Declaring StringType made Spark stringify the values, which the later
# avg() calls then had to cast back to numbers implicitly.
clean_value_udf = udf(clean_number, IntegerType())

# Replace the raw text of both monetary columns with their cleaned values.
co2_valid_df = (
    co2_marque_df
    .withColumn("Bonus / Malus", clean_value_udf(co2_marque_df["Bonus / Malus"]))
    .withColumn("Cout enerie", clean_value_udf(co2_marque_df["Cout enerie"]))
)

In [12]:
# Mark the cleaned frame for caching before running actions on it; it is
# reused by both aggregations in the following cells.
co2_valid_df.cache()
co2_valid_df.show(5)
print("CO2 valide : ", co2_valid_df.count())

24/06/09 19:32:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
 Schema: _c0, Marque / Modele, Bonus / Malus, Rejets CO2 g/km, Cout enerie
Expected: _c0 but found: 
CSV file: hdfs://localhost:9000/tpa_groupe_14/data/co2/CO2.csv
                                                                                

+---+-------------+---------------+-----------+------+
|_c0|Bonus / Malus|Rejets CO2 g/km|Cout enerie|Marque|
+---+-------------+---------------+-----------+------+
|  2|        -6000|              0|        319|  Audi|
|  3|        -6000|              0|        356|  Audi|
|  4|        -6000|              0|        357|  Audi|
|  5|        -6000|              0|        356|  Audi|
|  6|        -6000|              0|        204|   BMW|
+---+-------------+---------------+-----------+------+
only showing top 5 rows

CO2 valide :  389


### Valeurs de CO2 pour toutes les marques du catalogue

In [13]:
from pyspark.sql.functions import avg

# Per-brand means of "Bonus / Malus", "Rejets CO2 g/km" and "Cout enerie".
co2_marque_avg_df = co2_valid_df.groupBy("Marque").agg(
    avg("Bonus / Malus").alias("Bonus / Malus"),
    avg("Rejets CO2 g/km").alias("Rejets CO2 g/km"),
    avg("Cout enerie").alias("Cout enerie"),
)

In [14]:
# Means of the same three columns over every row of co2_valid_df; each
# result keeps the name of the column it was computed from.
co2_all_avg_df = co2_valid_df.select(
    *[
        avg(column_name).alias(column_name)
        for column_name in ("Bonus / Malus", "Rejets CO2 g/km", "Cout enerie")
    ]
)


In [15]:
# Mark both aggregates for caching before the actions below. cache() is lazy,
# so registering it first lets those actions populate the cache instead of
# computing the aggregates without it.
co2_marque_avg_df.cache()
co2_all_avg_df.cache()

# Display the results
print("Average by 'Marque'")
co2_marque_avg_df.show(5)
print("Average by 'Marque' : ", co2_marque_avg_df.count())

print("Average of all 'Marque'")
co2_all_avg_df.show()

Average by 'Marque'
+----------+-------------------+------------------+------------------+
|    Marque|      Bonus / Malus|   Rejets CO2 g/km|       Cout enerie|
+----------+-------------------+------------------+------------------+
|Volkswagen|-1714.2857142857142|23.428571428571427|              96.0|
|   Peugeot|            -3000.0|15.833333333333334|144.16666666666666|
|    Jaguar|            -6000.0|               0.0|             271.0|
|       Kia|            -4000.0|10.333333333333334|157.66666666666666|
|      Mini|            -3000.0|              21.5|             126.0|
+----------+-------------------+------------------+------------------+
only showing top 5 rows

Average by 'Marque' :  13
Average of all 'Marque'
+-----------------+------------------+-----------------+
|    Bonus / Malus|   Rejets CO2 g/km|      Cout enerie|
+-----------------+------------------+-----------------+
|5671.586118251928|152.25192802056554|611.6735218508998|
+-----------------+------------------+-----------------+

DataFrame[Bonus / Malus: double, Rejets CO2 g/km: double, Cout enerie: double]

In [16]:
# Brands present in the catalogue but absent from the per-brand CO2 averages.
# The join is expressed on the column *name*: co2_marque_avg_df["Marque"] is
# derived from marque_catalogue_df["Marque"], so comparing the two column
# objects directly refers to the same underlying column on both sides.
marque_only_catalogue_df = marque_catalogue_df.join(
    co2_marque_avg_df, on="Marque", how="left_anti"
)
print("Marque not in CO2 but in Cataloque: ", marque_only_catalogue_df.count())
marque_only_catalogue_df.show(5)

Marque not in CO2 but in Cataloque:  8
+------+
|Marque|
+------+
|Lancia|
|  Saab|
| Honda|
|  Seat|
|  Ford|
+------+
only showing top 5 rows



In [17]:
# Build CO2 figures for the brands missing from CO2: each missing brand is
# paired with the overall averages (co2_all_avg_df holds a single row).
co2_marque_catalogue_avg_df = marque_only_catalogue_df.crossJoin(co2_all_avg_df)

In [18]:
# Stack the per-brand averages and the fallback rows for the missing brands.
# unionByName matches columns by name rather than by position, so the result
# does not depend on both sides listing their columns in the same order.
co2_all_marque_catalogue = (
    co2_marque_avg_df
    .unionByName(co2_marque_catalogue_avg_df)
    # Rename the columns to lowercase names without spaces or slashes.
    .withColumnRenamed("Marque", "marque")
    .withColumnRenamed("Bonus / Malus", "bonusmalus")
    .withColumnRenamed("Rejets CO2 g/km", "rejetco2")
    .withColumnRenamed("Cout enerie", "coutenergie")
)

co2_all_marque_catalogue.show(5)

+----------+-------------------+------------------+------------------+
|    marque|         bonusmalus|          rejetco2|       coutenergie|
+----------+-------------------+------------------+------------------+
|Volkswagen|-1714.2857142857142|23.428571428571427|              96.0|
|   Peugeot|            -3000.0|15.833333333333334|144.16666666666666|
|    Jaguar|            -6000.0|               0.0|             271.0|
|       Kia|            -4000.0|10.333333333333334|157.66666666666666|
|      Mini|            -3000.0|              21.5|             126.0|
+----------+-------------------+------------------+------------------+
only showing top 5 rows



In [19]:
# Cast the three averaged metrics to strings, one column at a time; "marque"
# is left untouched.
co2_all_marque_catalogue_tx = co2_all_marque_catalogue
for metric_name in ("bonusmalus", "rejetco2", "coutenergie"):
    co2_all_marque_catalogue_tx = co2_all_marque_catalogue_tx.withColumn(
        metric_name, col(metric_name).cast("string")
    )

co2_all_marque_catalogue_tx.show(5)

+----------+-------------------+------------------+------------------+
|    marque|         bonusmalus|          rejetco2|       coutenergie|
+----------+-------------------+------------------+------------------+
|Volkswagen|-1714.2857142857142|23.428571428571427|              96.0|
|   Peugeot|            -3000.0|15.833333333333334|144.16666666666666|
|    Jaguar|            -6000.0|               0.0|             271.0|
|       Kia|            -4000.0|10.333333333333334|157.66666666666666|
|      Mini|            -3000.0|              21.5|             126.0|
+----------+-------------------+------------------+------------------+
only showing top 5 rows



### Intégration de CO2 dans catalogue

In [20]:
# Reminder of the catalogue layout before joining (brand is column _c1).
catalogue_hive_df.show(5)

+---+-----+------+---+-----------+---+---+-----+-----+-----+
|_c0|  _c1|   _c2|_c3|        _c4|_c5|_c6|  _c7|  _c8|  _c9|
+---+-----+------+---+-----------+---+---+-----+-----+-----+
|  1|Volvo|S80 T6|272|très longue|  5|  5|blanc|false|50500|
|  2|Volvo|S80 T6|272|très longue|  5|  5| noir|false|50500|
|  3|Volvo|S80 T6|272|très longue|  5|  5|rouge|false|50500|
|  4|Volvo|S80 T6|272|très longue|  5|  5| gris| true|35350|
|  5|Volvo|S80 T6|272|très longue|  5|  5| bleu| true|35350|
+---+-----+------+---+-----------+---+---+-----+-----+-----+
only showing top 5 rows



In [21]:
# Enrich every catalogue row with the CO2 figures of its brand (_c1).
catalogue_co2_df = catalogue_hive_df.join(
    co2_all_marque_catalogue_tx,
    on=catalogue_hive_df["_c1"] == co2_all_marque_catalogue_tx["marque"],
    how="inner",
)

In [22]:
# Preview the joined result: catalogue columns followed by the CO2 columns.
catalogue_co2_df.show(5)

+---+-----+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
|_c0|  _c1|   _c2|_c3|        _c4|_c5|_c6|  _c7|  _c8|  _c9|marque|bonusmalus|         rejetco2|      coutenergie|
+---+-----+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
|  1|Volvo|S80 T6|272|très longue|  5|  5|blanc|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  2|Volvo|S80 T6|272|très longue|  5|  5| noir|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  3|Volvo|S80 T6|272|très longue|  5|  5|rouge|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  4|Volvo|S80 T6|272|très longue|  5|  5| gris| true|35350| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  5|Volvo|S80 T6|272|très longue|  5|  5| bleu| true|35350| Volvo|       0.0|42.45454545454545|72.72727272727273|
+---+-----+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
only showing top 5 rows

In [23]:
# Row count after the join, to compare with the catalogue count printed
# when it was loaded.
print("Catalogue : ", catalogue_co2_df.count())

                                                                                

Catalogue :  270


In [24]:
# Drop the catalogue's own brand column; the joined "marque" column holds the
# same value because the join condition is _c1 == marque.
catalogue_co2_df = catalogue_co2_df.drop("_c1")

In [25]:
# Preview the final layout that is written out in the next cell.
catalogue_co2_df.show(5)

+---+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
|_c0|   _c2|_c3|        _c4|_c5|_c6|  _c7|  _c8|  _c9|marque|bonusmalus|         rejetco2|      coutenergie|
+---+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
|  1|S80 T6|272|très longue|  5|  5|blanc|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  2|S80 T6|272|très longue|  5|  5| noir|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  3|S80 T6|272|très longue|  5|  5|rouge|false|50500| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  4|S80 T6|272|très longue|  5|  5| gris| true|35350| Volvo|       0.0|42.45454545454545|72.72727272727273|
|  5|S80 T6|272|très longue|  5|  5| bleu| true|35350| Volvo|       0.0|42.45454545454545|72.72727272727273|
+---+------+---+-----------+---+---+-----+-----+-----+------+----------+-----------------+-----------------+
only showing top 5 rows

In [26]:
# Write the enriched catalogue to HDFS as CSV without a header line.
# mode="overwrite" keeps the notebook re-runnable: the default save mode
# raises an error when the target directory already exists.
catalogue_co2_df.write.csv("/tpa_groupe_14/mapreduce/", header=False, mode="overwrite")

                                                                                

In [27]:
# Release every DataFrame that was cached above, then stop the Spark session.
for cached_df in (
    catalogue_hive_df,
    marque_catalogue_df,
    co2_valid_df,
    co2_marque_avg_df,
    co2_all_avg_df,
):
    cached_df.unpersist()
spark.stop()