In [None]:
# Importation des bibliothèques nécessaires
import os

from pyspark.sql import SparkSession

In [None]:
# Configuration
# MySQL connection settings. Each value can be overridden through an
# environment variable so that real credentials never have to be committed
# with the notebook; the fallbacks are the original local development values.
db_url = os.environ.get(
    "WILDLENS_DB_URL",
    "jdbc:mysql://mysql-container:3306/wildlens?serverTimezone=UTC",
)
db_properties = {
    "user": os.environ.get("WILDLENS_DB_USER", "root"),
    "password": os.environ.get("WILDLENS_DB_PASSWORD", "root"),
    # JDBC driver class name handed to Spark for the MySQL connection.
    "driver": "com.mysql.cj.jdbc.Driver",
}

In [None]:
# Create (or reuse) the Spark session for this ETL run. The MySQL connector
# jar is registered through "spark.jars" so it is available to the session.
spark = (
    SparkSession.builder
    .appName("WildLens ETL - MSPR 24-25")
    .config("spark.jars", "/installation/mysql-connector-j-9.1.0.jar")
    .getOrCreate()
)

In [None]:
# Load the species information file: semicolon-separated, first row is the
# header, and column types are inferred by Spark.
species_info_path = './data/csv/infos_especes.csv'
species_info_df = (
    spark.read
    .options(sep=";", header=True, inferSchema=True)
    .csv(species_info_path)
)

# Preview the first five rows.
species_info_df.show(5)


In [None]:
# Replace missing values with the "missing value" placeholder.
# NOTE(review): DataFrame.fillna with a string value only fills string-typed
# columns; because the CSV is read with inferSchema=True, nulls in numeric
# columns are presumably left as null -- confirm this is intended.
df_filled = species_info_df.fillna("missing value")


def _is_unnamed(column_name):
    """Return True when a column has no real header name.

    A name counts as unnamed when it is blank, or when it has the form
    "_c<index>", which is the name Spark's CSV reader generates for a blank
    header cell. Without the second check the filter would never match,
    since the reader never yields an empty column name.
    """
    stripped = column_name.strip()
    if not stripped:
        return True
    return stripped.startswith("_c") and stripped[2:].isdigit()


# Keep only the columns that carry a real header name.
columns_with_name = [name for name in df_filled.columns if not _is_unnamed(name)]

# Flag abnormal columns: those holding at most one distinct value carry no
# information. Each check runs its own Spark job (one per column).
abnormal_columns = [
    name
    for name in columns_with_name
    if df_filled.select(name).distinct().count() <= 1
]

# Drop the abnormal columns, preserving the original column order.
columns_to_keep = [name for name in columns_with_name if name not in abnormal_columns]
cleaned_df = df_filled.select(*columns_to_keep)

# Show a preview of the cleaned data.
cleaned_df.show()


In [None]:
# Export the cleaned data as CSV with a header row; "overwrite" mode replaces
# any output already present at that path.
cleaned_df.write.mode("overwrite").option("header", True).csv("./output/data/nettoye.csv")

# Export the same data as Parquet, also in "overwrite" mode.
cleaned_df.write.mode("overwrite").parquet("./output/data/nettoye.parquet")


In [None]:
# Write the cleaned data to the MySQL table "Animaux" over JDBC, using the
# connection URL and properties defined in the configuration cell.
# "overwrite" mode replaces what the table currently holds.
(
    cleaned_df.write
    .mode("overwrite")
    .jdbc(url=db_url, table="Animaux", properties=db_properties)
)
