In [1]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os
import socket

# Load environment variables from the project's .env file.
load_dotenv("/home/jovyan/work/.env")

# PostgreSQL connection settings. os.getenv() yields None for an unset
# variable, so the mandatory ones are checked right below.
db_user = os.getenv("POSTGRES_USER")
db_password = os.getenv("POSTGRES_PASSWORD")
db_host = "postgres_dvf"  # hard-coded host name, not read from the environment
db_port = os.getenv("DB_PORT", "5432")
db_name = os.getenv("POSTGRES_DB")

# Fail fast with an explicit message instead of letting a None value end up
# inside the JDBC URL / options, where it would only surface later as an
# opaque driver error.
_missing_db_vars = [
    var_name
    for var_name, var_value in (
        ("POSTGRES_USER", db_user),
        ("POSTGRES_PASSWORD", db_password),
        ("POSTGRES_DB", db_name),
    )
    if var_value is None
]
if _missing_db_vars:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_db_vars)
        + " (expected in /home/jovyan/work/.env)"
    )

# Get the IP address of the jupyter_lab container on the Docker network.
def get_docker_ip():
    """Return the IP address that this machine's own host name resolves to."""
    return socket.gethostbyname(socket.gethostname())

# Address that is advertised to the cluster as the driver host (see below).
docker_ip = get_docker_ip()
print(f"Adresse IP du conteneur jupyter_lab: {docker_ip}")

# Create (or reuse) the Spark session.
spark = (
    SparkSession.builder
    .appName("LectureDelinquanceIDF")
    .master("spark://spark:7077")
    # PostgreSQL JDBC driver, resolved from its Maven coordinates.
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    # Resource settings for the driver and the executors.
    .config("spark.driver.memory", "512m")
    .config("spark.driver.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "1")
    .config("spark.sql.shuffle.partitions", "4")
    # Advertise the container's resolved IP as the driver host, while binding
    # the driver on all local interfaces.
    .config("spark.driver.host", docker_ip)
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)

# Connection test
print("Version de Spark :", spark.version)

# Read the delinquance_idf table from PostgreSQL over JDBC.
try:
    jdbc_options = {
        "url": f"jdbc:postgresql://{db_host}:{db_port}/{db_name}",
        "dbtable": "delinquance_idf",
        "user": db_user,
        "password": db_password,
        "driver": "org.postgresql.Driver",
    }
    df = spark.read.format("jdbc").options(**jdbc_options).load()

    # Show the first 5 rows.
    print("Aperçu des données (delinquance_idf) :")
    df.limit(5).show()

except Exception as err:
    # Any failure is reported as a message; the exception is not re-raised.
    print(f"❌ Erreur lors de la lecture de la table : {err}")


Adresse IP du conteneur jupyter_lab: 172.22.0.6
Version de Spark : 3.5.0
Aperçu des données (delinquance_idf) :
+-----------+-----+--------------------+---------------+--------+---------------+-----------+---------+-------------------+---------+-------------------+----------------------+--------------------+
|CODGEO_2025|annee|          indicateur|unite_de_compte|  nombre|taux_pour_mille|est_diffuse|insee_pop|insee_pop_millesime|insee_log|insee_log_millesime|complement_info_nombre|complement_info_taux|
+-----------+-----+--------------------+---------------+--------+---------------+-----------+---------+-------------------+---------+-------------------+----------------------+--------------------+
|      75056| 2016|Violences physiqu...|        Victime| 3307.00|           NULL|       diff|  2190327|               2016|  1374377|               2016|                  NULL|                NULL|
|      75056| 2016|Violences physiqu...|        Victime| 9738.00|           NULL|       diff|  2

In [2]:
from pyspark.sql import SparkSession

import socket

# Address that this container's own host name resolves to. Executors running
# in other containers must be able to reach the driver at spark.driver.host,
# which a loopback address (127.0.0.1) does not allow.
driver_ip = socket.gethostbyname(socket.gethostname())

# The master is addressed by its service name rather than by a container IP:
# Docker-assigned IPs can change between runs, a service name does not.
# NOTE(review): getOrCreate() returns the session already running in this
# kernel if there is one, in which case the settings below are not applied.
spark = (
    SparkSession.builder
    .appName("test2")
    .config("spark.master", "spark://spark:7077")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .config("spark.driver.host", driver_ip)
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)

# Connection test
print(spark.version)


3.5.0


In [4]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

# Load environment variables from the project's .env file.
load_dotenv("/home/jovyan/work/.env")

# MongoDB connection settings (None when the variable is not set).
mongo_user = os.environ.get("MONGO_INITDB_ROOT_USERNAME")
mongo_password = os.environ.get("MONGO_INITDB_ROOT_PASSWORD")
mongo_host = "mongodb_delinquance"
mongo_port = os.environ.get("MONGO_PORT", "27017")
mongo_db = os.environ.get("MONGO_DB")
mongo_collection = "idf_2023_2024"  # Replace with the name of your collection

# Spark session with the MongoDB connector package.
# NOTE(review): getOrCreate() returns the session already running in this
# kernel if there is one; the connector package requested here is presumably
# not loaded in that case -- confirm, and restart the kernel if needed.
spark = (
    SparkSession.builder
    .appName("MongoSparkConnector")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

# Connection test
print(spark.version)




3.5.0


In [14]:
from urllib.parse import quote_plus

# Build the MongoDB URI from the settings loaded from the .env file
# (mongo_user, mongo_password, mongo_host, mongo_port, mongo_db and
# mongo_collection) instead of embedding the username and password in the
# notebook source.
# NOTE(review): the credentials that were previously written here in clear
# text should be considered exposed and rotated.
_missing_mongo_vars = [
    var_name
    for var_name, var_value in (
        ("MONGO_INITDB_ROOT_USERNAME", mongo_user),
        ("MONGO_INITDB_ROOT_PASSWORD", mongo_password),
        ("MONGO_DB", mongo_db),
    )
    if not var_value
]
if _missing_mongo_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_mongo_vars)
    )

# User name and password are percent-encoded so that reserved characters
# such as '@', ':' or '/' cannot break the URI.
uri = (
    f"mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_password)}"
    f"@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}"
    "?authSource=admin"
)

# Read the collection from MongoDB.
df = spark.read.format("mongo").option("uri", uri).load()

# Show the first 5 documents.
df.limit(5).show()


+-------+--------+--------+--------+--------+-----+------------+----------------+-----------+---------------+---------+------------------+-------------+--------------+-----------------------+---------------+---------+--------------+-------+-------+--------------+-------------------------+------------------+-------+-------------------------+--------------------------+--------------------------+--------------------------+--------------------------+-------------------+---------------+------------+-----------+---------------+----------------+--------------------+-----+
|1er lot|2eme lot|3eme lot|4eme lot|5eme lot|B/T/Q|Code commune|Code departement|Code postal|Code type local|Code voie|           Commune|Date mutation|Nature culture|Nature culture speciale|Nature mutation|No Volume|No disposition|No plan|No voie|Nombre de lots|Nombre pieces principales|Prefixe de section|Section|Surface Carrez du 1er lot|Surface Carrez du 2eme lot|Surface Carrez du 3eme lot|Surface Carrez du 4eme lot|Surfa