In [2]:
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import os
ROOT_DIR = '/workspace/NN'
# Make the project root the working directory for the rest of the notebook.
os.chdir(ROOT_DIR)

import shutil
import socket
from urllib.parse import quote

import kagglehub
import torch
from pyspark.sql import SparkSession

# Local directory for this experiment's dataset; created on first run.
dataset_path = os.path.join(ROOT_DIR, 'neural', 'datasets', 'spark', 'test_1')
os.makedirs(dataset_path, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Connectivity diagnostics: show what "spark-master" and this host resolve to.
# The lookups raise early if name resolution is broken. The printed lines are
# the same three as before (master, local, master); the values are no longer
# assigned to driver_host, because they were always overwritten below.
spark_master_ip = socket.gethostbyname("spark-master")
local_ip = socket.gethostbyname(socket.gethostname())
print(spark_master_ip)
print(local_ip)
print(spark_master_ip)

# Hostname the driver advertises to the cluster.
# NOTE(review): presumably the service name of this container - confirm.
driver_host = "producer"
os.environ["PYSPARK_SUBMIT_ARGS"] = f"""
--master spark://spark-master:7077
--conf spark.driver.host={driver_host}
--conf spark.driver.port=45555
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'  # path to Python inside the container
# os.environ['SPARK_HOME'] = '/opt/spark'            # path to Spark, set the real one
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'

# Credentials come from the environment; a missing variable raises KeyError.
MONGO_USER = os.environ["MONGO_USER"]
MONGO_PASS = os.environ["MONGO_PASSWORD"]
# Percent-encode the credentials: characters such as '@', ':', '/', '?', '#'
# or '%' in a user name or password would otherwise corrupt the connection URI.
MONGO_ADDR = f"{quote(MONGO_USER, safe='')}:{quote(MONGO_PASS, safe='')}@mongodb:27017"  # :27017

def spark_app_generator(name):
    """Return a SparkSession for the standalone cluster, wired to MongoDB.

    Args:
        name: Application name passed to ``appName``.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()`` with
        the master URL, memory settings, the MongoDB connector package and
        the read/write connection URIs (built from the module-level
        ``MONGO_ADDR``) applied.
    """
    mongo_uri = f"mongodb://{MONGO_ADDR}"  # /openfoodfacts.products

    # Settings are applied in this exact order on top of master + app name.
    settings = (
        ("spark.driver.bindAddress", "0.0.0.0"),
        ("spark.executor.memory", "1g"),
        ("spark.driver.memory", "1g"),
        ("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"),
        ("spark.mongodb.read.connection.uri", mongo_uri),
        ("spark.mongodb.write.connection.uri", mongo_uri),
    )

    builder = SparkSession.builder.master("spark://spark-master:7077").appName(name)
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()


172.21.0.2
172.21.0.5
172.21.0.2


In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType

# Example schema definition. Adjust it to match the structure of your data.
custom_schema = StructType([
    StructField("_id", StringType(), True),
    StructField("product_name", StringType(), True),
    # nutriments holds dynamic fields, so it is kept as a MapType.
    StructField("nutriments", MapType(StringType(), StringType()), True),
    # If there are other fields, declare their types here.
    StructField("quantity", StringType(), True),
])

spark = spark_app_generator('test_mongo_reading')
# The server address comes from spark.mongodb.read.connection.uri, which
# spark_app_generator sets on the session. The former host="mongo:27017"
# option is not a documented option of the v10 connector (and named a
# different host than the configured URI), so only the database and
# collection are passed here.
df = (
    spark.read.schema(custom_schema)
    .format("mongodb")
    .options(database="off", collection="products")
    .load()
)

# Inspect the schema and the first rows.
df.printSchema()
df.show(50)


root
 |-- _id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- nutriments: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- quantity: string (nullable = true)

+------------+--------------------+--------------------+--------------------+
|         _id|        product_name|          nutriments|            quantity|
+------------+--------------------+--------------------+--------------------+
|            |                NULL|                  {}|                NULL|
|    00000000|           erytritol|{potassium_servin...|            150 gram|
|000000000054|Limonade artisana...|                  {}|                NULL|
|000000000063|Mozzarella Schnit...|{fat_100g -> 25, ...|                NULL|
|000000000114|       Chocolate n 3|{fat_100g -> 44, ...|                80 g|
|    00000001|Wild Norwegian El...|{potassium_servin...|  280gr. 320 Kapseln|
|  0000000105|Paleta gran reser...|{fruits-vegetable...|     

In [7]:
# Remember to stop the SparkSession once the work is finished.
spark.stop()