In [2]:
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import os
ROOT_DIR = '/workspace/NN'
os.chdir(ROOT_DIR)

import shutil
import kagglehub
import torch
from pyspark.sql import SparkSession
import socket

dataset_path = os.path.join(ROOT_DIR, 'neural', 'datasets', 'spark', 'test_1')
os.makedirs(dataset_path, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



print(socket.gethostbyname("spark-master"))
driver_host = socket.gethostbyname(socket.gethostname())
print(driver_host)
driver_host = socket.gethostbyname("spark-master")
print(driver_host)
driver_host = "producer"
os.environ["PYSPARK_SUBMIT_ARGS"] = f"""
--master spark://spark-master:7077
--conf spark.driver.host={driver_host}
--conf spark.driver.port=45555
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'  # путь к Python в контейнере
# os.environ['SPARK_HOME'] = '/opt/spark'            # путь к Spark, укажи реальный
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'

MONGO_USER = "admin"
MONGO_PASS = "password"
MONGO_ADDR = f"{MONGO_USER}:{MONGO_PASS}@mongodb:27017"  # :27017

def spark_app_generator(name):
    spark = SparkSession.builder \
        .master("spark://spark-master:7077") \
        .appName(name) \
         .config("spark.driver.bindAddress", "0.0.0.0") \
        .config("spark.executor.memory", "1g") \
        .config("spark.driver.memory", "1g") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0") \
    .config("spark.mongodb.read.connection.uri", f"mongodb://{MONGO_ADDR}") \
    .config("spark.mongodb.write.connection.uri", f"mongodb://{MONGO_ADDR}") \
        .getOrCreate()
    return spark  # /openfoodfacts.products


172.21.0.2
172.21.0.5
172.21.0.2


In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType

# Пример определения схемы. Настройте схему под структуру ваших данных.
custom_schema = StructType([
    StructField("_id", StringType(), True),
    StructField("product_name", StringType(), True),
    # Если nutriments представляет собой динамические поля, лучше сохранить его как MapType.
    StructField("nutriments", MapType(StringType(), StringType()), True),
    # Если есть другие поля, укажите их типы.
    StructField("quantity", StringType(), True)
])

spark = spark_app_generator('test_mongo_reading')
df = spark.read.schema(custom_schema).format("mongodb") \
  .options(host="mongo:27017", database="off", collection='products').load()  # , database="off", collection='products'

# Просмотр схемы и первых строк
df.printSchema()
df.show(50)


root
 |-- _id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- nutriments: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- quantity: string (nullable = true)

+------------+--------------------+--------------------+--------------------+
|         _id|        product_name|          nutriments|            quantity|
+------------+--------------------+--------------------+--------------------+
|            |                NULL|                  {}|                NULL|
|    00000000|           erytritol|{potassium_servin...|            150 gram|
|000000000054|Limonade artisana...|                  {}|                NULL|
|000000000063|Mozzarella Schnit...|{fat_100g -> 25, ...|                NULL|
|000000000114|       Chocolate n 3|{fat_100g -> 44, ...|                80 g|
|    00000001|Wild Norwegian El...|{potassium_servin...|  280gr. 320 Kapseln|
|  0000000105|Paleta gran reser...|{fruits-vegetable...|     

In [13]:
# Не забудьте остановить SparkSession по завершении работы
spark.stop()

In [10]:
!pip install pymongo pymongo-schema

Collecting pymongo-schema
  Downloading pymongo_schema-0.4.1-py3-none-any.whl.metadata (16 kB)
Collecting docopt (from pymongo-schema)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ete3 (from pymongo-schema)
  Downloading ete3-3.1.3.tar.gz (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting xlwt (from pymongo-schema)
  Downloading xlwt-1.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting xlsxwriter (from pymongo-schema)
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Collecting openpyxl (from pymongo-schema)
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting future>=0.18.0 (from pymongo-schema)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting et-xmlfile (from openpyxl->pymongo-schema

In [5]:
import pymongo
maxSevSelDelay = 1 # Assume 1ms maximum server selection
client = pymongo.MongoClient(f"mongodb://{MONGO_ADDR}",
                                 serverSelectionTimeoutMS=maxSevSelDelay)

client.server_info()

{'version': '6.0.22',
 'gitVersion': 'ee527360b84c6798535ee0895de3c7186b3522f9',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [6, 0, 22, 0],
 'openssl': {'running': 'OpenSSL 3.0.2 15 Mar 2022',
  'compiled': 'OpenSSL 3.0.2 15 Mar 2022'},
 'buildEnvironment': {'distmod': 'ubuntu2204',
  'distarch': 'x86_64',
  'cc': '/opt/mongodbtoolchain/v3/bin/gcc: gcc (GCC) 8.5.0',
  'ccflags': '-Werror -include mongo/platform/basic.h -ffp-contract=off -fasynchronous-unwind-tables -ggdb -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -fno-omit-frame-pointer -fno-strict-aliasing -O2 -march=sandybridge -mtune=generic -mprefer-vector-width=128 -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -fdebug-types-section -Wa,--nocompress-debug-sections -fno-builtin-memcmp',
  'cxx': '/opt/mongodbtoolchain/v3

In [7]:
client.list_database_names()
client.off.list_collection_names()

['products']

In [11]:
import pymongo
from pymongo_schema.extract import extract_pymongo_client_schema
schema = extract_pymongo_client_schema(client)
schema

OperationFailure: not authorized on config to execute command { count: "system.sessions", lsid: { id: UUID("beed8ced-129d-4cee-9761-49a81d635588") }, $db: "config" }, full error: {'ok': 0.0, 'errmsg': 'not authorized on config to execute command { count: "system.sessions", lsid: { id: UUID("beed8ced-129d-4cee-9761-49a81d635588") }, $db: "config" }', 'code': 13, 'codeName': 'Unauthorized'}

In [18]:
import pyspark
pyspark.__version__

'3.5.5'