In [1]:
!pip install pyspark==3.5.5

[0m

# Run nested containers

```bash
 # Forward slashes keep the compose-file paths intact in bash (a backslash would escape the next character),
 # and --build is placed before the service list so it is always parsed as an option of `up`.
 docker-compose \
  -f ./docker/entrypoint.docker-compose.yml \
  -f ./docker/spark/spark.docker-compose.yml \
  --env-file=./env/.env \
  up --build \
      spark-master \
      spark-worker-1 \
      spark-worker-2 \
      producer \
      mongodb \
      redis \
      neo4j
```

# Spark driver init

In [1]:
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import os

ROOT_DIR = '/workspace/NN'
os.chdir(ROOT_DIR)

import torch
from pyspark.sql import SparkSession
import socket

# Filesystem locations used by this lab; only the dataset directory is created eagerly.
_neural_dir = os.path.join(ROOT_DIR, 'neural')
dataset_path = os.path.join(_neural_dir, 'datasets', 'spark', 'test_1')
weight_path = os.path.join(_neural_dir, 'weights', 'lab_5', 'kmeans')
os.makedirs(dataset_path, exist_ok=True)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

driver_host = "producer"

# Point the JVM launcher at the Java installation advertised by the container.
os.environ["JAVA_HOME"] = os.environ["K8S_JAVA_HOME"]

# Extra spark-submit arguments (executor sizing) followed by the mandatory `pyspark-shell` token.
_executor_memory = os.environ['SPARK_EXECUTOR_MEMORY']
_executor_cores = os.environ['SPARK_EXECUTOR_CORES']
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "\n"
    f"--conf spark.executor.memory={_executor_memory}\n"
    f"--conf spark.executor.cores={_executor_cores}\n"
    "pyspark-shell\n"
)

from urllib.parse import quote_plus

# --- Neo4j (Bolt endpoint and basic-auth credentials, all taken from the environment) ---
neo4j_url = f'bolt://{os.environ["NEO4J_HOST"]}:{os.environ["NEO4J_PORT"]}'
neo4j_user = os.environ["NEO4J_USER"]
neo4j_pass = os.environ["NEO4J_PASSWORD"]

# --- MongoDB ---
MONGO_USER = os.environ["MONGO_USER"]
MONGO_PASS = os.environ["MONGO_PASSWORD"]
# "user:password@host:port" part of the MongoDB URI. The credentials are percent-encoded because
# characters such as ':', '/', '@' or '%' would otherwise be parsed as URI delimiters.
# NOTE(review): assumes the env vars hold the raw (not already percent-encoded) credentials — confirm.
MONGO_ADDR = f"{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASS)}@mongodb:27017"

# --- Redis (values are kept as the strings read from the environment) ---
REDIS_HOST = os.environ["REDIS_HOST"]
REDIS_PORT = os.environ["REDIS_PORT"]
REDIS_DB = os.environ["REDIS_DB"]


def spark_app_generator(name):
    """Build (or fetch) the SparkSession used by this notebook.

    The session targets the master given by ``SPARK_MASTER``, requests the
    MongoDB, Neo4j and Redis connector packages, and is configured with the
    connection settings held in the module-level ``MONGO_ADDR``, ``REDIS_*``
    and ``neo4j_*`` variables.

    Args:
        name: Application name passed to ``appName``.

    Returns:
        The session returned by ``getOrCreate()``.

    Raises:
        KeyError: If one of the ``SPARK_*`` environment variables read here is unset.
    """
    builder = SparkSession.builder.master(os.environ["SPARK_MASTER"]).appName(name)

    # Settings are applied in insertion order, one ``.config`` call per entry.
    # HDFS-related settings (hadoop home, defaultFS, namenode hostname) are intentionally not applied.
    settings = {
        # NOTE: the environment variable name is spelled "BINDADDRES" in the deployment; keep it as is.
        "spark.driver.bindAddress": os.environ['SPARK_DRIVER_BINDADDRES'],
        "spark.executor.memory": os.environ['SPARK_EXECUTOR_MEMORY'],
        "spark.driver.memory": os.environ['SPARK_DRIVER_MEMORY'],
        "spark.jars.packages": ",".join((
            "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0",
            "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.1_for_spark_3",
            "com.redislabs:spark-redis_2.12:3.1.0",
        )),
        "spark.mongodb.read.connection.uri": f"mongodb://{MONGO_ADDR}",
        "spark.mongodb.write.connection.uri": f"mongodb://{MONGO_ADDR}",
        "spark.redis.host": REDIS_HOST,
        "spark.redis.port": REDIS_PORT,
        "spark.redis.db": REDIS_DB,
        "spark.neo4j.bolt.url": neo4j_url,
        "spark.neo4j.bolt.user": neo4j_user,
        "spark.neo4j.bolt.password": neo4j_pass,
        "neo4j.url": neo4j_url,
        "neo4j.authentication.basic.username": neo4j_user,
        "neo4j.authentication.basic.password": neo4j_pass,
    }
    for key, value in settings.items():
        builder = builder.config(key, value)

    spark = builder.getOrCreate()

    # Echo the connector packages the running context actually carries.
    print(spark.sparkContext.getConf().get("spark.jars.packages"))
    return spark


172.22.0.4
172.22.0.5
172.22.0.4


## Create dataframe schema

In [None]:
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType, MapType,
                               DoubleType
                               )

# Example schema definition; adjust it to the structure of your data.
# (Dynamic sub-documents such as nutriments would be better modelled as a MapType column.)
_text_columns = ("_id", "product_name")
_count_columns = ("ingredients_n", "ingredients_sweeteners_n", "scans_n", "additives_n")

custom_schema = StructType(
    [StructField(column, StringType(), True) for column in _text_columns]
    + [StructField(column, IntegerType(), nullable=True) for column in _count_columns]
)

# Start (or reuse) the Spark session for this lab.
spark = spark_app_generator('lab_7')

---
## This code runs in the data mart service

```scala
package com.example.datamart.transform

import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}
import org.apache.spark.sql.{DataFrame, functions => F}

object Preprocessing {
  /** Numeric product columns that are combined into the feature vector. */
  private val FeatureCols: Array[String] =
    Array("ingredients_n", "ingredients_sweeteners_n", "scans_n", "additives_n")

  /**
   * Turns the numeric product columns into a single scaled `features` vector column.
   *
   * Steps: select the feature columns, drop rows containing nulls, assemble the
   * columns into `features_assembled`, then fit a `StandardScaler` (default settings)
   * on the assembled data and apply it.
   *
   * @param df input frame; must contain every column listed in `FeatureCols`
   * @return a frame with exactly one column, `features`
   */
  def transform(df: DataFrame): DataFrame = {
    val selected = df.select(FeatureCols.map(F.col): _*).na.drop()

    val assembler = new VectorAssembler()
      .setInputCols(FeatureCols)
      .setOutputCol("features_assembled")

    val assembled = assembler.transform(selected)

    // The scaler is fitted on the same data it is later applied to.
    val scaler = new StandardScaler()
      .setInputCol("features_assembled")
      .setOutputCol("features")
      .fit(assembled)

    scaler.transform(assembled).select("features")
  }
}
```

```scala
package com.example.datamart

import org.apache.spark.sql.types._

import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{SaveMode, SparkSession}

object DataMartApp {
  /**
   * Copies cleaned product records from MongoDB into Neo4j.
   *
   * Reads the `products` collection of the `off` database with a fixed schema,
   * drops every row that has a null in any selected column and writes the rest
   * as Neo4j nodes labelled `CleanedProduct`, keyed by `_id`.
   *
   * Required environment variables: `MONGO_URI`, `NEO4J_URL`, `NEO4J_USER`, `NEO4J_PASS`.
   * A missing variable makes `sys.env(...)` throw `NoSuchElementException`.
   */
  def main(args: Array[String]): Unit = {
    // Connection settings are taken from the environment only.
    val mongoUri = sys.env("MONGO_URI")
    val neo4jUrl = sys.env("NEO4J_URL")
    val neo4jUser = sys.env("NEO4J_USER")
    val neo4jPass = sys.env("NEO4J_PASS")

    // Initialize SparkSession with MongoDB and Neo4j settings.
    // NOTE(review): only the Neo4j connector is listed in spark.jars.packages; the MongoDB
    // connector is presumably supplied by the build or the submit command — confirm.
    val spark = SparkSession.builder()
      .appName("DataMartApp")
      .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.1_for_spark_3")
      .config("neo4j.url", neo4jUrl)
      .config("neo4j.authentication.basic.username", neo4jUser)
      .config("neo4j.authentication.basic.password", neo4jPass)
      .config("spark.mongodb.read.connection.uri", mongoUri)
      .config("spark.neo4j.bolt.url", neo4jUrl)
      .config("spark.neo4j.bolt.user", neo4jUser)
      .config("spark.neo4j.bolt.password", neo4jPass)
      .getOrCreate()

    // Stop the session even when the read or the write fails.
    try {
      val customSchema = StructType(Seq(
        StructField("_id", StringType, nullable = true),
        StructField("product_name", StringType, nullable = true),
        StructField("ingredients_n", IntegerType, nullable = true),
        StructField("ingredients_sweeteners_n", IntegerType, nullable = true),
        StructField("scans_n", IntegerType, nullable = true),
        StructField("additives_n", IntegerType, nullable = true)
      ))

      // Read raw data from MongoDB. The URI comes from MONGO_URI rather than a hard-coded host,
      // and no schema-inference options are set because the schema is supplied explicitly.
      val rawDF = spark.read
        .format("mongodb")
        .schema(customSchema)
        .option("connection.uri", mongoUri)
        .option("database", "off")
        .option("collection", "products")
        .load()

      // Data preprocessing: keep the schema columns and drop rows with nulls.
      val cleanDF = rawDF
        .select(
          rawDF("_id"),
          rawDF("product_name"),
          rawDF("ingredients_n"),
          rawDF("ingredients_sweeteners_n"),
          rawDF("scans_n"),
          rawDF("additives_n")
        )
        .na.drop()

      // Write cleaned data as Neo4j nodes labeled 'CleanedProduct'.
      cleanDF.write
        .format("org.neo4j.spark.DataSource")
        .mode(SaveMode.Overwrite)
        .option("labels", "CleanedProduct")
        .option("node.keys", "_id")
        .save()
    } finally {
      spark.stop()
    }
  }
}
```

## Load clean data from the data mart (intermediate) DB

In [2]:

# Read every node labelled CleanedProduct from Neo4j into a DataFrame.
neo4j_reader = spark.read.format("org.neo4j.spark.DataSource")
df = neo4j_reader.option("labels", "CleanedProduct").load()

# Inspect the result: row count, schema and the first rows.
print(df.count())
df.printSchema()
df.show(n=20)




:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
com.redislabs#spark-redis_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1df7e46a-3032-4ae5-b5fc-8df227feec1f;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.3.0 in central
	found org.mongodb#mongodb-driver-sync;4.8.2 in central
	[4.8.2] org.mongodb#mongodb-driver-sync;[4.8.1,4.8.99)
	found org.mongodb#bson;4.8.2 in central
	found org.mongodb#mongodb-driver-core;4.8.2 in central
	found org.mongodb#bson-record-codec;4.8.2 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12;5.3.1_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.3.1 in central
	found org.neo4j.driver#neo4j-java-driver;4.4.17 in central
	found org.reactivestreams#reactive-streams

org.mongodb.spark:mongo-spark-connector_2.12:10.3.0,org.neo4j:neo4j-connector-apache-spark_2.12:5.3.1_for_spark_3,com.redislabs:spark-redis_2.12:3.1.0


25/04/26 20:03:25 WARN SchemaService: Switching to query schema resolution
                                                                                

500435
root
 |-- <id>: long (nullable = false)
 |-- <labels>: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- scans_n: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- ingredients_sweeteners_n: long (nullable = true)
 |-- ingredients_n: long (nullable = true)
 |-- additives_n: long (nullable = true)
 |-- _id: string (nullable = true)



                                                                                

+----+----------------+-------+--------------------+------------------------+-------------+-----------+-------------+
|<id>|        <labels>|scans_n|        product_name|ingredients_sweeteners_n|ingredients_n|additives_n|          _id|
+----+----------------+-------+--------------------+------------------------+-------------+-----------+-------------+
|   0|[CleanedProduct]|      1|      Genovese pesto|                       0|           17|          0|0011110899378|
|   1|[CleanedProduct]|      2|        Marshmallows|                       0|            9|          3|0015400023931|
|   2|[CleanedProduct]|      3|    Meatless Chorizo|                       0|           20|          1|0011110129345|
|   3|[CleanedProduct]|      2|                Cola|                       0|            6|          2|0011110496805|
|   4|[CleanedProduct]|      1|Wrap Poulet à la ...|                       0|           51|          5|     00099332|
|   5|[CleanedProduct]|      1|        Berry Medley|    

## Train the KMeans model

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Assemble the numeric columns into a single vector column.
numeric_features_2 = ['ingredients_n', 'ingredients_sweeteners_n', 'scans_n', 'additives_n']
assembler = (
    VectorAssembler()
    .setInputCols(numeric_features_2)
    .setOutputCol("features_assembled")
)
df_vector = assembler.transform(df)

# Scale the assembled features; the scaler is fitted on the full data set.
scaler = (
    StandardScaler()
    .setInputCol("features_assembled")
    .setOutputCol("features")
)
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Train k-means (4 clusters, fixed seed) on the scaled feature vectors.
# For a quick experiment, fit on a sample such as df_scaled.limit(100) instead.
kmeans = KMeans(k=4, seed=1, featuresCol="features")
model = kmeans.fit(df_scaled)



[Stage 10:>                                                         (0 + 1) / 1]

## Get cluster centers

In [None]:
# Assign each row to a cluster.
predictions = model.transform(df_scaled)

# Keep the identifying columns, the raw inputs, the scaled vector and the cluster id.
report_columns = [
    "_id",
    "product_name",
    'ingredients_n', 'ingredients_sweeteners_n', 'scans_n', 'additives_n',
    "features",
    "prediction",
]
result = predictions.select(*report_columns)

# Evaluate the clustering on the full prediction frame.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Print the cluster centers, one per line.
centers = model.clusterCenters()
for idx in range(len(centers)):
    center = centers[idx]
    print(f"Cluster {idx} center: {center}")

In [None]:
# Preview the first 10 rows of the clustering result.
result.show(n=10)

## Save results

In [None]:

# Store the clustering result in Redis under the "results_lab_7" table, keyed by `_id`,
# replacing whatever was written there before.
redis_writer = result.write.format("org.apache.spark.sql.redis")
(
    redis_writer
    .option("table", "results_lab_7")
    .option("key.column", "_id")
    .mode("overwrite")
    .save()
)

In [6]:
# Remember to stop the SparkSession once the work is finished.
spark.stop()