In [1]:
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import os
ROOT_DIR = '/workspace/NN'
os.chdir(ROOT_DIR)

import shutil
import kagglehub
import torch
from pyspark.sql import SparkSession
import socket

dataset_path = os.path.join(ROOT_DIR, 'neural', 'datasets', 'spark', 'test_1')
os.makedirs(dataset_path, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



print(socket.gethostbyname("spark-master"))
driver_host = socket.gethostbyname(socket.gethostname())
print(driver_host)
driver_host = socket.gethostbyname("spark-master")
print(driver_host)
driver_host = "producer"
os.environ["PYSPARK_SUBMIT_ARGS"] = f"""
--master spark://spark-master:7077
--conf spark.driver.host={driver_host}
--conf spark.driver.port=45555
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'  # путь к Python в контейнере
# os.environ['SPARK_HOME'] = '/opt/spark'            # путь к Spark, укажи реальный
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'

MONGO_USER = os.environ["MONGO_USER"]
MONGO_PASS = os.environ["MONGO_PASSWORD"]
MONGO_ADDR = f"{MONGO_USER}:{MONGO_PASS}@mongodb:27017"  # :27017

def spark_app_generator(name):
    spark = SparkSession.builder \
        .master("spark://spark-master:7077") \
        .appName(name) \
         .config("spark.driver.bindAddress", "0.0.0.0") \
        .config("spark.executor.memory", "1g") \
        .config("spark.driver.memory", "1g") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0") \
    .config("spark.mongodb.read.connection.uri", f"mongodb://{MONGO_ADDR}") \
    .config("spark.mongodb.write.connection.uri", f"mongodb://{MONGO_ADDR}") \
        .getOrCreate()
    return spark  # /openfoodfacts.products


172.18.0.3
172.22.0.5
172.18.0.3


In [None]:
MONGO_ADDR

In [None]:
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType, MapType,
                               DoubleType
                               )

# Пример определения схемы. Настройте схему под структуру ваших данных.
custom_schema = StructType([
    StructField("_id", StringType(), True),
    StructField("product_name", StringType(), True),
    # Если nutriments представляет собой динамические поля, лучше сохранить его как MapType.
    StructField("nutriments", MapType(StringType(), StringType()), True),
    # Если есть другие поля, укажите их типы.
    StructField("quantity", StringType(), True),
     StructField("ingredients_sweeteners_n", IntegerType(), True),
     StructField("ingredients_percent_analysis", IntegerType(), True),
     StructField("ingredients_non_nutritive_sweeteners_n", IntegerType(), True),
     StructField("ingredients_n", IntegerType(), True),
    StructField("ingredients_from_palm_oil_n", IntegerType(), True),
    StructField("ingredients_from_or_that_may_be_from_palm_oil_n", IntegerType(), True),
    StructField("additives_n", IntegerType(), True),
    StructField("unique_scans_n", IntegerType(), True),
    StructField("scans_n", IntegerType(), True),
    StructField("rev", IntegerType(), True),
    StructField("popularity_key", IntegerType(), True),
    StructField("packagings_n", IntegerType(), True),
    StructField("packagings_complete", IntegerType(), True),
    StructField("nutrition_score_warning_no_fiber", IntegerType(), True),
    StructField("nutrition_score_warning_fruits_vegetables_nuts_estimate", IntegerType(), True),
    StructField("nutrition_score_beverage", IntegerType(), True),
    StructField("nutriscore_score_opposite", IntegerType(), True),
    StructField("nutriscore_score", IntegerType(), True),
    StructField("environmental_score_score", IntegerType(), True),
    StructField("completeness", DoubleType(), True),
    StructField("complete", IntegerType(), True),

])

spark = spark_app_generator('test_mongo_reading')
df = spark.read.schema(custom_schema).format("mongodb") \
  .options(host="mongo:27017", database="off", collection='products').load()  # , database="off", collection='products'

# Просмотр схемы и первых строк
df.printSchema()
df.show(20)


In [3]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

spark = spark_app_generator('test_mongo_data_analysis')
df = spark.read.schema(custom_schema).format("mongodb") \
  .options(host="mongo:27017", database="off", collection='products').load()

# Выбор числовых признаков
numeric_features = ['nutriscore_score', 'environmental_score_score', 'popularity_key', 'popularity_key', "rev"]

# Удаление строк с пропущенными значениями
df_clean = df.select(numeric_features).dropna()

# Сбор признаков в вектор
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features_assembled")
df_vector = assembler.transform(df_clean)

# Масштабирование признаков
scaler = StandardScaler(inputCol="features_assembled", outputCol="features")
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

                                                                                

In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Обучение модели k-средних
kmeans = KMeans().setK(4).setSeed(1).setFeaturesCol("features")
model = kmeans.fit(df_scaled)

# Предсказание кластеров
predictions = model.transform(df_scaled)

# Оценка модели
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette}")

# Центры кластеров
centers = model.clusterCenters()
for idx, center in enumerate(centers):
    print(f"Cluster {idx} center: {center}")

25/04/15 21:09:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Silhouette Score: 0.4595071839606593
Cluster 0 center: [0.23533113 2.1008176  0.00316603 0.00316603 0.52181049]
Cluster 1 center: [ 0.97125113  2.1844216  -2.40970803 -2.40970803  1.36017413]
Cluster 2 center: [1.9046912  1.639037   0.00211394 0.00211394 0.46630675]
Cluster 3 center: [1.0371697  2.11290436 2.08015347 2.08015347 0.99501358]


                                                                                

In [None]:
# Не забудьте остановить SparkSession по завершении работы
spark.stop()