In [2]:
from pyspark.sql import SparkSession
import random
import os

# Tell PySpark which Python executable to use: whatever `python` resolves to
# on PATH.
os.environ['PYSPARK_PYTHON'] = 'python'
spark = SparkSession.builder.appName("ProductCategoryExample").getOrCreate()

# Dataset sizes. These are the actual row counts: ids run from 1 to n inclusive.
n_products = 100
n_categories = 10

# Products: one (product_id, product_name) row per id in 1..n_products.
products = [(i, f"Product_{i}") for i in range(1, n_products + 1)]
df_products = spark.createDataFrame(products, ["product_id", "product_name"])

# Categories: one (category_id, category_name) row per id in 1..n_categories.
categories = [(i, f"Category_{i}") for i in range(1, n_categories + 1)]
df_categories = spark.createDataFrame(categories, ["category_id", "category_name"])

# Product <-> category links (many-to-many). Each product gets between 0 and 5
# distinct categories, so some products end up with no category at all.
random.seed(42)  # fixed seed keeps the generated links reproducible
category_ids = range(1, n_categories + 1)
# random.sample raises ValueError when asked for more items than exist, so the
# upper bound is capped by the number of categories.
max_categories_per_product = min(5, n_categories)
links = []
for product_id in range(1, n_products + 1):
    num_categories = random.randint(0, max_categories_per_product)  # inclusive bounds
    chosen_categories = random.sample(category_ids, num_categories)  # no repeats
    for cat_id in chosen_categories:
        links.append((product_id, cat_id))

df_links = spark.createDataFrame(links, ["product_id", "category_id"])

In [3]:
# Preview the first 20 rows of the product table.
df_products.show(n=20)

+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|   Product_1|
|         2|   Product_2|
|         3|   Product_3|
|         4|   Product_4|
|         5|   Product_5|
|         6|   Product_6|
|         7|   Product_7|
|         8|   Product_8|
|         9|   Product_9|
|        10|  Product_10|
|        11|  Product_11|
|        12|  Product_12|
|        13|  Product_13|
|        14|  Product_14|
|        15|  Product_15|
|        16|  Product_16|
|        17|  Product_17|
|        18|  Product_18|
|        19|  Product_19|
|        20|  Product_20|
+----------+------------+
only showing top 20 rows


In [4]:
# Print the category table; 20 rows is show()'s default limit, spelled out here.
df_categories.show(n=20)

+-----------+-------------+
|category_id|category_name|
+-----------+-------------+
|          1|   Category_1|
|          2|   Category_2|
|          3|   Category_3|
|          4|   Category_4|
|          5|   Category_5|
|          6|   Category_6|
|          7|   Category_7|
|          8|   Category_8|
|          9|   Category_9|
|         10|  Category_10|
+-----------+-------------+



In [5]:
# Preview the first 50 (product_id, category_id) link rows.
df_links.show(n=50)

+----------+-----------+
|product_id|category_id|
+----------+-----------+
|         1|          2|
|         1|          1|
|         1|          5|
|         1|         10|
|         1|          7|
|         2|          2|
|         3|          9|
|         3|          2|
|         3|          7|
|         3|          1|
|         3|          8|
|         5|          4|
|         6|         10|
|         6|          1|
|         6|          4|
|         6|          6|
|         7|          9|
|         7|          7|
|         7|          4|
|         7|          8|
|         7|          5|
|         8|          1|
|         8|          3|
|         9|          7|
|         9|          6|
|         9|          5|
|         9|          2|
|         9|         10|
|        10|          2|
|        10|         10|
|        11|          2|
|        11|          6|
|        11|          9|
|        12|          5|
|        12|          1|
|        12|          8|
|        12|         10|


In [6]:
# Anti-join: keep only the products whose product_id never appears in df_links,
# i.e. the products that were assigned no category.
(
    df_products
    .join(df_links, on="product_id", how="left_anti")
    .show()
)

+----------+------------+
|product_id|product_name|
+----------+------------+
|         4|   Product_4|
|        13|  Product_13|
|        16|  Product_16|
|        18|  Product_18|
|        26|  Product_26|
|        34|  Product_34|
|        45|  Product_45|
|        47|  Product_47|
|        50|  Product_50|
|        49|  Product_49|
|        57|  Product_57|
|        63|  Product_63|
|        61|  Product_61|
|        67|  Product_67|
|        79|  Product_79|
|        74|  Product_74|
|        87|  Product_87|
|        81|  Product_81|
|        96|  Product_96|
|        90|  Product_90|
+----------+------------+



In [7]:
# All (product name, category name) pairs. Both joins are left joins, so a
# product with no links is still kept, with a null category_name.
(
    df_products
    .join(df_links, on='product_id', how='left')
    .join(df_categories, on='category_id', how='left')
    .select('product_name', 'category_name')
    .show()
)

+------------+-------------+
|product_name|category_name|
+------------+-------------+
|   Product_7|   Category_5|
|   Product_7|   Category_8|
|   Product_7|   Category_4|
|   Product_7|   Category_7|
|   Product_7|   Category_9|
|   Product_6|   Category_6|
|   Product_6|   Category_4|
|   Product_6|   Category_1|
|   Product_6|  Category_10|
|   Product_5|   Category_4|
|   Product_1|   Category_7|
|   Product_1|  Category_10|
|   Product_1|   Category_5|
|   Product_1|   Category_1|
|   Product_1|   Category_2|
|   Product_3|   Category_8|
|   Product_3|   Category_1|
|   Product_3|   Category_7|
|   Product_3|   Category_2|
|   Product_3|   Category_9|
+------------+-------------+
only showing top 20 rows


In [8]:
# Same product/category pairing as the unaliased query, starting from
# df_products aliased as 'p'. The select uses unqualified column names, so the
# alias is never referenced.
(
    df_products.alias('p')
    .join(df_links, on='product_id', how='left')
    .join(df_categories, on='category_id', how='left')
    .select('product_name', 'category_name')
    .show()
)

+------------+-------------+
|product_name|category_name|
+------------+-------------+
|   Product_7|   Category_5|
|   Product_7|   Category_8|
|   Product_7|   Category_4|
|   Product_7|   Category_7|
|   Product_7|   Category_9|
|   Product_6|   Category_6|
|   Product_6|   Category_4|
|   Product_6|   Category_1|
|   Product_6|  Category_10|
|   Product_5|   Category_4|
|   Product_1|   Category_7|
|   Product_1|  Category_10|
|   Product_1|   Category_5|
|   Product_1|   Category_1|
|   Product_1|   Category_2|
|   Product_3|   Category_8|
|   Product_3|   Category_1|
|   Product_3|   Category_7|
|   Product_3|   Category_2|
|   Product_3|   Category_9|
+------------+-------------+
only showing top 20 rows
