# Сортировка продуктов по категориям в Pyspark

В PySpark приложении датафреймами(pyspark.sql.DataFrame) заданы продукты, категории и их связи. Каждому продукту может соответствовать несколько категорий или ни одной. А каждой категории может соответствовать несколько продуктов или ни одного. Напишите метод на PySpark, который в одном датафрейме вернет все пары «Имя продукта – Имя категории» и имена всех продуктов, у которых нет категорий. 

## Импортируем библиотеки

In [1]:
from pyspark.sql import SparkSession, DataFrame


In [2]:
spark = SparkSession.builder.appName("pyspark_ds").getOrCreate()

24/07/23 14:00:18 WARN Utils: Your hostname, benji-swiftsf314511 resolves to a loopback address: 127.0.1.1; using 192.168.0.15 instead (on interface wlp0s20f3)
24/07/23 14:00:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/23 14:00:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_category = [{"Category": 'A', "id": 1},
                 {"Category": 'B', "id": 2},
                 {"Category": 'C', "id": 3},
                 {"Category": 'E', "id": 4}
        ]

## Создаем Датафреймы

In [4]:
category_data = spark.createDataFrame([
    (1, "Category A"),
    (2, "Category B"),
    (3, "Category C"),
    (4, "Category D"),],
    ["id", "category_name"],
)


In [5]:
category_data

DataFrame[id: bigint, category_name: string]

In [6]:
category_data.head(4)

                                                                                

[Row(id=1, category_name='Category A'),
 Row(id=2, category_name='Category B'),
 Row(id=3, category_name='Category C'),
 Row(id=4, category_name='Category D')]

In [7]:
product_data = spark.createDataFrame([
    (1, "Product 1"),
    (2, "Product 2"),
    (3, "Product 3"),
    (4, "Product 4"),
    (5, "Product 5"),
    (6, "Product 6"),
    (7, "Product 7"),
    (8, "Product 8"),
    (9, "Product 9"),
    (10, "Product 10"), 
    (11, "Product 11"),
    (12, "Product 12"),
    (13, "Product 13"),
    (14, "Product 14"),
    (15, "Product 15"),],
    ["id", "product_name", ]
)

In [8]:
product_data

DataFrame[id: bigint, product_name: string]

In [9]:
product_data.head(5)

[Row(id=1, product_name='Product 1'),
 Row(id=2, product_name='Product 2'),
 Row(id=3, product_name='Product 3'),
 Row(id=4, product_name='Product 4'),
 Row(id=5, product_name='Product 5')]

## Создаем фрейм со связями

In [10]:
connections_data = spark.createDataFrame([
    (2, 1),
    (3, 3),
    (4, 7),
    (3, 4),
    (2, 4),
    (4, 5),
    (1, 6),
    (3, 7),
    (2, 8),
    (4, 2),
    (1, 8),
    (4, 9),
    (3, 7),
    (2, 12),
    (3, 15)],
    ["category_id", "product_id", ]
)

In [11]:
connections_data

DataFrame[category_id: bigint, product_id: bigint]

In [12]:
connections_data

DataFrame[category_id: bigint, product_id: bigint]

## Создадим результирующий фрейм

In [13]:
data_final = (product_data.join(connections_data, product_data.id == connections_data.product_id, how='left')
    .join(category_data, connections_data.category_id == category_data.id, how='left')
    .select(['category_name', 'product_name'])
)

In [14]:
data_final.orderBy("category_id", "product_id", ).show()

                                                                                

+-------------+------------+
|category_name|product_name|
+-------------+------------+
|         NULL|  Product 10|
|         NULL|  Product 11|
|         NULL|  Product 13|
|         NULL|  Product 14|
|   Category A|   Product 6|
|   Category A|   Product 8|
|   Category B|   Product 1|
|   Category B|   Product 4|
|   Category B|   Product 8|
|   Category B|  Product 12|
|   Category C|   Product 3|
|   Category C|   Product 4|
|   Category C|   Product 7|
|   Category C|   Product 7|
|   Category C|  Product 15|
|   Category D|   Product 2|
|   Category D|   Product 5|
|   Category D|   Product 7|
|   Category D|   Product 9|
+-------------+------------+

