In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or attach to) a Spark session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Bare last expression so the notebook displays the Spark version string.
spark.version

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/14 10:51:23 WARN Utils: Your hostname, dmitrii-GH9, resolves to a loopback address: 127.0.1.1; using 192.168.31.45 instead (on interface wlo1)
25/12/14 10:51:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/14 10:51:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/14 10:51:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


'4.0.1'

In [4]:
from pyspark.sql import Row

# The `spark` session created earlier in the notebook is reused here.
# Calling SparkSession.builder.appName("ML-DataFrame").getOrCreate() again
# would hand back that same session, leave its app name unchanged and only
# log a "Using an existing Spark session" warning.

# Toy rows imitating an ML dataset: three feature columns plus a label.
data = [
    Row(age=25, height=1.65, income=50000, label=0),
    Row(age=30, height=1.80, income=75000, label=1),
    Row(age=35, height=1.75, income=60000, label=0),
]

# Column names come from the Row fields; column types are inferred by Spark.
df = spark.createDataFrame(data)
df.show()

25/12/13 20:09:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+---+------+------+-----+
|age|height|income|label|
+---+------+------+-----+
| 25|  1.65| 50000|    0|
| 30|   1.8| 75000|    1|
| 35|  1.75| 60000|    0|
+---+------+------+-----+



In [5]:
# Print the column names, inferred types and nullability of `df` as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)
 |-- income: long (nullable = true)
 |-- label: long (nullable = true)



In [6]:
# Project the frame down to the two columns of interest and display them.
df.select(["age", "income"]).show()

+---+------+
|age|income|
+---+------+
| 25| 50000|
| 30| 75000|
| 35| 60000|
+---+------+



In [7]:
from pyspark.sql.functions import col

# Keep only the rows whose age is strictly greater than 25
# (`where` is an alias of `filter`).
df.where(col("age") > 25).show()

+---+------+------+-----+
|age|height|income|label|
+---+------+------+-----+
| 30|   1.8| 75000|    1|
| 35|  1.75| 60000|    0|
+---+------+------+-----+



In [8]:
# Total income per label; the aggregated column is named "sum(income)".
df.groupBy("label").sum("income").show()

+-----+-----------+
|label|sum(income)|
+-----+-----------+
|    0|     110000|
|    1|      75000|
+-----+-----------+



In [9]:
# Show the rows sorted from the oldest to the youngest person.
df.orderBy("age", ascending=False).show()

+---+------+------+-----+
|age|height|income|label|
+---+------+------+-----+
| 35|  1.75| 60000|    0|
| 30|   1.8| 75000|    1|
| 25|  1.65| 50000|    0|
+---+------+------+-----+



In [10]:
# NOTE(review): the wildcard import shadows Python builtins such as sum/min/max.
# It is kept because later cells call rank()/dense_rank() unqualified; prefer
# explicit imports (or `from pyspark.sql import functions as F`) once every
# user of these names has been updated.
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# The `spark` session created earlier in the notebook is reused here.
# Calling SparkSession.builder.appName("windowFunctions").getOrCreate() again
# would hand back that same session, leave its app name unchanged and only
# log a "Using an existing Spark session" warning.

# Sample sales data: one tuple per (date, city, product) sale.
data = [
    ("2023-01-01", "Москва", "Ноутбук", 5, 50000),
    ("2023-01-01", "Москва", "Телефон", 10, 30000),
    # NOTE(review): the other "Ноутбук" row is priced 50000 - confirm that
    # 50080 below is intended and not a typo.
    ("2023-01-02", "Спб", "Ноутбук", 3, 50080),
    ("2023-01-02", "Спб", "Телефон", 7, 30000),
    ("2023-01-03", "Москва", "Планшет", 2, 40000),
    ("2023-01-03", "Спб", "Планшет", 4, 40000),
]

columns = ["date", "city", "product", "quantity", "price_per_unit"]

# Build the frame and add the revenue column (quantity * price_per_unit)
# in a single expression, so `sales_df` is never rebound from itself.
sales_df = spark.createDataFrame(data, columns).withColumn(
    "revenue", col("quantity") * col("price_per_unit")
)
sales_df.show()

25/12/13 20:09:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+------+-------+--------+--------------+-------+
|      date|  city|product|quantity|price_per_unit|revenue|
+----------+------+-------+--------+--------------+-------+
|2023-01-01|Москва|Ноутбук|       5|         50000| 250000|
|2023-01-01|Москва|Телефон|      10|         30000| 300000|
|2023-01-02|   Спб|Ноутбук|       3|         50080| 150240|
|2023-01-02|   Спб|Телефон|       7|         30000| 210000|
|2023-01-03|Москва|Планшет|       2|         40000|  80000|
|2023-01-03|   Спб|Планшет|       4|         40000| 160000|
+----------+------+-------+--------+--------------+-------+



In [11]:
# Window over each city's rows, ordered from highest to lowest revenue.
window_spec = (
    Window
    .partitionBy("city")
    .orderBy(col("revenue").desc())
)

In [12]:
# Rank each sale within its city by revenue, showing rank() and dense_rank()
# side by side.
(
    sales_df
    .withColumn("rank", rank().over(window_spec))
    .withColumn("dense_rank", dense_rank().over(window_spec))
    .show()
)

+----------+------+-------+--------+--------------+-------+----+----------+
|      date|  city|product|quantity|price_per_unit|revenue|rank|dense_rank|
+----------+------+-------+--------+--------------+-------+----+----------+
|2023-01-01|Москва|Телефон|      10|         30000| 300000|   1|         1|
|2023-01-01|Москва|Ноутбук|       5|         50000| 250000|   2|         2|
|2023-01-03|Москва|Планшет|       2|         40000|  80000|   3|         3|
|2023-01-02|   Спб|Телефон|       7|         30000| 210000|   1|         1|
|2023-01-03|   Спб|Планшет|       4|         40000| 160000|   2|         2|
|2023-01-02|   Спб|Ноутбук|       3|         50080| 150240|   3|         3|
+----------+------+-------+--------+--------------+-------+----+----------+

