In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count

# Create (or reuse) the SparkSession for the DataFrame API example.
spark = SparkSession.builder.appName("DataFrameAPIExample").getOrCreate()

# Sample employee records; each dict becomes one row of the DataFrame.
data = [
  {"name": "John", "age": 30, "department": "HR"},
  {"name": "Doe", "age": 25, "department": "Finance"},
  {"name": "Jane", "age": 35, "department": "HR"},
  {"name": "Mark", "age": 40, "department": "Finance"},
  {"name": "Smith", "age": 23, "department": "Engineering"}
]

df = spark.createDataFrame(data)

# Filtering: keep only the rows whose age is strictly greater than 30.
filtered_df = df.filter(col("age") > 30)

# Grouping and aggregation: average age and number of names per department.
# Explicit aliased aggregates are used instead of a {column: function} dict so
# the result does not depend on Spark's auto-generated column names
# ("avg(age)", "count(name)") and the columns appear in the order written here.
grouped_df = df.groupBy("department").agg(
    avg("age").alias("avg_age"),
    count("name").alias("count"),
)

# Sorting: largest groups first. Several departments share the same count in
# the sample data, so the department name is used as a tie-breaker to make the
# row order reproducible between runs.
sorted_df = grouped_df.orderBy(col("count").desc(), col("department").asc())

# Display the results (currently disabled).
# filtered_df.show()
# sorted_df.show()

# Save the resulting DataFrame as CSV (currently disabled).
# mode("overwrite") lets the write be repeated when the path already exists.
# sorted_df.write.mode("overwrite").csv("config/output_2.csv", header=True)

spark.stop()

In [8]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for the SQL API example.
spark = SparkSession.builder.appName("SQLAPIExample").getOrCreate()

# Sample people rows; department_id is matched against "id" in
# data_departments by the JOIN below.
data_people = [
  {"name": "John", "age": 30, "department_id": 1},
  {"name": "Doe", "age": 25, "department_id": 2},
  {"name": "Jane", "age": 35, "department_id": 1},
  {"name": "Mark", "age": 40, "department_id": 2},
  {"name": "Smith", "age": 23, "department_id": 3}
]

# Department lookup rows.
data_departments = [
  {"id": 1, "department_name": "HR"},
  {"id": 2, "department_name": "Finance"},
  {"id": 3, "department_name": "Engineering"}
]

# Build the DataFrames from the in-memory lists above (no files are read here).
people_df = spark.createDataFrame(data_people)
departments_df = spark.createDataFrame(data_departments)

# Register the DataFrames as temporary views so the SQL query can refer to
# them by name.
people_df.createOrReplaceTempView("people")
departments_df.createOrReplaceTempView("departments")

# Run the JOIN with SQL: attach the department name to every person.
join_df = spark.sql("""
SELECT p.name, p.age, d.department_name
FROM people p
JOIN departments d
ON p.department_id = d.id
""")

# Display the results.
join_df.show()

# Save the resulting DataFrame as CSV. Spark writes a directory of part files
# at this path, and the default save mode raises an error when the path already
# exists; mode("overwrite") keeps this cell re-runnable (Restart & Run All).
join_df.write.mode("overwrite").csv("config/output_3.csv", header=True)

spark.stop()

                                                                                

+-----+---+---------------+
| name|age|department_name|
+-----+---+---------------+
| John| 30|             HR|
| Jane| 35|             HR|
|  Doe| 25|        Finance|
| Mark| 40|        Finance|
|Smith| 23|    Engineering|
+-----+---+---------------+



                                                                                