In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import max, avg, min, count, concat_ws, sum
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc
from pyspark.sql.functions import when, datediff, current_date

In [2]:
spark = (
    SparkSession.builder
    .appName("Pagila_spark")
    .master("local[*]")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.4")
    .getOrCreate()
)

In [3]:
spark

In [4]:
url = "jdbc:postgresql://localhost:5432/postgres"
properties = {
    "user": "postgres", 
    "password": "123456", 
    "driver": "org.postgresql.Driver"
}

In [5]:
def read_table(url, name, properties):
    df = spark.read.jdbc(
        url=url,
        table=name,
        properties=properties
    )
    return df

In [6]:
tables_names = "(select table_name from information_schema.tables where table_schema='public'and table_type='BASE TABLE') as names"
df_tables_names = read_table(url, tables_names, properties)
df_tables_names.show()

+----------------+
|      table_name|
+----------------+
|      film_actor|
|         address|
|            city|
|           actor|
|        category|
|         country|
|        customer|
|            film|
|   film_category|
|       inventory|
|        language|
|          rental|
|         payment|
|           store|
|payment_p2022_02|
|payment_p2022_07|
|payment_p2022_04|
|payment_p2022_05|
|           staff|
|payment_p2022_01|
+----------------+
only showing top 20 rows



In [7]:
df_film_actor = read_table(url, "film_actor", properties)
df_address = read_table(url, "address", properties)
df_city = read_table(url, "city", properties)
df_actor = read_table(url, "actor", properties)
df_category = read_table(url, "category", properties)
df_country = read_table(url, "country", properties)
df_customer = read_table(url, "customer", properties)
df_film = read_table(url, "film", properties)
df_film_category = read_table(url, "film_category", properties)
df_inventory = read_table(url, "inventory", properties)
df_language = read_table(url, "language", properties)
df_rental = read_table(url, "rental", properties)
df_payment = read_table(url, "payment", properties)
df_store = read_table(url, "store", properties)
df_staff = read_table(url, "staff", properties)

## Вывести количество фильмов в каждой категории, отсортировать по убыванию

In [8]:
df_res_q1 = df_category.join(df_film_category, df_category["category_id"] == df_film_category["category_id"])
df_res_q1 = df_res_q1.groupBy("name").agg(count("film_id").alias("films_count")).orderBy(desc("films_count"))
df_res_q1.show()

+-----------+-----------+
|       name|films_count|
+-----------+-----------+
|      Drama|        152|
|      Music|        152|
|     Travel|        151|
|    Foreign|        150|
|      Games|        150|
|   Children|        150|
|     Action|        149|
|     Sci-Fi|        149|
|  Animation|        148|
|     Family|        147|
|   Classics|        147|
|        New|        147|
|     Sports|        145|
|Documentary|        145|
|     Comedy|        143|
|     Horror|        142|
+-----------+-----------+



## Вывести 10 актеров, чьи фильмы большего всего арендовали, отсортировать по убыванию

In [19]:
df_res_q2 = (
    df_actor.join(df_film_actor, on=["actor_id"])
    .join(df_inventory, df_film_actor["film_id"] == df_inventory["film_id"])
    .join(df_rental, df_inventory["inventory_id"] == df_rental["inventory_id"])
)
  
df_res_q2 = (
    df_res_q2.groupBy(col("actor_id"), concat_ws(" ", col("first_name"), col("last_name")).alias("name"))
    .agg(count("rental_id").alias("rental_count"))
    .orderBy(desc("rental_count"))
    .limit(10)
) 

df_res_q2.show()

+--------+------------------+------------+
|actor_id|              name|rental_count|
+--------+------------------+------------+
|     107|    GINA DEGENERES|         753|
|     181|    MATTHEW CARREY|         678|
|     198|       MARY KEITEL|         674|
|     144|ANGELA WITHERSPOON|         654|
|     102|       WALTER TORN|         640|
|      60|       HENRY BERRY|         612|
|     150|       JAYNE NOLTE|         611|
|      37|        VAL BOLGER|         605|
|      23|     SANDRA KILMER|         604|
|      90|      SEAN GUINESS|         599|
+--------+------------------+------------+



## Вывести категорию фильмов, на которую потратили больше всего денег

In [10]:
df_res_q3 = (
    df_category.join(df_film_category, df_category["category_id"] == df_film_category["category_id"])
    .join(df_inventory, df_film_category["film_id"] == df_inventory["film_id"])
    .join(df_rental, df_inventory["inventory_id"] == df_rental["inventory_id"])
    .join(df_payment, df_rental["rental_id"] == df_payment["rental_id"])
)
df_res_q3 = (
    df_res_q3.groupBy("name")
    .agg(sum("amount").alias("total_cost"))
    .orderBy(desc("total_cost"))
    .limit(1)
)


In [11]:
df_res_q3.show()

+-------+----------+
|   name|total_cost|
+-------+----------+
|Foreign|  10507.67|
+-------+----------+



## Вывести названия фильмов, которых нет в inventory. Написать запрос без использования оператора IN

In [12]:
df_res_q4 = df_film.join(df_inventory, df_film["film_id"] == df_inventory["film_id"], "left")
df_res_q4 = df_res_q4.select("title").filter(col("inventory_id").isNull())

In [13]:
df_res_q4.show()

+--------------------+
|               title|
+--------------------+
|      CHOCOLATE DUCK|
|       BUTCH PANTHER|
|        VOLUME HOUSE|
|      ORDER BETRAYED|
|        TADPOLE PARK|
|    KILL BROTHERHOOD|
|FRANKENSTEIN STRA...|
|    CROSSING DIVORCE|
|    SUICIDES SILENCE|
|       CATCH AMISTAD|
|     PERDITION FARGO|
|       FLOATS GARDEN|
|           GUMP DATE|
|        WALLS ARTIST|
|  GLADIATOR WESTWARD|
|         HOCUS FRIDA|
|ARSENIC INDEPENDENCE|
|         MUPPET MILE|
|   FIREHOUSE VIETNAM|
|       ROOF CHAMPION|
+--------------------+
only showing top 20 rows



In [14]:
df_res_q4.count()

42

## Вывести топ 3 актеров, которые больше всего появлялись в фильмах в категории “Children”. Если у нескольких актеров одинаковое кол-во фильмов, вывести всех

In [15]:
df_res_q5 = (
    df_actor.alias("a")
    .join(df_film_actor.alias("fa"), col("a.actor_id") == col("fa.actor_id"))
    .join(df_film_category.alias("fc"), col("fa.film_id") == col("fc.film_id"))
    .join(df_category.alias("c"), col("fc.category_id") == col("c.category_id"))
)

df_res_q5 = (
    df_res_q5
    .filter(col("c.name") == "Children")  
    .groupBy(concat_ws(" ", col("a.first_name"), col("a.last_name")).alias("actor_name"))  
    .agg(count(col("fc.film_id")).alias("films_count"))
    .orderBy(desc("films_count"))
    .limit(3)
)

In [16]:
df_res_q5.show()

+------------+-----------+
|  actor_name|films_count|
+------------+-----------+
|SIDNEY CROWE|          9|
|RICHARD PENN|          9|
|EWAN GOODING|          9|
+------------+-----------+



## Вывести города с количеством активных и неактивных клиентов (активный — customer.active = 1). Отсортировать по количеству неактивных клиентов по убыванию

In [17]:
df_res_q6 = (
    df_city.join(df_address, df_city["city_id"] == df_address["city_id"])
    .join(df_customer, df_address["address_id"] == df_customer["address_id"])
)

df_res_q6 = (
    df_res_q6.groupBy("city")
    .agg(
        sum(when(col("active") == 1, 1).otherwise(0)).alias("active_clients"),
        sum(when(col("active") == 0, 1).otherwise(0)).alias("inactive_clients")
    )
    .orderBy(col("inactive_clients").desc())
)

In [18]:
df_res_q6.show()

+------------------+--------------+----------------+
|              city|active_clients|inactive_clients|
+------------------+--------------+----------------+
|          Uluberia|             0|               1|
|         Najafabad|             0|               1|
|         Pingxiang|             0|               1|
|          Xiangfan|             0|               1|
|        Kumbakonam|             0|               1|
|       Szkesfehrvr|             0|               1|
|  Charlotte Amalie|             0|               1|
|            Kamyin|             0|               1|
|            Daxian|             0|               1|
|     Coatzacoalcos|             0|               1|
|           Wroclaw|             0|               1|
|            Ktahya|             0|               1|
|           Bat Yam|             0|               1|
|   Southend-on-Sea|             0|               1|
|            Amroha|             0|               1|
|A Corua (La Corua)|             1|           

## Вывести категорию фильмов, у которой самое большое кол-во часов суммарной аренды в городах (customer.address_id в этом city), и которые начинаются на букву “a”. Тоже самое сделать для городов в которых есть символ “-”