In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Window

In [2]:
# 1. Создаем сессию Spark
spark = SparkSession.builder \
    .appName("UserSessions_Netflix") \
    .getOrCreate()

In [3]:
schema = StructType((
            StructField("title", StringType(), True),
            StructField("type", StringType(), True),
            StructField("genres", StringType(), True),
            StructField("releaseYear", IntegerType(), True),
            StructField("imdbId", StringType(), True),
            StructField("imdbAverageRating", DoubleType(), True),
            StructField("imdbNumVotes", IntegerType(), True),
            StructField("availableCountries", StringType(), True)
))

In [4]:
# Считываем со схемой
df = spark.read.option("header", True).csv("netflix_data.csv", schema= schema)

In [5]:
# Преобразуем типы со структурой array
df = df.withColumn("genres", F.split(F.col("genres"), ", "))
df = df.withColumn("availableCountries", F.split(F.col("availableCountries"), ", "))

In [6]:
df.show()

+--------------------+-----+--------------------+-----------+---------+-----------------+------------+------------------+
|               title| type|              genres|releaseYear|   imdbId|imdbAverageRating|imdbNumVotes|availableCountries|
+--------------------+-----+--------------------+-----------+---------+-----------------+------------+------------------+
|               Ariel|movie|[Comedy, Crime, D...|       1988|tt0094675|              7.4|        9240|              NULL|
| Shadows in Paradise|movie|[Comedy, Drama, M...|       1986|tt0092149|              7.4|        8074|              NULL|
|        Forrest Gump|movie|    [Drama, Romance]|       1994|tt0109830|              8.8|     2392180|              NULL|
|     American Beauty|movie|             [Drama]|       1999|tt0169547|              8.3|     1252895|              NULL|
|   The Fifth Element|movie|[Action, Adventur...|       1997|tt0119116|              7.6|      529085|              NULL|
|             Jarhead|mo

In [7]:
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- releaseYear: integer (nullable = true)
 |-- imdbId: string (nullable = true)
 |-- imdbAverageRating: double (nullable = true)
 |-- imdbNumVotes: integer (nullable = true)
 |-- availableCountries: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [8]:
# Статистика до чистки
print(f"Статистика до чистки:")
df.summary().show()

Статистика до чистки:
+-------+---------------------+-------------+------------------+---------+------------------+------------------+
|summary|                title|         type|       releaseYear|   imdbId| imdbAverageRating|      imdbNumVotes|
+-------+---------------------+-------------+------------------+---------+------------------+------------------+
|  count|                21183|        21910|             21870|    20275|             20096|             20096|
|   mean|             1263.387|         NULL|2013.3624142661179|   2024.0| 6.399945262738879| 32507.63350915605|
| stddev|    902.2972086431237|         NULL|14.319377116372104|     NULL|1.0900045754995984|122414.34841806859|
|    min| "'Tis Time for ""...| "" Princess"|              1913|     2024|               1.0|                 5|
|    25%|                100.0|         NULL|              2011|   2024.0|               5.7|               335|
|    50%|               1899.0|         NULL|              2018|   2024.0|

In [9]:
# Удаляем дубликаты
df = df.dropDuplicates(["title", "releaseYear", "imdbId"])

In [10]:
# Удаляем пропуски
df = df.dropna(subset=["imdbAverageRating","genres"])

In [11]:
print(f"Статистика после чистки:")
df.summary().show()

Статистика после чистки:
+-------+--------------------+-----+------------------+---------+-----------------+------------------+
|summary|               title| type|       releaseYear|   imdbId|imdbAverageRating|      imdbNumVotes|
+-------+--------------------+-----+------------------+---------+-----------------+------------------+
|  count|               20090|20090|             20090|    20090|            20090|             20090|
|   mean|   1238.296896551724| NULL|2012.9651567944252|     NULL|6.399552015928372| 32516.14225983076|
| stddev|   907.5554049560967| NULL|14.656896362088187|     NULL|1.089722188501761|122431.57097925746|
|    min|"Gabriel ""Fluffy...|movie|              1913|tt0003014|              1.0|                 5|
|    25%|               100.0| NULL|              2011|     NULL|              5.7|               335|
|    50%|              1899.0| NULL|              2018|     NULL|              6.5|              1626|
|    75%|              1991.0| NULL|            

In [12]:
# Вывод первых 10 строк
df.show(10, truncate=False)

+--------------------------------------------------------------+-----+------------------------------+-----------+----------+-----------------+------------+------------------+
|title                                                         |type |genres                        |releaseYear|imdbId    |imdbAverageRating|imdbNumVotes|availableCountries|
+--------------------------------------------------------------+-----+------------------------------+-----------+----------+-----------------+------------+------------------+
|"Gabriel ""Fluffy"" Iglesias: One Show Fits All"              |movie|[Comedy]                      |2019       |tt9426212 |7.3              |2285        |NULL              |
|"The Most Notorious ""Talker"" Runs the World's Greatest Clan"|tv   |[Action, Adventure, Animation]|2024       |tt32501772|7.3              |897         |NULL              |
|#AAY                                                          |movie|[Comedy]                      |2024       |tt28359277|6

In [13]:
# Анализ соотношения фильмов и сериалов по годам
(df.groupBy("releaseYear")
    .pivot("type")
    .agg(F.count("title").alias("total_titles"))
    .orderBy(F.col("releaseYear").desc())
    .limit(10)
    .show()
)

+-----------+-----+---+
|releaseYear|movie| tv|
+-----------+-----+---+
|       2025|  147|184|
|       2024|  844|494|
|       2023| 1176|511|
|       2022| 1271|574|
|       2021|  954|465|
|       2020|  874|409|
|       2019| 1010|389|
|       2018| 1022|348|
|       2017|  833|239|
|       2016|  680|178|
+-----------+-----+---+



In [14]:
# Анализ соотношения фильмов и сериалов по жанрам
(df.select(F.explode_outer(F.col("genres")).alias("genre"), "type")
    .groupBy("genre")
    .pivot("type")
    .agg(F.count("type").alias("count"))
    .orderBy((F.col("movie") + F.col("tv")).desc())
    .limit(15)
    .show()
)

+-----------+-----+----+
|      genre|movie|  tv|
+-----------+-----+----+
|      Drama| 7509|2208|
|     Comedy| 6034|1378|
|     Action| 2709| 761|
|      Crime| 2310| 773|
|    Romance| 2370| 662|
|  Adventure| 1772| 696|
|   Thriller| 1798| 298|
|Documentary| 1447| 540|
|  Animation|  982| 998|
|    Mystery| 1054| 384|
|     Horror| 1128| 125|
|     Family|  816| 249|
|    Fantasy|  698| 306|
|  Biography|  873| 126|
|    History|  525| 223|
+-----------+-----+----+



In [15]:
# Популярность жанров на IMDb
(df.select(F.explode_outer(F.col("genres")).alias("genre"), "imdbAverageRating", "imdbNumVotes")
    .groupBy("genre")
    .agg(F.round(F.avg("imdbAverageRating"),2).alias("average_rating"), F.sum("imdbNumVotes").alias("total_votes"))
    .orderBy(F.col("average_rating").desc())
    .limit(10)
    .show()
)

+------------------+--------------+-----------+
|             genre|average_rating|total_votes|
+------------------+--------------+-----------+
|Action & Adventure|           7.0|        222|
|         Biography|          6.93|   43144041|
|       Documentary|          6.93|    6721103|
|           History|          6.92|   17775960|
|         Animation|          6.79|   38227002|
|         Talk-Show|          6.76|     115327|
|               War|          6.72|   11575840|
|             Sport|          6.72|    9147143|
|             Music|          6.65|   11650484|
|              News|          6.64|      14014|
+------------------+--------------+-----------+



In [16]:
# Динамика выпуска контента по годам и рейтинг
(df.groupBy("releaseYear")
    .agg(F.count("*").alias("total_titles"), F.round(F.avg("imdbAverageRating"),2).alias("average_rating"))
    .orderBy(F.col("releaseYear").desc())
    .limit(10)
    .show()
    )

+-----------+------------+--------------+
|releaseYear|total_titles|average_rating|
+-----------+------------+--------------+
|       2025|         331|          6.44|
|       2024|        1338|          6.28|
|       2023|        1687|          6.27|
|       2022|        1845|          6.26|
|       2021|        1419|          6.33|
|       2020|        1283|          6.33|
|       2019|        1399|          6.42|
|       2018|        1370|          6.41|
|       2017|        1072|          6.46|
|       2016|         858|          6.52|
+-----------+------------+--------------+



In [17]:
# Анализ динамики прироста фильмов по годам
df_window = (df.groupBy("releaseYear")
.agg(F.count("*").alias("current_year_title_count"))
.withColumn("previous_year_title_count", F.lag("current_year_title_count").over(Window.orderBy("releaseYear")))
.orderBy(F.col("releaseYear").desc()))
df_window = df_window.withColumn("title_count_growth", F.col("current_year_title_count") - F.col("previous_year_title_count"))
df_window.orderBy(F.col("title_count_growth").desc()).limit(10).show()

+-----------+------------------------+-------------------------+------------------+
|releaseYear|current_year_title_count|previous_year_title_count|title_count_growth|
+-----------+------------------------+-------------------------+------------------+
|       2022|                    1845|                     1419|               426|
|       2018|                    1370|                     1072|               298|
|       2017|                    1072|                      858|               214|
|       2016|                     858|                      710|               148|
|       2021|                    1419|                     1283|               136|
|       2015|                     710|                      590|               120|
|       2013|                     513|                      404|               109|
|       2014|                     590|                      513|                77|
|       2003|                     214|                      170|            

In [18]:
# Поиск "Скрытых Жемчужин" (High Rating, Low Votes)
(df.filter((F.col("imdbNumVotes") > 300) & (F.col("imdbNumVotes") < 10000))
.orderBy(F.col("imdbAverageRating").desc()).show(10,truncate=False))

+-------------------+-----+---------------------------------+-----------+----------+-----------------+------------+------------------+
|title              |type |genres                           |releaseYear|imdbId    |imdbAverageRating|imdbNumVotes|availableCountries|
+-------------------+-----+---------------------------------+-----------+----------+-----------------+------------+------------------+
|Flavours of Romania|tv   |[Documentary]                    |2017       |tt13094256|9.4              |781         |NULL              |
|Khawatir           |tv   |[Documentary, Family, Reality-TV]|2005       |tt3062514 |9.1              |4500        |NULL              |
|Running Man        |tv   |[Comedy, Game-Show, Reality-TV]  |2010       |tt2185037 |9.1              |3912        |NULL              |
|My Dearest         |tv   |[Drama, History, Romance]        |2023       |tt26228190|8.9              |3367        |NULL              |
|Nirvana in Fire    |tv   |[Action, Drama, Fantasy]    