In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# 1. Создаем сессию Spark
spark = SparkSession.builder \
    .appName("StarWars") \
    .getOrCreate()

In [3]:
# 2. Загружаем данные из Parquet файла
characters = spark.read.parquet('parquet_files/characters.parquet')
species = spark.read.parquet('parquet_files/species.parquet')
organizations = spark.read.parquet('parquet_files/organizations.parquet')

In [6]:
# Выводим первые 5 строк таблицы species
species.show(5)

+---+--------------+--------------+-----------+--------------+--------------------+-----------+----------+----------------+--------------+---------+
| id|          name|classification|designation|average_height|         skin_colors|hair_colors|eye_colors|average_lifespan|      language|homeworld|
+---+--------------+--------------+-----------+--------------+--------------------+-----------+----------+----------------+--------------+---------+
|  1|         Human|        Mammal|   Sentient|           1.8|         Light, Dark|    Various|   Various|            79.0|Galactic Basic|  Various|
|  2|Yoda's species|       Unknown|   Sentient|          0.66|               Green|      White|     Brown|           900.0|Galactic Basic|  Unknown|
|  3|       Wookiee|        Mammal|   Sentient|          2.28|               Brown|      Brown|      Blue|           400.0|    Shyriiwook| Kashyyyk|
|  4|        Gungan|     Amphibian|   Sentient|          1.96|              Orange|       None|    Orange|

In [7]:
# Выводим первые 5 строк таблицы characters
characters.show(5)

+---+--------------+--------------+------+------+------+----------+---------+----------+---------+---------+---------+--------------------+
| id|          name|       species|gender|height|weight|hair_color|eye_color|skin_color|year_born|homeworld|year_died|         description|
+---+--------------+--------------+------+------+------+----------+---------+----------+---------+---------+---------+--------------------+
|  1|Luke Skywalker|         Human|  Male|  1.72|  77.0|     Blond|     Blue|     Light|     19.0| Tatooine|     34.0|The main protagon...|
|  2|   Leia Organa|         Human|Female|   1.5|  49.0|     Brown|    Brown|     Light|     19.0| Alderaan|     35.0|A leader in the R...|
|  3|   Darth Vader|         Human|  Male|  2.02| 136.0|      None|   Yellow|      Pale|     41.0| Tatooine|      4.0|The Sith Lord for...|
|  4|          Yoda|Yoda's species|  Male|  0.66|  17.0|     White|    Brown|     Green|    896.0|  Unknown|      4.0|A wise and powerf...|
|  5|      Han Solo|

In [8]:
# Выводим первые 5 строк таблицы organizations
organizations.show(5)

+---+-----------------+-------+---------+--------------------+--------------------+-----------------+--------------------+--------------------+
| id|             name|founded|dissolved|              leader|             members|      affiliation|         description|               films|
+---+-----------------+-------+---------+--------------------+--------------------+-----------------+--------------------+--------------------+
|  1|       Jedi Order| -25000|       19|    Yoda, Mace Windu|Obi-Wan Kenobi, A...|Galactic Republic|An ancient monast...|The Phantom Menac...|
|  2|       Sith Order|  -5000|        4|       Darth Sidious|Darth Vader, Dart...|  Galactic Empire|A sect of Force-s...|The Phantom Menac...|
|  3|  Galactic Empire|     19|        4|   Emperor Palpatine|Darth Vader, Gran...|             None|An authoritarian ...|Revenge of the Si...|
|  4|   Rebel Alliance|     -2|        4|Mon Mothma, Leia ...|Luke Skywalker, H...|     New Republic|An alliance of in...|A New Hope, Th

In [9]:
# Удаляем дубликаты 
characters = characters.dropDuplicates(subset=["name", "species", "homeworld", "year_born"])
species = species.dropDuplicates(subset=["name","classification", "designation", "average_height"])

In [10]:
# Распространенность видов и их классификаций
(characters.groupBy("species")
    .agg(F.count("*").alias("character_count"))
    .orderBy(F.col("character_count").desc())
    .limit(10).show())

+------------+---------------+
|     species|character_count|
+------------+---------------+
|       Human|             47|
|       Droid|              4|
|     Twi'lek|              3|
| Dathomirian|              3|
|     Togruta|              2|
|     Unknown|              2|
|Mon Calamari|              2|
|    Clawdite|              1|
|       Chiss|              1|
|    Besalisk|              1|
+------------+---------------+



In [11]:
# Распространенность классификаций
(species.join(characters, species.name == characters.species, "left") 
    .groupBy(species.classification).agg(F.count("*").alias("total_character_count")) 
    .orderBy(F.col("total_character_count").desc())
    .show())

+--------------+---------------------+
|classification|total_character_count|
+--------------+---------------------+
|        Mammal|                   73|
|     Amphibian|                    6|
|     Reptilian|                    5|
|    Artificial|                    4|
|        Hybrid|                    3|
|       Unknown|                    1|
|     Insectoid|                    1|
|     Gastropod|                    1|
+--------------+---------------------+



In [12]:
# Средний рост по классификации
(species.groupBy("classification")
    .agg(F.round(F.avg("average_height"),1).alias("average_height_class"))
    .orderBy(F.col("average_height_class").desc())
    .show())

+--------------+--------------------+
|classification|average_height_class|
+--------------+--------------------+
|     Gastropod|                 3.9|
|     Amphibian|                 1.9|
|     Insectoid|                 1.8|
|     Reptilian|                 1.8|
|        Mammal|                 1.7|
|        Hybrid|                 1.7|
|       Unknown|                 0.7|
|    Artificial|                NULL|
+--------------+--------------------+



In [13]:
# Члены Ордена Джедаев и Ситхов
# Преобразуем тип колонки
organizations = organizations.withColumn(
    "leader_array", 
    F.split(F.col("leader"), ", ").cast(ArrayType(StringType()))
)
organizations = organizations.withColumn(
    "members_array", 
    F.split(F.col("members"), ", ").cast(ArrayType(StringType()))
)

members = (organizations
           .select("name", F.explode_outer("leader_array").alias("name_member"))
           .filter(F.col("name").isin("Jedi Order", "Sith Order"))
    .union
           (organizations
            .select("name", F.explode_outer("members_array").alias("name_member"))
            .filter(F.col("name").isin("Jedi Order", "Sith Order")))
            )

members.orderBy("name").show()

+----------+----------------+
|      name|     name_member|
+----------+----------------+
|Jedi Order|            Yoda|
|Jedi Order|  Obi-Wan Kenobi|
|Jedi Order|      Mace Windu|
|Jedi Order|Anakin Skywalker|
|Jedi Order|  Luke Skywalker|
|Sith Order|   Darth Sidious|
|Sith Order|     Darth Vader|
|Sith Order|      Darth Maul|
|Sith Order|   Darth Tyranus|
+----------+----------------+



In [14]:
# Топ-5 старейших и топ-5 самых юных персонажей
(characters.select("name", "year_born")
    .dropna(subset=["year_born"])
    .orderBy(F.col("year_born")).limit(5)
    .show())
(characters.select("name", "year_born")
    .dropna(subset=["year_born"])
    .orderBy(F.col("year_born").desc()).limit(5)
    .show())

+-----------+---------+
|       name|year_born|
+-----------+---------+
|General Hux|      0.0|
|Poe Dameron|      2.0|
|   Kylo Ren|      5.0|
|  Rose Tico|     11.0|
|       Finn|     11.0|
+-----------+---------+

+--------------+---------+
|          name|year_born|
+--------------+---------+
|    Maz Kanata|    973.0|
|          Yoda|    896.0|
|Jabba the Hutt|    600.0|
|     Chewbacca|    200.0|
|         C-3PO|    112.0|
+--------------+---------+



In [15]:
# Расчитываем ИМТ
bmi_characters = characters.withColumn("bmi", F.round(F.col("weight")/ (F.col("height"))**2,2))
bmi_characters.orderBy(F.col("bmi").desc()).limit(5).show()
bmi_characters.dropna(subset=["height","weight"]).orderBy(F.col("bmi")).limit(5).show()

+---+--------------+--------------+------+------+------+----------+---------+----------+---------+---------+---------+--------------------+-----+
| id|          name|       species|gender|height|weight|hair_color|eye_color|skin_color|year_born|homeworld|year_died|         description|  bmi|
+---+--------------+--------------+------+------+------+----------+---------+----------+---------+---------+---------+--------------------+-----+
| 42|Jabba the Hutt|          Hutt|  Male|   3.9|1358.0|      None|   Orange|     Green|    600.0|Nal Hutta|      4.0|A notorious crime...|89.28|
| 62|    Pong Krell|      Besalisk|  Male|   2.5| 300.0|      None|   Yellow|     Green|     52.0|     Ojom|     19.0|A Jedi Master who...| 48.0|
|  4|          Yoda|Yoda's species|  Male|  0.66|  17.0|     White|    Brown|     Green|    896.0|  Unknown|      4.0|A wise and powerf...|39.03|
|  3|   Darth Vader|         Human|  Male|  2.02| 136.0|      None|   Yellow|      Pale|     41.0| Tatooine|      4.0|The Si