In [None]:
from pyspark import *;
from pyspark.sql import *;
from pyspark.sql.functions import *;
import pandas as pd;

In [None]:
# Create the Spark session for this notebook (or reuse one that already exists),
# registering it under the application name "Q6".
spark = (
    SparkSession.builder
    .appName("Q6")
    .getOrCreate()
)

In [None]:
# Read the raw CSV: the first row is used as the header and Spark infers the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("movie_dataset_uncleaned.csv")
)

# Print the inferred schema, then preview the first rows.
df.printSchema()
df.show()

root
 |-- MOVIES: string (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- GENRE: string (nullable = true)
 |-- RATING: double (nullable = true)
 |-- VOTES: integer (nullable = true)
 |-- RunTime: integer (nullable = true)
 |-- Gross: string (nullable = true)

+--------------------+-----------+--------------------+------+-------+-------+-----+
|              MOVIES|       YEAR|               GENRE|RATING|  VOTES|RunTime|Gross|
+--------------------+-----------+--------------------+------+-------+-------+-----+
|       Blood Red Sky|      -2021|Action$ Horror$ T...|   6.1|  21062|    121| NULL|
|Masters of the Un...|   (2021– )|Animation$ Action...|   5.0|  17870|     25| NULL|
|    The Walking Dead|(2010–2022)|Drama$ Horror$ Th...|   8.2| 885805|     44| NULL|
|      Rick and Morty|   (2013– )|Animation$ Advent...|   9.2| 414849|     23| NULL|
|     Army of Thieves|      -2021|Action$ Crime$ Ho...|  NULL|   NULL|   NULL| NULL|
|         Outer Banks|   (2020– )|Action$ Crime$ D

In [None]:
# For every column, count the rows whose value is NULL or NaN and show the
# per-column totals as a single row.
missing_counts = []
for name in df.columns:
    is_missing = isnull(name) | isnan(name)
    missing_counts.append(count(when(is_missing, name)).alias(name))

df.select(missing_counts).show()

+------+----+-----+------+-----+-------+-----+
|MOVIES|YEAR|GENRE|RATING|VOTES|RunTime|Gross|
+------+----+-----+------+-----+-------+-----+
|     0|  13|   94|    64|   64|    103| 1311|
+------+----+-----+------+-----+-------+-----+



In [None]:
# Replace missing RATING values with the mean of the RATING column.
rating_mean = df.select(mean(col("RATING"))).first()[0]
df = df.na.fill({"RATING": rating_mean})

In [None]:
# Replace missing VOTES values with the median of the VOTES column.
votes_median = df.select(median(col("VOTES"))).first()[0]
df = df.na.fill({"VOTES": votes_median})

In [None]:
# Fill the remaining missing values with fixed placeholders:
# a zero runtime, an 'Unknown' genre and a '$0.00M' gross.
df = df.na.fill({
    "RunTime": 0,
    'Genre': 'Unknown',
    'Gross': '$0.00M',
})

In [None]:
# Keep only the rows whose rating lies between 1 and 10 (both bounds inclusive),
# then preview the result.
rating_in_range = col("Rating").between(1, 10)
df = df.where(rating_in_range)
df.show()

+--------------------+-----------+--------------------+------+-------+-------+------+
|              MOVIES|       YEAR|               GENRE|RATING|  VOTES|RunTime| Gross|
+--------------------+-----------+--------------------+------+-------+-------+------+
|       Blood Red Sky|      -2021|Action$ Horror$ T...|   6.1|  21062|    121|$0.00M|
|Masters of the Un...|   (2021– )|Animation$ Action...|   5.0|  17870|     25|$0.00M|
|    The Walking Dead|(2010–2022)|Drama$ Horror$ Th...|   8.2| 885805|     44|$0.00M|
|      Rick and Morty|   (2013– )|Animation$ Advent...|   9.2| 414849|     23|$0.00M|
|         Outer Banks|   (2020– )|Action$ Crime$ Drama|   7.6|  25858|     50|$0.00M|
|The Last Letter f...|      -2021|      Drama$ Romance|   6.8|   5283|    110|$0.00M|
|              Dexter|(2006–2013)|Crime$ Drama$ Mys...|   8.6| 665387|     53|$0.00M|
|   Never Have I Ever|   (2020– )|              Comedy|   7.9|  34530|     30|$0.00M|
|        Virgin River|   (2019– )|             Unknown

In [None]:
# Discard the rows that have no YEAR value.
has_year = col("YEAR").isNotNull()
df = df.where(has_year)

In [None]:
# YEAR values of the form "-2021" (a dash followed by exactly four digits) carry one year.
# The pattern is a raw string so that "\d" reaches the regex engine unchanged; in a
# normal string literal Python treats "\d" as an invalid escape sequence and warns.
single_year_pattern = r"^-(\d{4})$"

# End_Year is that year for matching rows and NULL for every other row.
df = df.withColumn(
    "End_Year",
    when(
        col("Year").rlike(single_year_pattern),
        regexp_extract(col("Year"), single_year_pattern, 1).cast("int"),
    ),
)

# For the rows that received an End_Year, Start_Year is set five years earlier;
# the remaining rows stay NULL here.
df = df.withColumn("Start_Year", when(col("End_Year").isNotNull(), col("End_Year") - 5))
df.show()


+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+
|              MOVIES|       YEAR|               GENRE|RATING|  VOTES|RunTime| Gross|End_Year|Start_Year|
+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+
|       Blood Red Sky|      -2021|Action$ Horror$ T...|   6.1|  21062|    121|$0.00M|    2021|      2016|
|Masters of the Un...|   (2021– )|Animation$ Action...|   5.0|  17870|     25|$0.00M|    NULL|      NULL|
|    The Walking Dead|(2010–2022)|Drama$ Horror$ Th...|   8.2| 885805|     44|$0.00M|    NULL|      NULL|
|      Rick and Morty|   (2013– )|Animation$ Advent...|   9.2| 414849|     23|$0.00M|    NULL|      NULL|
|         Outer Banks|   (2020– )|Action$ Crime$ Drama|   7.6|  25858|     50|$0.00M|    NULL|      NULL|
|The Last Letter f...|      -2021|      Drama$ Romance|   6.8|   5283|    110|$0.00M|    2021|      2016|
|              Dexter|(2006–2013)|Crime$ Drama

In [None]:
# Fill the years that are still NULL from range-style YEAR values such as
# "(2010–2022)" or "(2013– )".

# Start_Year: the first four-digit number found in the string.
df = df.withColumn(
    "Start_Year",
    when(
        col("Start_Year").isNull(),
        regexp_extract(col("Year"), r"(\d{4})", 1).cast("int"),
    ).otherwise(col("Start_Year")),
)

# End_Year: range values end with ")", so a pattern anchored directly on the last
# digit (r"(\d{4})$") never matches "(2010–2022)" and the real end year is lost.
# Look for a four-digit year that follows a dash (en dash or hyphen) and is closed
# by ")" first; if that finds nothing, fall back to a bare four-digit suffix.
# Open-ended ranges such as "(2013– )" match neither pattern and stay NULL.
# regexp_extract returns an empty string when the pattern does not match.
range_end = regexp_extract(col("Year"), r"[–-]\s*(\d{4})\s*\)\s*$", 1)
bare_end = regexp_extract(col("Year"), r"(\d{4})$", 1)
parsed_end = when(range_end != "", range_end).otherwise(bare_end).cast("int")

df = df.withColumn(
    "End_Year",
    when(col("End_Year").isNull(), parsed_end).otherwise(col("End_Year")),
)

In [None]:
# Fill whichever year is still missing from the other one, using a five-year span.
# Order matters: End_Year is derived from the Start_Year produced on the line before it.
start_or_default = coalesce(col("Start_Year"), col("End_Year") - 5)
df = df.withColumn("Start_Year", start_or_default)

end_or_default = coalesce(col("End_Year"), col("Start_Year") + 5)
df = df.withColumn("End_Year", end_or_default)

In [None]:
# Preview the first 20 rows (long cell values truncated) to check the derived year columns.
df.show(n=20, truncate=True)

+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+
|              MOVIES|       YEAR|               GENRE|RATING|  VOTES|RunTime| Gross|End_Year|Start_Year|
+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+
|       Blood Red Sky|      -2021|Action$ Horror$ T...|   6.1|  21062|    121|$0.00M|    2021|      2016|
|Masters of the Un...|   (2021– )|Animation$ Action...|   5.0|  17870|     25|$0.00M|    2026|      2021|
|    The Walking Dead|(2010–2022)|Drama$ Horror$ Th...|   8.2| 885805|     44|$0.00M|    2015|      2010|
|      Rick and Morty|   (2013– )|Animation$ Advent...|   9.2| 414849|     23|$0.00M|    2018|      2013|
|         Outer Banks|   (2020– )|Action$ Crime$ Drama|   7.6|  25858|     50|$0.00M|    2025|      2020|
|The Last Letter f...|      -2021|      Drama$ Romance|   6.8|   5283|    110|$0.00M|    2021|      2016|
|              Dexter|(2006–2013)|Crime$ Drama

In [None]:
# Remove the rows where either derived year column is still NULL.
year_columns = ["Start_Year", "End_Year"]
df = df.na.drop(subset=year_columns)

In [None]:
# Final_Genre is the first "$"-separated entry of Genre.
# The separator is a raw string so that "\$" reaches the regex engine as an escaped
# literal dollar sign; in a normal string literal Python treats "\$" as an invalid
# escape sequence and warns.
first_genre = split(col("Genre"), r"\$")[0]

# If no genre can be extracted, fall back to "Unknown" -- the same label that is used
# when missing Genre values are filled, so the data never carries two spellings
# ("Unknown" / "unknown") of the same placeholder.
df = df.withColumn("Final_Genre", coalesce(first_genre, lit("Unknown")))
df.show()

+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+-----------+
|              MOVIES|       YEAR|               GENRE|RATING|  VOTES|RunTime| Gross|End_Year|Start_Year|Final_Genre|
+--------------------+-----------+--------------------+------+-------+-------+------+--------+----------+-----------+
|       Blood Red Sky|      -2021|Action$ Horror$ T...|   6.1|  21062|    121|$0.00M|    2021|      2016|     Action|
|Masters of the Un...|   (2021– )|Animation$ Action...|   5.0|  17870|     25|$0.00M|    2026|      2021|  Animation|
|    The Walking Dead|(2010–2022)|Drama$ Horror$ Th...|   8.2| 885805|     44|$0.00M|    2015|      2010|      Drama|
|      Rick and Morty|   (2013– )|Animation$ Advent...|   9.2| 414849|     23|$0.00M|    2018|      2013|  Animation|
|         Outer Banks|   (2020– )|Action$ Crime$ Drama|   7.6|  25858|     50|$0.00M|    2025|      2020|     Action|
|The Last Letter f...|      -2021|      Drama$ Romance| 

In [None]:
# Collect the cleaned Spark DataFrame into a pandas DataFrame (rebinding `df`)
# and write it to "cleaned_6.csv".
df = df.toPandas()
# index=False: without it pandas writes its row index as an extra, unnamed first
# column that is not part of the cleaned data.
df.to_csv("cleaned_6.csv", index=False)

In [None]:
# Stop the Spark session now that the cleaned data has been exported.
spark.stop()