# Imports

In [None]:
import os
import sys
from glob import glob

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark needs to be told which Python executable and which Java installation
# to use; it picks both up from environment variables.
os.environ.update({
    # Use the interpreter that is running this code for PySpark.
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    # Pin Java to the Homebrew OpenJDK 17 location (machine-specific path).
    'JAVA_HOME': '/opt/homebrew/opt/openjdk@17',
})

# Start Spark App

In [None]:
# Obtain the Spark session for this notebook, registered under the
# application name "netflix_analysis".
spark = (
    SparkSession.builder
    .appName("netflix_analysis")
    .getOrCreate()
)

# Read CSV file

In [103]:
# Load the Netflix titles CSV into a DataFrame.
netflix_df = (
    spark.read
    .option('header', True)     # the first line of the file holds the column names
    .option('multiLine', True)  # a quoted value may contain line breaks
    .option('quote', '"')       # character wrapping values that may contain the separator
    .option('escape', '"')      # character escaping a quote inside an already quoted value
    .csv('../data/netflix_titles.csv')
)

In [None]:
# Preview the first five rows of the loaded data.
netflix_df.show(n=5)

# Convert Column Types

In [104]:
# Replace two columns with properly typed versions.

# date_added: strip surrounding whitespace, then parse text in the
# "MMMM d, yyyy" pattern (full month name, day, four-digit year) into a date.
date_added_parsed = F.to_date(F.trim(F.col('date_added')), "MMMM d, yyyy")

# release_year: cast to an integer.
release_year_int = F.col('release_year').cast('integer')

netflix_df = netflix_df.withColumns({
    'date_added': date_added_parsed,
    'release_year': release_year_int,
})

In [None]:
# Print the schema of netflix_df (column names and their data types).
netflix_df.printSchema()

# Aggregations

## Count by type

In [None]:
# Count the rows for each value of the `type` column.
# NOTE(review): the result column is called 'movies_count' although it counts
# every group of `type`, not only movies.
rows_per_type = F.count('*').alias('movies_count')
netflix_df.groupBy('type').agg(rows_per_type).show()

# Filtering

## Added after 2020-01-01

In [None]:
# Keep only rows whose date_added is strictly later than 2020-01-01
# and preview the first five of them.
netflix_df.filter("date_added > '2020-01-01'").show(n=5)

## Count by type, added after 2020-01-01

In [105]:
# Restrict to rows added strictly after 2020-01-01, count rows per `type`,
# and display the counts ordered by `type`.
added_after_cutoff = netflix_df.filter("date_added > '2020-01-01'")
counts_by_type = added_after_cutoff.groupBy('type').agg(
    F.count('*').alias('movies_count')
)
counts_by_type.orderBy('type').show()

+-------+------------+
|   type|movies_count|
+-------+------------+
|  Movie|        2180|
|TV Show|        1087|
+-------+------------+

