A DataFrame in Spark is a distributed collection of data organized into named columns. It is similar to a table in a relational database but is immutable (i.e., once created, it cannot be modified).

## Tempview
`createOrReplaceTempView` is a method in Apache Spark that creates a temporary view from a DataFrame. The view can then be queried using SQL syntax within the same Spark session.

In [None]:
# Register the DataFrame under the SQL view name "disney_shows_view".
# NOTE(review): `raw_data` is not defined in the cells visible here (the CSV is
# loaded into `disney_raw` further down) — confirm where it comes from.
raw_data.createOrReplaceTempView("disney_shows_view")

In [None]:
# Preview ten rows of a few columns through the temp view.
preview_sql = '''SELECT title,
year,
imdb_rating,
actors
FROM disney_shows_view
limit 10'''
spark.sql(preview_sql).show()


In [None]:
# Ten rows whose genre mentions Comedy, ordered by imdb_rating descending.
top_comedy_sql = '''SELECT title,
year,
imdb_rating,
actors,
genre
FROM disney_shows_view
WHERE genre LIKE '%Comedy%'
ORDER BY imdb_rating DESC
limit 10'''
spark.sql(top_comedy_sql).show()

In [None]:
# 1. Write a query to get the average runtime of the movies.
# 2. Get the top 5 movies released in 2015.
# 3. Provide the names of the countries and the movies
#    they have produced.
# 4. Examine the relationship between genre and runtime.
# 5. Find the most common language.

In [None]:
# Registers the same view name as the earlier cell; createOrReplace replaces
# the existing definition, so running this again is harmless.
raw_data.createOrReplaceTempView("disney_shows_view")

In [None]:
# All rows whose genre mentions Comedy, ordered by imdb_rating descending
# (no row limit), kept as a DataFrame for later use.
comedy_sql = '''SELECT title,
year,
imdb_rating,
actors,
genre
FROM disney_shows_view
WHERE genre LIKE '%Comedy%'
ORDER BY imdb_rating DESC'''
df_comedy = spark.sql(comedy_sql)

In [None]:
# Save the comedy result as the table 'comedy_shows', with the 'path' option
# set to 'df_comedy'; write mode is 'overwrite'.
(
    df_comedy.write
    .mode('overwrite')
    .option("path", 'df_comedy')
    .saveAsTable('comedy_shows')
)

In [1]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one, under the app name "DF_Analysis".
ss = (
    SparkSession.builder
    .appName("DF_Analysis")
    .getOrCreate()
)

In [2]:
# Load the Disney+ CSV: header row on, schema inference on, multi-line records
# on, and the double quote used as both the quote and the escape character.
disney_raw = (
    ss.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiline", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .load("disney_plus_shows.csv")
)


In [3]:
# 1. How to print the schema (column names, types and nullability)
disney_raw.printSchema()

root
 |-- imdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- type: string (nullable = true)
 |-- rated: string (nullable = true)
 |-- year: string (nullable = true)
 |-- released_at: string (nullable = true)
 |-- added_at: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- director: string (nullable = true)
 |-- writer: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- language: string (nullable = true)
 |-- country: string (nullable = true)
 |-- awards: string (nullable = true)
 |-- metascore: string (nullable = true)
 |-- imdb_rating: string (nullable = true)
 |-- imdb_votes: string (nullable = true)



In [4]:
# 2. Display the first row, one field per line, without truncating values
disney_raw.show(n=1, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------
 imdb_id     | tt0147800                                                                                   
 title       | 10 Things I Hate About You                                                                  
 plot        | A pretty, popular teenager can't go out on a date until her ill-tempered older sister does. 
 type        | movie                                                                                       
 rated       | PG-13                                                                                       
 year        | 1999                                                                                        
 released_at | 31 Mar 1999                                                                                 
 added_at    | November 12, 2019                                                                           
 runtime     | 97 min       

In [5]:
# Print describe()'s summary rows (count, mean, stddev, min, max) for every column.
disney_raw.describe().show()

+-------+---------+---------------+--------------------+-------+--------+------------------+-----------+---------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+
|summary|  imdb_id|          title|                plot|   type|   rated|              year|released_at|       added_at|runtime|               genre|            director|              writer|              actors|            language|             country|              awards|         metascore|       imdb_rating|        imdb_votes|
+-------+---------+---------------+--------------------+-------+--------+------------------+-----------+---------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+
|

In [6]:
#3. Basic Statistical Information
# Same describe() summary, converted to a pandas DataFrame.
df_summary = disney_raw.describe().toPandas()

In [7]:
# Bare expression: the notebook renders the pandas summary as the cell output.
df_summary

Unnamed: 0,summary,imdb_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,director,writer,actors,language,country,awards,metascore,imdb_rating,imdb_votes
0,count,894,894,894,894,894,894,894,992,894,894,894,894,894,894,894,894,894.0,894.0,894.0
1,mean,,,,,,1994.323076923077,,,,,,,,,,,62.06164383561644,6.656427758816839,347.2451361867704
2,stddev,,,,,,24.130433648285116,,,,,,,,,,,15.776454944690588,1.0203516652049085,290.29838876105833
3,min,tt0019422,'Twas the Night,"""Doc McStuffins"" is an imaginative animated se...",episode,APPROVED,1928,01 Apr 1965,"April 1, 2020",1 h,"Action, Adventure, Biography, Drama, Family","Aaron Blaise, Robert Walker","A.A. Milne (books), Larry Clemmons (story), Ra...","'Weird Al' Yankovic, Sabrina Carpenter, Mekai ...",Brazilian Sign Language,Argentina,1 nomination.,19.0,1.5,1007917.0
4,max,tt9822474,Zootopia,"follows a young boy named Makoto, who gains su...",series,Unrated,2020–,,"October 1, 2019",,"Short, Fantasy",Zhong Yu,"Winston Hibler (story), Erdman Penner (story),...","Émile Genest, John Drainie, Tommy Tweed, Sandr...","Spanish, German","USA, UK, Australia, Canada",Won 7 Primetime Emmys. Another 23 wins & 30 no...,,,


---

## Data Cleansing

In [None]:
from pyspark.sql.functions import col, to_date, when

# Convert year to integer; values that are not exactly four digits become NULL.
# The pattern is a raw string so the backslash in \d reaches the regex engine
# as-is instead of being read as a Python string escape.
disney_raw = disney_raw.withColumn(
    "year",
    when(col("year").rlike(r"^\d{4}$"), col("year").cast("integer"))
    .otherwise(None)
)

# Convert 'released_at' to a date type, handling 'N/A' values.
# The sample rows show values such as "31 Mar 1999"; a plain cast("date") does
# not accept that layout, so the pattern is passed to to_date() explicitly.
# NOTE(review): assumes every non-'N/A' value uses the "dd MMM yyyy" layout — confirm.
# This cell reassigns disney_raw, so it is meant to run once per loaded frame.
disney_raw = disney_raw.withColumn(
    "released_at",
    when(col("released_at").isNotNull() & ~col("released_at").rlike("N/A"),
         to_date(col("released_at"), "dd MMM yyyy"))
    .otherwise(None)
)


---

In [None]:
# Last row of the columns at positions 1-6 of disney_raw.columns.
# A Spark DataFrame has no .last() method; tail(1) returns the final row as a
# one-element list of Row objects.
disney_raw.select((disney_raw.columns)[1:7]).tail(1)

In [None]:
# 4. Selecting specific columns
(
    disney_raw
    .select("title", "year", "imdb_rating")
    .show(5, truncate=False)
)

In [None]:
# 5. Filtering data: keep rows with year greater than 2017
(
    disney_raw
    .select("title", "year", "imdb_rating")
    .where(disney_raw["year"] > 2017)
    .show(5)
)

In [None]:
# 6 Filter with multiple conditions: rating above 8.0 AND year after 2019.
# Each comparison is parenthesized because & binds tighter than >.
(
    disney_raw
    .select("title", "year", "imdb_rating")
    .where((disney_raw["imdb_rating"] > 8.0) & (disney_raw["year"] > 2019))
    .show(5)
)

In [None]:
#7 Renaming cols: imdb_rating -> rating (returns a new DataFrame)
df_renamed = disney_raw.withColumnRenamed(
    "imdb_rating",
    "rating",
)

In [None]:
#df_renamed.columns

In [None]:
# Check the rename: "rating" is now selectable.
df_renamed.select("title", "year", "rating").show(5)

In [None]:
# 8. Drop cols
# drop() returns a new DataFrame without "plot"; df_renamed itself is unchanged.
df_dropped_plot = df_renamed.drop("plot")

In [8]:
# Number of columns in disney_raw (not in the dropped/renamed copies).
len(disney_raw.columns)

19

In [None]:
# 9 Drop Multiple cols: drop() accepts several column names at once
df_dropped_2_cols = df_renamed.drop(
    "plot",
    "rated",
)

In [None]:
#10 Dropping Rows: remove every row that has a NULL in any column
df_no_nulls = disney_raw.na.drop()

In [None]:
#11 Dropping Rows with nulls in specific cols (title or imdb_rating)
df_cleaned = disney_raw.na.drop(subset=["title", "imdb_rating"])

In [None]:
# 12. Dropping based on a condition
# filter()



In [None]:
#13 Drop duplicates: rows identical across all columns collapse to one
df_cleaned = disney_raw.drop_duplicates()

In [None]:
#14 Dropping duplicates in specific cols
# Keeps a single row per distinct title. dropDuplicates is called directly on
# the DataFrame; there is no `disney_raw` attribute to go through.
df_cleaned = disney_raw.dropDuplicates(["title"])

In [None]:
# 15 Unique: number of distinct titles
(
    disney_raw
    .select("title")
    .distinct()
    .count()
)


In [None]:
# 16 

In [None]:
# 17 Average rating by Year, listed in ascending year order
(
    disney_raw
    .groupBy("year")
    .agg({"imdb_rating": "avg"})
    .sort("year")
    .show()
)


In [None]:
# Aggregation and Grouping
from pyspark.sql import functions as F

# Row count per genre string, most frequent first.
(
    disney_raw
    .groupBy("genre")
    .count()
    .sort(F.col("count").desc())
    .show(10, truncate=False)
)


In [9]:
from pyspark.sql.functions import desc, col

# Same genre ranking, with NULL genres removed before grouping.
(
    disney_raw
    .where(col("genre").isNotNull())
    .groupBy("genre")
    .count()
    .sort(desc("count"))
    .show(5, truncate=False)
)


+---------------------------------------------+-----+
|genre                                        |count|
+---------------------------------------------+-----+
|Documentary                                  |53   |
|Comedy, Family                               |27   |
|Comedy, Drama, Family                        |19   |
|Animation, Short, Comedy, Family             |17   |
|Animation, Adventure, Comedy, Family, Fantasy|17   |
+---------------------------------------------+-----+
only showing top 5 rows



In [None]:
# 20 Add new cols
from pyspark.sql.functions import length

# title_length holds the character length of each title.
df_added_col = disney_raw.withColumn(
    "title_length",
    length(disney_raw["title"]),
)

In [None]:
# Show the new column next to the title it was computed from.
df_added_col.select('title', 'title_length').show(5, truncate=False)

In [None]:
# Short alias for disney_raw (same DataFrame object, no copy is made).
df = disney_raw

In [None]:
# Aggregate functions

In [None]:
# 1. count()
# Counts the rows of the single-column selection, i.e. the number of shows.
df.select("imdb_id").count()

In [None]:
# Check the column types before aggregating.
df.printSchema()

In [None]:
# Same schema printout as the cell before.
df.printSchema()

In [None]:
# change datatype of imdb_votes col to int
# Commas are stripped before the cast: a string such as "1,007,917" cannot be
# cast to int directly and would become NULL.
# NOTE(review): the column appears to hold comma-formatted counts (describe()
# reports a mean of ~347 next to a minimum above one million) — confirm.
df2 = disney_raw.withColumn(
    "imdb_votes",
    F.regexp_replace(disney_raw.imdb_votes, ",", "").cast("int"),
)

In [None]:
# sum(): total of the numeric imdb_votes column
df2.select(F.sum("imdb_votes")).show()

In [None]:
# distinct(): show 4 of the distinct imdb_votes values
df2.select("imdb_votes").distinct().show(4)


In [None]:
from pyspark.sql import functions  as F
# max(): largest imdb_votes value
df2.select(F.max("imdb_votes")).show()

In [None]:
# min(): smallest imdb_votes value
df2.select(F.min("imdb_votes")).show()


In [None]:
# Number of rows for each value of the "type" column.
df2.groupBy("type").count().show()


In [None]:
# approx_count_distinct: estimated number of distinct genre values
df2.select(F.approx_count_distinct("genre")).show()

In [None]:
# countDistinct : Uses a lot of memory compared to approx_count_distinct
# Returns the exact number of distinct genre values.
df2.select(F.countDistinct("genre")).show()

In [None]:
#2. approx_count_distinct
# Signature: F.approx_count_distinct(col, rsd=None)
#   rsd - maximum relative standard deviation allowed for the estimate.
# The signature is shown as a comment because it is not runnable code.

# Estimate the number of distinct genres with rsd = 0.05.
df2.select(F.approx_count_distinct("genre", 0.05)).show()

In [None]:
# 3. avg: mean of the imdb_rating column
df2.select(F.avg("imdb_rating")).show()


4. collect_list(col)
- Returns a list of all values from the specified column, including duplicates.

In [None]:
# Aggregate over the whole DataFrame: collect every title into one list column.
title_list = df2.agg(F.collect_list("title"))

In [None]:
# NOTE(review): title_list is a DataFrame, so count() returns its number of
# rows, not the number of titles inside the collected list — confirm intent.
title_list.count()

In [None]:
# 5. corr: correlation between imdb_rating and imdb_votes
df2.select(F.corr("imdb_rating", "imdb_votes")).show()

In [None]:
# covar_pop: population covariance of imdb_rating and imdb_votes
df2.select(F.covar_pop("imdb_rating", "imdb_votes")).show()


In [None]:
# covar_samp: sample covariance of imdb_rating and imdb_votes
df2.select(F.covar_samp("imdb_rating", "imdb_votes")).show()


In [None]:
# first: returns first non-null value of a column
# ignorenulls=True is what makes it skip NULLs; with the default
# (ignorenulls=False) F.first() returns the first value even when it is NULL.

df2.select(F.first("imdb_rating", ignorenulls=True)).show()

In [None]:
# last: returns last non-null value of a column
# ignorenulls=True is what makes it skip NULLs; with the default
# (ignorenulls=False) F.last() returns the last value even when it is NULL.

df2.select(F.last("imdb_rating", ignorenulls=True)).show()

In [None]:
# sumDistinct
# Plain-Python baseline: the built-in sum() adds every element of the list,
# repeated values included.

sum([10,20,30, 20, 10, 30, 30])

In [None]:
# sum(): adds every imdb_votes value, repeated values included
df2.select(F.sum("imdb_votes")).show()


In [None]:
# sum_distinct(): adds each distinct imdb_votes value once
df2.select(F.sum_distinct("imdb_votes")).show()


# df2.select(F.sum("imdb_votes")).show()

In [None]:
#df2.select(F.collect_list("imdb_votes")).show()

# collect_set: the distinct imdb_votes values gathered per genre.
(
    df2
    .groupBy("genre")
    .agg(F.collect_set("imdb_votes").alias('votes_list'))
    .show(100, truncate=False)
)

In [None]:
# Expressions
# 1. Increase the rating by 1, referencing the column as a DataFrame attribute
(
    df2
    .withColumn("adjusted_rating", df2.imdb_rating + 1)
    .select("title", "imdb_rating", "adjusted_rating")
    .show(2)
)

In [None]:
# Expressions
# 1. Increase the rating by 1, this time referencing the column via F.col
(
    df2
    .withColumn("adjusted_rating", F.col("imdb_rating") + 1)
    .select("title", "imdb_rating", "adjusted_rating")
    .show(2)
)

In [None]:
# 2. substring
# Preview of the columns before the substring examples. An empty string is
# not a valid column name, so only real columns are selected.
df2.select("title", "imdb_rating").show(2)

In [None]:
# release_year = the last four characters of released_at (start -4, length 4)
(
    df2
    .withColumn("release_year", F.substring("released_at", -4, 4))
    .select("title", "imdb_rating", "release_year", "released_at")
    .show(2)
)

In [None]:
# release_year = the third space-separated token of released_at (index 2)
(
    df2
    .withColumn("release_year", F.split("released_at", " ").getItem(2))
    .select("title", "imdb_rating", "release_year", "released_at")
    .show(2)
)

In [None]:
# Label each row by rating: >= 8 "Excellent", >= 6 "Good", anything else "Average".
(
    df2
    .withColumn(
        'rating_category',
        F.when(F.col("imdb_rating") >= 8, "Excellent")
        .when(F.col("imdb_rating") >= 6, "Good")
        .otherwise("Average"),
    )
    .select("title", "imdb_rating", "rating_category", "released_at")
    .show(2)
)

In [None]:
# UDFs
from pyspark.sql.types import StringType

def add_prefix(title):
    """Return ``title`` with "Disney's " put in front of it.

    Args:
        title: The title string, or ``None`` when the value is missing.

    Returns:
        The prefixed title, or ``None`` if ``title`` is ``None`` (instead of
        raising ``TypeError`` on the string concatenation).
    """
    if title is None:
        return None
    return "Disney's " + title

# Wrap the Python function as a Spark UDF with a string return type.
add_prefix_udf = F.udf(add_prefix, StringType())

# Rows containing any NULL are dropped before the UDF is applied.
df3 = df2.dropna()
(
    df3
    .withColumn("modified_title", add_prefix_udf(F.col("title")))
    .select("title", "modified_title", "imdb_rating", "released_at")
    .show(2)
)

In [None]:
# Rebinds df, which an earlier cell set to disney_raw.
# NOTE(review): `raw_data` is not defined in the cells visible here — confirm
# whether `disney_raw` was intended.
df = raw_data

In [None]:
# List the column names of df.
df.columns

In [None]:
# First ten rows of the descriptive columns.
(
    df
    .select('title', 'plot', 'type', 'rated', 'year')
    .show(10)
)