In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import*
from pyspark.sql.window import Window
from pyspark.sql.functions import*
# Obtain the notebook's Spark session (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('Neflix_data')
    .getOrCreate()
)


In [0]:
# Explicit schema for raw_titles.csv: (column name, Spark type) pairs, every column nullable.
raw_titles_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("id", StringType()),                    # title identifier
        ("title", StringType()),
        ("type", StringType()),                  # kind of content
        ("release_year", IntegerType()),
        ("age_certification", StringType()),
        ("runtime", IntegerType()),
        ("genres", StringType()),                # kept as a raw string, not parsed into an array
        ("production_countries", StringType()),  # kept as a raw string, not parsed into an array
        ("seasons", IntegerType()),
        ("imdb_id", StringType()),
        ("imdb_score", DecimalType(3, 1)),       # precision 3, scale 1
        ("imdb_votes", IntegerType()),
    ]
])

# Load the titles file with the schema above; the first line of the file is a header.
raw_titles_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(raw_titles_schema)
    .load("/FileStore/tables/raw_titles.csv")
)
#raw_titles_df.show()



In [0]:
# Explicit schema for raw_credits.csv: (column name, Spark type) pairs, every column nullable.
raw_credits_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("person_id", IntegerType()),
        ("id", StringType()),         # title identifier the credit belongs to
        ("name", StringType()),       # person's name
        ("character", StringType()),
        ("role", StringType()),       # e.g. ACTOR / DIRECTOR in the displayed output
    ]
])

# Load the credits file with the schema above; the first line of the file is a header.
raw_credits_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(raw_credits_schema)
    .load("/FileStore/tables/raw_credits.csv")
)

In [0]:
# Explicit schema for Best_Shows_Netflix.csv: (column name, Spark type) pairs, every column nullable.
best_shows_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("title", StringType()),
        ("release_year", IntegerType()),
        ("score", DecimalType(4, 2)),        # precision 4, scale 2
        ("number_of_votes", IntegerType()),
        ("duration", IntegerType()),
        ("number_of_seasons", IntegerType()),
        ("main_genre", StringType()),
        ("main_production", StringType()),   # country code in the displayed output (e.g. US)
    ]
])

# Load the best-shows file with the schema above; the first line of the file is a header.
best_shows_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(best_shows_schema)
    .load("/FileStore/tables/Best_Shows_Netflix.csv")
)

In [0]:
# Explicit schema for Best_Show_by_Year_Netflix.csv: (column name, Spark type) pairs, every column nullable.
best_show_by_year_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("title", StringType()),
        ("release_year", IntegerType()),
        ("score", DecimalType(4, 2)),        # precision 4, scale 2
        ("number_of_seasons", IntegerType()),
        ("main_genre", StringType()),
        ("main_production", StringType()),   # country code in the displayed output (e.g. US)
    ]
])

# Load the best-show-by-year file with the schema above; the first line of the file is a header.
best_show_by_year_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(best_show_by_year_schema)
    .load("/FileStore/tables/Best_Show_by_Year_Netflix.csv")
)

In [0]:
# Explicit schema for Best_Movies_Netflix.csv: (column name, Spark type) pairs, every column nullable.
best_movies_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("title", StringType()),
        ("release_year", IntegerType()),
        ("score", DecimalType(4, 2)),        # precision 4, scale 2
        ("number_of_votes", IntegerType()),
        ("duration", IntegerType()),
        ("main_genre", StringType()),
        ("main_production", StringType()),   # country code in the displayed output (e.g. US)
    ]
])

# Load the best-movies file with the schema above; the first line of the file is a header.
best_movies_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(best_movies_schema)
    .load("/FileStore/tables/Best_Movies_Netflix.csv")
)

In [0]:
# Explicit schema for Best_Movie_by_Year_Netflix.csv: (column name, Spark type) pairs, every column nullable.
best_movie_by_year_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("title", StringType()),
        ("release_year", IntegerType()),
        ("score", DecimalType(4, 2)),        # precision 4, scale 2
        ("main_genre", StringType()),
        ("main_production", StringType()),   # country code in the displayed output (e.g. US)
    ]
])

# Load the best-movie-by-year file with the schema above; the first line of the file is a header.
best_movie_by_year_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(best_movie_by_year_schema)
    .load("/FileStore/tables/Best_Movie_by_Year_Netflix.csv")
)

In [0]:
# Keep a single row per release year before counting.
best_movie_by_year_df = best_movie_by_year_df.dropDuplicates(['release_year'])

# Most frequent production country among the per-year best movies (one row).
production_counts = best_movie_by_year_df.groupBy("main_production").count()
top_production_and_genre_df = (
    production_counts
    .orderBy(desc("count"))
    .select("main_production")
    .limit(1)
)

# Most frequent genre among the per-year best movies (one row).
genre_counts = best_movie_by_year_df.groupBy("main_genre").count()
top_genre = (
    genre_counts
    .orderBy(desc("count"))
    .select("main_genre")
    .limit(1)
)

# Both inputs hold a single row, so the cross join yields one (production, genre) row.
top_production_and_genre_df = top_production_and_genre_df.crossJoin(top_genre)

top_production_and_genre_df.show()

+---------------+----------+
|main_production|main_genre|
+---------------+----------+
|             US|     drama|
+---------------+----------+



In [0]:
# Checking for duplicates and removing them.
# Rows that share (title, release_year) are treated as duplicates of one movie.
# The window previously ordered by release_year, which is constant inside each
# partition, so row_number() picked an arbitrary row and the surviving record
# could change between runs. Ordering by votes, then score, then duration makes
# the choice reproducible: the most-voted record of each group is kept.
window = Window.partitionBy("title", "release_year").orderBy(
    col("number_of_votes").desc_nulls_last(),
    col("score").desc_nulls_last(),
    col("duration").desc_nulls_last(),
)
best_movies_check_duplicates_df = best_movies_df.withColumn("rn", row_number().over(window))

# Keep the first row of every group and drop the helper column again.
best_movies_df = (
    best_movies_check_duplicates_df
    .filter(col("rn") == 1)
    .select("title", "release_year", "score", "number_of_votes", "duration", "main_genre", "main_production")
)
display(best_movies_df.filter(col("release_year") == 2016))

title,release_year,score,number_of_votes,duration,main_genre,main_production
13th,2016,8.2,34914,100,documentary,US
A Monster Calls,2016,7.5,86614,108,fantasy,ES
A Silent Voice: The Movie,2016,8.1,75132,130,romance,JP
Amanda Knox,2016,6.9,23969,92,crime,DK
Blue Jay,2016,7.3,17033,81,romance,US
Bo Burnham: Make Happy,2016,8.4,14356,60,comedy,US
Christine,2016,6.9,14977,115,drama,US
Dangal,2016,8.4,180247,161,action,IN
Gantz:O,2016,7.1,14501,95,animation,JP
Hell or High Water,2016,7.6,224900,102,western,US


In [0]:
# Movies information: one row per catalogue id with its cast & crew map.

# Match every best-movie row to its catalogue entry. Title alone is not a unique
# key (the same title can exist for several release years), so release_year is
# part of the join key; joining on title only could attach the wrong id and
# credits to a movie. An inner join replaces the former left join followed by
# an "id is not null" filter; the filter is kept for catalogue rows without an id.
movies = (
    best_movies_df
    .join(raw_titles_df, on=["title", "release_year"], how="inner")
    .filter(col("id").isNotNull())
    .select(
        "id", "title", "release_year", "duration", "imdb_id", "imdb_score",
        "number_of_votes", "genres", "main_production", "age_certification",
    )
    .dropDuplicates(["id"])
)

# Credits of the selected movies only. Reading them straight from raw_credits_df
# (instead of from the movie/credit join) means a credit is never counted twice.
# Rows without a role are skipped because a null cannot be used as a map key.
movie_credits = (
    raw_credits_df
    .join(movies.select("id"), on="id", how="left_semi")
    .filter(col("role").isNotNull())
)

# One comma-separated string of names per (movie, role) ...
names_by_role = movie_credits.groupBy("id", "role").agg(
    concat_ws(", ", collect_list("name")).alias("names")
)

# ... folded into a role -> names map per movie. Collecting (role, names) structs
# keeps each key paired with its own value, which two independent collect_list
# calls do not guarantee.
movies_map = names_by_role.groupBy("id").agg(
    map_from_entries(collect_list(struct("role", "names"))).alias("cast_&_crew")
)

# Inner join: movies without any credit row are dropped, as before.
movies = movies.join(movies_map, on='id', how='inner').select(
    "id", "title", "release_year", "duration", "imdb_id", "imdb_score",
    "number_of_votes", "genres", "main_production", "age_certification", "cast_&_crew",
)
display(movies.orderBy(col("imdb_score").desc()))



id,title,release_year,duration,imdb_id,imdb_score,number_of_votes,genres,main_production,age_certification,cast_&_crew
tm853783,David Attenborough: A Life on Our Planet,2020,83,tt11989890,9.0,31180,['documentation'],GB,PG,"Map(ACTOR -> Max Hughes, David Attenborough, DIRECTOR -> Alastair Fothergill, Jonathan Hughes, Keith Scholey)"
tm122434,Forrest Gump,1994,142,tt0109830,8.8,1994599,"['drama', 'romance', 'comedy']",US,PG-13,"Map(ACTOR -> Michael Oliver, Jim Hanks, Tyler Long, W. Benson Terry, Bob Harks, Jeffrey Winner, Kurt Russell, Mary Ellen Trainor, Robb Skyler, William Shipman, Brendan Shanahan, Jacqueline Lovell, Aaron Michael Lacey, Zach Hanner, Bryan Hanna, Troy Christian, Greg Brown, Bob Penny, Timothy McNeil, Charles Boswell, Michael Mattison, Hallie D'Amore, Nora Dunfee, Matt Rebenkoff, Lazarus Jackson, Joe Alaskey, Marla Sucharetza, Tiffany Salerno, Dick Cavett, Emily Carey, Vanessa Roth, Geoffrey Blake, Michael Jace, Kevin Davis, Richard D'Alessandro, Isabel Rose, John William Galt, Stephen Bridgewater, Steve DeRelian, Byron Minns, Michael McFall, Steven Griffith, Michael Burgess, Daniel J. Gillooly, John Voldstad, Michael Kemmerling, Mike Jolly, Matt Wallace, Don Fischer, Jed Gillin, Joe Washington, Al Harrington, Mark Matheisen, Kitty K. Green, Marlena Smalls, Kirk Ward, David Brisbin, Daniel C. Striepeke, Brett Rice, Kevin Mangan, Christopher Jones, Margo Moorer, Sam Anderson, George Kelly, Sonny Shroyer, Peter Dobson, Afemo Omilami, Siobhan Fallon Hogan, Haley Joel Osment, Hanna Hall, Michael Conner Humphreys, Sally Field, Mykelti Williamson, Gary Sinise, Robin Wright, Tom Hanks, DIRECTOR -> Robert Zemeckis)"
tm92641,Inception,2010,148,tt1375666,8.8,2268288,"['scifi', 'music', 'thriller', 'action']",GB,PG-13,"Map(ACTOR -> Daniel Girondeaud, Shannon Welles, Lisa Reynolds, Andrew Pleavin, Felix Scott, Michael Gaston, Peter Basham, Nicole Pulliam, Alex Lombard, Jill Maddrell, Carl Gilliard, Natasha Beaumont, Jack Murray, Adam Cole, Mark Fleischmann, Helena Cullinan, Magnus Nolan, Tai-Li Lee, Marc Raducci, Jean-Michel Dagory, Nicolas Clerc, Virgile Bramly, Silvie Laguna, Coralie Dedykere, Tim Kelleher, Russ Fega, Miranda Nolan, Ryan Hayward, Earl Cameron, Yuji Okumoto, Johnathan Geare, Claire Geare, Taylor Geare, Tohoru Masamune, Talulah Riley, Lukas Haas, Michael Caine, Pete Postlethwaite, Marion Cotillard, Tom Berenger, Cillian Murphy, Dileep Rao, Elliot Page, Tom Hardy, Ken Watanabe, Joseph Gordon-Levitt, Leonardo DiCaprio, DIRECTOR -> Christopher Nolan)"
tm1038686,Bo Burnham: Inside,2021,87,tt14545352,8.7,44074,"['comedy', 'drama', 'music', 'reality']",US,R,"Map(ACTOR -> Bo Burnham, DIRECTOR -> Bo Burnham)"
tm129763,Anbe Sivam,2003,160,tt0367495,8.7,20595,"['comedy', 'drama']",IN,,"Map(ACTOR -> Madhan, Krishnamoorthy, Kalairani, Poovilangu Mohan, Benjamin, Nellai Siva, Pasi Sathya, Uma Riyaz Khan, Muthukalai, Ilavarasu, Balu Anand, Vichu Vishwanath, R.S. Shivaji, Yugi Sethu, Seema, Santhana Bharathi, Kiran Rathod, Nassar, R. Madhavan, Kamal Haasan, DIRECTOR -> Sundar C)"
tm57554,Saving Private Ryan,1998,169,tt0120815,8.6,1346020,"['drama', 'war']",US,R,"Map(ACTOR -> Vincent Ventresca, Leo Stransky, Derek Lea, Mac Steinmeier, Nina Muschallik, Thomas Gizbert, Rob Freeman, Kathleen Byron, Harrison Young, Amanda Boxer, Valerie Colgan, Eric Loren, David Wohl, Bryan Cranston, Dale Dye, Harve Presnell, Dorothy Grumbar, John Walters, Nick Brooks, Ryan Hurst, David Vegh, Leland Orser, Nathan Fillion, Anna Maguire, Stephane Cornicard, Stephan Grothgar, Tilo Keiner, Erich Redman, Sam Ellis, Nigel Whitmey, Raffaello Degruttola, Martin Hub, Crofton Hardester, Glenn Wrage, John Sharian, Grahame Wood, Vincent Walsh, Matthew Sharp, Andrew Scott, Lee Aaron Rosen, Mark Phillips, Martin McDougall, Laird Macintosh, Shane Johnson, Paul Hickey, Paschal Friel, Aiden Condron, Maclean Burke, John Barnett, Loclann Aiken, Corey Johnson, Rolf Saxon, Adam Shaw, Ronald Longridge, Seamus McQuade, Paul Garcia, Peter Miles, Neil Finnighan, Markus Napier, Marc Cass, William Marsh, Steve Griffin, Julian Spencer, Gary Sefton, Ian Porter, Demetri Goritsas, Daniel Cerqueira, Dylan Bruno, Max Martini, Joerg Stadler, Dennis Farina, Paul Giamatti, Jeremy Davies, Ted Danson, Giovanni Ribisi, Vin Diesel, Adam Goldberg, Barry Pepper, Matt Damon, Edward Burns, Tom Sizemore, Tom Hanks, DIRECTOR -> Steven Spielberg)"
tm104880,Louis C.K.: Hilarious,2010,84,tt1421373,8.4,11973,['comedy'],US,,"Map(ACTOR -> Louis C.K., DIRECTOR -> Louis C.K.)"
tm142564,3 Idiots,2009,170,tt1187043,8.4,385782,"['comedy', 'drama']",IN,PG-13,"Map(ACTOR -> Arun Bali, Ali Fazal, Javed Jaffrey, Rahul Kumar, Supriya Shukla, Rajeev Ravindranathan, Atul Tiwari, Akhil Mishra, Jayant Kripalani, Chaitali Bose, Achyut Potdar, Olivier Lafont, Mona Singh, Mukund Bhatt, Amardeep Jha, Parikshat Sahni, Farida Dadi, Omi Vaidya, Boman Irani, Kareena Kapoor Khan, R. Madhavan, Sharman Joshi, Aamir Khan, DIRECTOR -> Rajkumar Hirani)"
tm242736,Bo Burnham: Make Happy,2016,60,tt5777636,8.4,14356,"['comedy', 'music', 'documentation']",US,,"Map(ACTOR -> Lorene Scafaria, Bo Burnham, DIRECTOR -> Bo Burnham, Christopher Storer)"
tm245671,Dangal,2016,161,tt5074352,8.4,180247,"['action', 'drama', 'sport']",IN,PG,"Map(ACTOR -> Gurpreet Toti, Shishir Sharma, Ansh Rathore, Meenu Prajapati, Sumit Khanna, Olamilekan Akanbi Jason, Jagbir, Badrul Islam, Ishika Gagneja, Karamveer Choudhary, Anmol Charan, Mahesh Balraj, Anurag Arora, Ravi Aneja, Ritwik Sahore, Vivan Bhatena, Girish Kulkarni, Suhani Bhatnagar, Zaira Wasim, Aparshakti Khurana, Sakshi Tanwar, Sanya Malhotra, Fatima Sana Shaikh, Aamir Khan, DIRECTOR -> Nitesh Tiwari)"


In [0]:
# Getting highly appreciated movie genre trends across years.

# Number of best movies per (release_year, main_genre).
genre_counts = best_movies_df.groupBy("release_year", "main_genre").agg(
    count("main_genre").alias("no_of_best_movies")
)

# Rank the genres inside each year by that count. The earlier version numbered
# rows inside a (release_year, main_genre) partition, so "rn == 1" kept every
# genre of the year instead of the most frequent one.
window_genre = Window.partitionBy("release_year").orderBy(col("no_of_best_movies").desc())

# rank() (not row_number()) so genres tied for first place are all kept; the
# list is sorted to make the output stable between runs.
accepted_genere = (
    genre_counts
    .withColumn("rn", rank().over(window_genre))
    .filter(col("rn") == 1)
    .groupBy("release_year")
    .agg(sort_array(collect_list("main_genre")).alias("top_movie_genre_of_the_year"))
    .orderBy("release_year")
)

# Right join: every year present in best_movies_df is kept, even when the
# by-year table has no movie for it (those columns are then null).
year_wise_best_df = best_movie_by_year_df.join(accepted_genere, on="release_year", how="right").orderBy("release_year")


display(year_wise_best_df)



release_year,title,score,main_genre,main_production,top_movie_genre_of_the_year
1954,White Christmas,7.5,romance,US,List(romance)
1961,The Guns of Navarone,7.5,war,US,List(war)
1964,My Fair Lady,7.8,drama,US,List(drama)
1966,,,,,List(western)
1967,Bonnie and Clyde,7.7,drama,US,List(drama)
1971,Dirty Harry,7.7,thriller,US,List(thriller)
1973,The Exorcist,8.1,horror,US,List(horror)
1975,Monty Python and the Holy Grail,8.2,comedy,GB,List(comedy)
1976,Taxi Driver,8.3,crime,US,List(crime)
1979,Life of Brian,8.0,comedy,GB,List(comedy)


In [0]:
# Getting highly appreciated show genre trends across years.

# Number of best shows per (release_year, main_genre).
show_genre_counts = best_shows_df.groupBy("release_year", "main_genre").agg(
    count("main_genre").alias("no_of_best_shows")
)

# Rank the genres inside each year by that count. The earlier version numbered
# rows inside a (release_year, main_genre) partition, so "rn == 1" kept every
# genre of the year instead of the most frequent one.
window_genre_shows = Window.partitionBy("release_year").orderBy(col("no_of_best_shows").desc())

# rank() (not row_number()) so genres tied for first place are all kept; the
# list is sorted to make the output stable between runs.
accepted_genere_shows = (
    show_genre_counts
    .withColumn("rn", rank().over(window_genre_shows))
    .filter(col("rn") == 1)
    .groupBy("release_year")
    .agg(sort_array(collect_list("main_genre")).alias("top_show_genre_of_the_year"))
    .orderBy("release_year")
)

# Right join: every year present in best_shows_df is kept, even when the
# by-year table has no show for it (those columns are then null).
year_wise_best_shows_df = best_show_by_year_df.join(accepted_genere_shows, on="release_year", how="right").orderBy("release_year")


display(year_wise_best_shows_df)



release_year,title,score,number_of_seasons,main_genre,main_production,top_show_genre_of_the_year
1969,Monty Python's Flying Circus,8.8,4,comedy,GB,List(comedy)
1989,Seinfeld,8.9,9,comedy,US,List(comedy)
1993,Star Trek: Deep Space Nine,8.1,7,scifi,US,List(scifi)
1995,Neon Genesis Evangelion,8.5,1,scifi,JP,List(scifi)
1997,Stargate SG-1,8.4,10,scifi,US,List(scifi)
1998,Cowboy Bebop,8.9,1,western,JP,List(western)
1999,One Piece,8.8,21,action,JP,List(action)
2000,Gilmore Girls,8.2,8,comedy,US,"List(action, comedy)"
2001,Trailer Park Boys,8.6,12,comedy,CA,List(comedy)
2002,Naruto,8.4,6,scifi,JP,List(scifi)
