## Import Necessary Libraries / Dependencies

In [1]:
import pandas as pd


## Task 1: Determining what the most popular release year for 'Movies' on Netflix is.

In [2]:
netflix_df = pd.read_csv('/Users/emekaudemezue/Downloads/Data Analytics/Kaggle/Datasets/Raw/netflix_titles.csv')

In [3]:
netflix_df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

In [4]:
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# The dataframe contains information on both Movies & TV Shows but for purposes of this 
# task we need just the Movies data so we will need to filter to get the data 

In [5]:
movies_df = netflix_df[netflix_df['type']=='Movie']

In [6]:
movies_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...


In [7]:
movies_df['count'] = 1
release_year_count = movies_df.groupby(['release_year']).sum().reset_index()[['release_year','count']]
release_year_count.sort_values(['count'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['count'] = 1


Unnamed: 0,release_year,count
69,2018,767
68,2017,767
67,2016,658
70,2019,633
71,2020,517
...,...,...
12,1961,1
14,1963,1
17,1966,1
5,1947,1


## Task 2: Determining year with most added content on the Netflix platform

In [None]:
# Due to possible variations in the 'date_added' column and the fact that the column's 
#    property is 'object'. The column would need to be converted to the 'datetime' format and
#     then a new column created to hold just the year as we dont want to alter or change the 
#     information in the original 'date_added' column.

In [8]:
netflix_df['new_date'] = pd.to_datetime(netflix_df['date_added'])

netflix_df['date_added_year'] = netflix_df['new_date'].dt.year

netflix_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,new_date,date_added_year
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021-09-25,2021.0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021-09-24,2021.0


In [9]:
netflix_df['count'] = 1
date_added_count = netflix_df.groupby(['date_added_year']).sum().reset_index()[['date_added_year','count']]
date_added_count.sort_values(['count'], ascending=False)

Unnamed: 0,date_added_year,count
11,2019.0,2016
12,2020.0,1879
10,2018.0,1649
13,2021.0,1498
9,2017.0,1188
8,2016.0,429
7,2015.0,82
6,2014.0,24
3,2011.0,13
5,2013.0,11


##  Task 3: Aside 'Year', what is the most popular month to add new content?

In [10]:
netflix_df['date_added_month'] = netflix_df['new_date'].dt.month
netflix_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,new_date,date_added_year,count,date_added_month
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021-09-25,2021.0,1,9.0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021-09-24,2021.0,1,9.0


In [11]:
netflix_df['count'] = 1
month_added_count = netflix_df.groupby(['date_added_month']).sum().reset_index()[['date_added_month','count']]
month_added_count.sort_values(['count'], ascending=False)

Unnamed: 0,date_added_month,count
6,7.0,827
11,12.0,813
8,9.0,770
3,4.0,764
9,10.0,760
7,8.0,755
2,3.0,742
0,1.0,738
5,6.0,728
10,11.0,705


## Task 4: Movie with longest title in the dataset

In [12]:
movies_df = netflix_df[netflix_df['type']=='Movie']

In [13]:
movies_df['moviestitle_length'] = [len(title) for title in movies_df['title']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['moviestitle_length'] = [len(title) for title in movies_df['title']]


In [14]:
top_title_lengths = movies_df.sort_values(['moviestitle_length'], ascending=False)
top_title_lengths.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,new_date,date_added_year,count,date_added_month,moviestitle_length
5164,s5165,Movie,Jim & Andy: The Great Beyond - Featuring a Ver...,Chris Smith,Jim Carrey,"Canada, United States","November 17, 2017",2017,TV-MA,94 min,Documentaries,Through the lens of his stunningly immersive p...,2017-11-17,2017.0,1,11.0,104
8462,s8463,Movie,The Power of Grayskull: The Definitive History...,"Randall Lobb, Robert McCallum",,"Canada, United States","August 24, 2018",2017,TV-14,96 min,Documentaries,This colorful documentary explores the origins...,2018-08-24,2018.0,1,8.0,88
3240,s3241,Movie,Mike Birbiglia: What I Should Have Said Was No...,,Mike Birbiglia,United States,"November 22, 2019",2008,TV-MA,56 min,Stand-Up Comedy,Mike Birbiglia performs in this live concert s...,2019-11-22,2019.0,1,11.0,88
4851,s4852,Movie,Steve Martin and Martin Short: An Evening You ...,Marcus Raboy,"Steve Martin, Martin Short",United States,"May 25, 2018",2018,TV-14,74 min,Stand-Up Comedy,Longtime pals and showbiz icons Steve Martin a...,2018-05-25,2018.0,1,5.0,83
3479,s3480,Movie,Cultivating the Seas: History and Future of th...,Mana Yasuda,,,"September 28, 2019",2019,TV-PG,45 min,"Documentaries, International Movies",An in-depth look at the full-cycle breeding pr...,2019-09-28,2019.0,1,9.0,79


In [15]:
top_title_lengths.iloc[0]['title']

'Jim & Andy: The Great Beyond - Featuring a Very Special, Contractually Obligated Mention of Tony Clifton'

## Task 5: Actor/Actress with the most appearances in the Netflix Database

In [16]:
pd.options.display.max_colwidth = 400

# To display information that was hidden in column due to pandas column width restriction of '58'

In [17]:
title_and_cast_df = netflix_df[['title','cast']]
title_and_cast_df = title_and_cast_df.dropna() 
# titles with a 'NaN' result for 'cast' are dropped

In [28]:
cast_names_counter = {}

for index, row in title_and_cast_df.iterrows():
    movie_cast = row['cast']
    movie_cast_split = movie_cast.split(',')
    movie_cast_stripped = [name.strip().upper() for name in movie_cast_split] 
#   'Strip' method used to stripped extra white spaces for result to allow for truer count result
#     'Upper' method used to make results more uniform

    for name in movie_cast_stripped:
        cast_names_counter[name] = cast_names_counter.get(name, 0) + 1

dict(sorted(cast_names_counter.items(), key=lambda item: item[1], reverse = True))

{'ANUPAM KHER': 43,
 'SHAH RUKH KHAN': 35,
 'JULIE TEJWANI': 33,
 'NASEERUDDIN SHAH': 32,
 'TAKAHIRO SAKURAI': 32,
 'RUPA BHIMANI': 31,
 'AKSHAY KUMAR': 30,
 'OM PURI': 30,
 'YUKI KAJI': 29,
 'AMITABH BACHCHAN': 28,
 'PARESH RAWAL': 28,
 'BOMAN IRANI': 27,
 'RAJESH KAVA': 26,
 'VINCENT TONG': 26,
 'ANDREA LIBMAN': 25,
 'KAREENA KAPOOR': 25,
 'SAMUEL L. JACKSON': 24,
 'JOHN CLEESE': 24,
 'JIGNA BHARDWAJ': 23,
 'FRED TATASCIORE': 23,
 'TARA STRONG': 23,
 'DAISUKE ONO': 22,
 'JUNICHI SUWABE': 21,
 'AJAY DEVGN': 21,
 'ASHLEIGH BALL': 21,
 'KAY KAY MENON': 21,
 'NICOLAS CAGE': 21,
 'NAWAZUDDIN SIDDIQUI': 21,
 'ADAM SANDLER': 20,
 'DAVID ATTENBOROUGH': 20,
 'SALMAN KHAN': 20,
 'DAVID SPADE': 19,
 'YUICHI NAKAMURA': 19,
 'ERIN FITZGERALD': 19,
 'MAMORU MIYANO': 19,
 'MORGAN FREEMAN': 19,
 'JAMES FRANCO': 19,
 'SETH ROGEN': 19,
 'AI KAYANO': 19,
 'FRED ARMISEN': 19,
 'GULSHAN GROVER': 19,
 'VATSAL DUBEY': 18,
 'RAJESH SHARMA': 18,
 'FORTUNE FEIMSTER': 18,
 'DIANA KAARINA': 18,
 'YASHPAL SHARMA