In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [56]:
df=pd.read_csv("netflix_titles.csv",encoding='latin1')

## Step 1: Understand the Data (Exploration)
    (1) How many total rows and columns are in the dataset?
    (2) What is the data type of each column (e.g., number, text, date)?
    (3) Are there any missing values in any column? If yes, which column has the most missing values?
    (4) Is there more content categorized as "Movie" or "TV Show" on Netflix? Show the count for each.
    (5) Initial Preview: Display the first 10 and last 10 rows of the dataset to see how the data is recorded.
    (6) Unique Values: How many unique categories are there in columns like type, rating, and country?
    (7) Value Counts: What are the counts of each category within the type column (i.e., how many Movies vs. TV
     Shows)? What are the top 10 most frequent values in country, director, and listed_in (genre)?
    (8) Numerical Summary: What are the basic descriptive statistics (mean, median, standard deviation, min, max) for numerical columns like release_year?
    (9) Timespan of Data: What is the earliest and latest release_year in the dataset? What about the date_added?

In [57]:
# 1. How many total rows and columns are in the dataset?
print(f"Total rows = {df.shape[0]} , column = {df.shape[1]}")

Total rows = 8807 , column = 12


In [58]:
# 2. What is the data type of each column (e.g., number, text, date)?
print(df.dtypes)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [59]:
# 3. Are there any missing values in any column? If yes, which column has the most missing values?

# print(f"Most missing values column = {df.isnull().sum().index[np.argmax(df.isnull().sum().values)]}")

# best way
print(f"Most missing values column = {df.isnull().sum().idxmax()}")

Most missing values column = director


In [60]:
# 4. Is there more content categorized as "Movie" or "TV Show" on Netflix? Show the count for each.
print(f"there more content categorized as = {df.groupby('type')['show_id'].count().idxmax()}")
print(f"count for each = {df.groupby('type')['show_id'].count()}")

there more content categorized as = Movie
count for each = type
Movie      6131
TV Show    2676
Name: show_id, dtype: int64


In [61]:
# 5. Initial Preview: Display the first 10 and last 10 rows of the dataset to see how the data is recorded.
print(f"first 10 = \n {df.head(10)}")
print(f"lst 10 = \n {df.tail(10)}")

first 10 = 
   show_id     type                             title  \
0      s1    Movie              Dick Johnson Is Dead   
1      s2  TV Show                     Blood & Water   
2      s3  TV Show                         Ganglands   
3      s4  TV Show             Jailbirds New Orleans   
4      s5  TV Show                      Kota Factory   
5      s6  TV Show                     Midnight Mass   
6      s7    Movie  My Little Pony: A New Generation   
7      s8    Movie                           Sankofa   
8      s9  TV Show     The Great British Baking Show   
9     s10    Movie                      The Starling   

                         director  \
0                 Kirsten Johnson   
1                             NaN   
2                 Julien Leclercq   
3                             NaN   
4                             NaN   
5                   Mike Flanagan   
6  Robert Cullen, JosÃ© Luis Ucha   
7                    Haile Gerima   
8                 Andy Devonshire   


In [62]:
# 6. Unique Values: How many unique categories are there in columns like type, rating, and country?

# print(f"type = {df['type'].unique().size}")
# print(f"rating = {df['rating'].unique().size}")
# print(f"country = {df['country'].unique().size}")

# Best Way
print(df[['type','rating','country']].nunique())

type         2
rating      17
country    748
dtype: int64


In [63]:
# 7. Value Counts: What are the counts of each category within the type column (i.e., how many Movies vs. TV Shows)? What are the top 10 most frequent values in country, director, and listed_in (genre)?

# print(f"count for each = {df.groupby('type')['show_id'].count()}")
# print(f"country = {df.groupby('country')['show_id'].count().sort_values(ascending=False).head(10)}")
# print(f"director = {df.groupby('director')['show_id'].count().sort_values(ascending=False).head(10)}")
# print(f'listed in = {df.groupby('listed_in')['show_id'].count().sort_values(ascending=False).head(10)}')

# best way
print("--- Type Counts ---")
print(df['type'].value_counts())
print("\n" + "="*30 + "\n") # For clean separation

print("--- Top 10 Countries ---")
print(df['country'].value_counts().head(10))
print("\n" + "="*30 + "\n")

print("--- Top 10 Directors ---")
print(df['director'].value_counts().head(10))
print("\n" + "="*30 + "\n")

print("--- Top 10 Genres (listed_in) ---")
print(df['listed_in'].value_counts().head(10))

--- Type Counts ---
type
Movie      6131
TV Show    2676
Name: count, dtype: int64


--- Top 10 Countries ---
country
United States     2818
India              972
United Kingdom     419
Japan              245
South Korea        199
Canada             181
Spain              145
France             124
Mexico             110
Egypt              106
Name: count, dtype: int64


--- Top 10 Directors ---
director
Rajiv Chilaka              19
RaÃºl Campos, Jan Suter    18
Suhas Kadav                16
Marcus Raboy               16
Jay Karas                  14
Cathy Garcia-Molina        13
Martin Scorsese            12
Youssef Chahine            12
Jay Chapman                12
Steven Spielberg           11
Name: count, dtype: int64


--- Top 10 Genres (listed_in) ---
listed_in
Dramas, International Movies                        362
Documentaries                                       359
Stand-Up Comedy                                     334
Comedies, Dramas, International Movies            

In [64]:
# 8. Numerical Summary: What are the basic descriptive statistics (mean, median, standard deviation, min, max) for numerical columns like release_year?
print(df.describe())

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000


In [65]:
# 9. Timespan of Data: What is the earliest and latest release_year in the dataset? What about the date_added?
# print(df['release_year'].max())
# print(df['release_year'].min())
df['date_added']=pd.to_datetime(df['date_added'].str.strip())
print(df['date_added'].max())
print(df['date_added'].min())

2021-09-25 00:00:00
2008-01-01 00:00:00



## Step 2: Clean the Data (Cleaning & Preprocessing)
    (1) How would you handle the missing values in the date_added column? Would you remove them or fill them with a
     default value?
    (2) The rating column also has missing values. Fill them with the most frequently occurring rating (the mode).
    (3)The cast and director columns also have null values. For now, you can leave them, but think about how you might
     handle them.
    (4) Handling Missing Directors/Cast/Country: Decide on a strategy for the missing values in director, cast, and country. Should you fill them
     with a placeholder like "Unknown" or drop the rows? Justify your choice.
    (5) Fixing Missing Ratings: The rating column has missing values. How would you fill them? A good approach is to use the mode (the most
     frequently occurring rating).
    (6) Managing Missing Dates: Address the missing values in date_added. Is it better to drop these few rows or fill them with the release_year?
    (7) Data Type Conversion: The date_added column is currently a string (object). Convert it to a proper datetime format so you can perform
    time-based analysis.
    (8) Cleaning the Duration Column: The duration column contains mixed data (e.g., "100 min" for movies, "2 Seasons" for TV shows). Separate the
     dataset into two: one for Movies and one for TV Shows. For the Movie dataset, convert the duration column to a numerical type (integer) by
      removing "min". Do the same for the TV Show dataset, converting seasons to an integer.
    (9) Handling Duplicates: Check if there are any duplicate rows in the dataset and remove them if they exist.
    (10) Standardizing Genres: The listed_in column often contains multiple genres in one string (e.g., "Dramas, International Movies"). Should you
     split these into a list or just use the primary genre for analysis?

In [66]:
# 1. How would you handle the missing values in the date_added column? Would you remove them or fill them with a default value?
df.dropna(subset=['date_added'],inplace=True)

In [67]:
# 2. The rating column also has missing values. Fill them with the most frequently occurring rating (the mode).
df['rating'].fillna(df['rating'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mode()[0],inplace=True)


In [68]:
# 3. The cast and director columns also have null values. For now, you can leave them, but think about how you might handle them.
df.info()
'''
Strategy 1: Fill with a Placeholder (Jagya Purvi)
    - This is the most common and safest approach for text columns like cast and director.
    - What to do: Instead of leaving the cells blank, we can fill them with a placeholder word like "Unknown", "Not Available", or "No Director".
    - Why it's a good idea:
        - No Data Loss: We don't have to delete the entire row. We can still use the information from other columns like release_year, country, and rating for our analysis.
        - Clarity: It clearly tells us that the information was missing, not that the movie had no director or cast.
        - Easy Analysis: We can later easily count how many movies have an "Unknown" director or filter them out if needed.

Strategy 2: Do Nothing (Kaai Na Karvu)
    - Sometimes, the best action is no action.
    - What to do: Leave the missing values as they are (NaN).
    - Why it might work:
        - Specific Analysis: If you are only analyzing trends by country or release year, the director and cast columns don't matter. Pandas is smart enough to ignore missing values during many calculations (like counting).
        - Honesty: It keeps the data in its original, raw state.
    - The downside: This can cause errors or unexpected results in certain analyses, especially if you try to search for a specific director's name.

Strategy 3: Deleting Rows (Rows Kadhi Nakhvi)
    - This is a more aggressive approach and generally not recommended for these columns.
    - What to do: Delete any row where the director or cast information is missing.
    - Why it's risky:
        - Major Data Loss: The director column has thousands of missing values. Deleting all those rows would mean throwing away a large portion of our dataset (almost 30%). This would make our overall analysis much less reliable.
'''

<class 'pandas.core.frame.DataFrame'>
Index: 8797 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8797 non-null   object        
 1   type          8797 non-null   object        
 2   title         8797 non-null   object        
 3   director      6173 non-null   object        
 4   cast          7972 non-null   object        
 5   country       7967 non-null   object        
 6   date_added    8797 non-null   datetime64[ns]
 7   release_year  8797 non-null   int64         
 8   rating        8797 non-null   object        
 9   duration      8794 non-null   object        
 10  listed_in     8797 non-null   object        
 11  description   8797 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 893.4+ KB


'\nStrategy 1: Fill with a Placeholder (Jagya Purvi)\n    - This is the most common and safest approach for text columns like cast and director.\n    - What to do: Instead of leaving the cells blank, we can fill them with a placeholder word like "Unknown", "Not Available", or "No Director".\n    - Why it\'s a good idea:\n        - No Data Loss: We don\'t have to delete the entire row. We can still use the information from other columns like release_year, country, and rating for our analysis.\n        - Clarity: It clearly tells us that the information was missing, not that the movie had no director or cast.\n        - Easy Analysis: We can later easily count how many movies have an "Unknown" director or filter them out if needed.\n\nStrategy 2: Do Nothing (Kaai Na Karvu)\n    - Sometimes, the best action is no action.\n    - What to do: Leave the missing values as they are (NaN).\n    - Why it might work:\n        - Specific Analysis: If you are only analyzing trends by country or rele

In [69]:
# 4. Handling Missing Directors/Cast/Country: Decide on a strategy for the missing values in director, cast, and country. Should you fill them with a placeholder like "Unknown" or drop the rows? Justify your choice.
df['director'].fillna('Unknown', inplace=True)
df['cast'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Unknown', inplace=True)


In [70]:
# 5. Fixing Missing Ratings: The rating column has missing values. How would you fill them? A good approach is to use the mode (the most frequently occurring rating).
# Already done in step 2

In [71]:
# 6. Managing Missing Dates: Address the missing values in date_added. Is it better to drop these few rows or fill them with the release_year?
# Already done in step 1

In [72]:
# 7. Data Type Conversion: The date_added column is currently a string (object). Convert it to a proper datetime format so you can perform time-based analysis.
# Already done in exploration step 9

In [73]:
# 8. Cleaning the Duration Column: The duration column contains mixed data (e.g., "100 min" for movies, "2 Seasons" for TV shows). Separate the dataset into two: one for Movies and one for TV Shows. For the Movie dataset, convert the duration column to a numerical type (integer) by removing "min". Do the same for the TV Show dataset, converting seasons to an integer.
df['duration']=df['duration'].str.replace('min','',regex=False)
mask=df['type']=='Movie'
df['movie_duration']=pd.to_numeric(df.loc[mask,'duration'])

df['duration']=df['duration'].str.replace('s','',regex=False)
df['duration']=df['duration'].str.replace('Season','',regex=False)
mask1=df['type']=='TV Show'
df['tv_show_duration']=pd.to_numeric(df.loc[mask1,'duration'])

ValueError: Unable to parse string "2 Seaon" at position 0

In [41]:
# 9. Handling Duplicates: Check if there are any duplicate rows in the dataset and remove them if they exist.
df[df.duplicated()] # not available

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,movie_duration


In [82]:
# 10. Standardizing Genres: The listed_in column often contains multiple genres in one string (e.g., "Dramas, International Movies"). Should you split these into a list or just use the primary genre for analysis?
df['listed_in']=df['listed_in'].str.split(',')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,movie_duration
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90,[Documentaries],"As her father nears the end of his life, filmm...",90.0
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seaon,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t...",
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Seaon,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...,
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,,2021-09-24,2021,TV-MA,1 Seaon,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo...",
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seaon,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a...",158.0
8803,s8804,TV Show,Zombie Dumb,Unknown,Unknown,,2019-07-01,2018,TV-Y7,2 Seaon,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g...",
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...,88.0
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero...",88.0


## Step 3: Gain Insights from Data (Analysis & Transformation)
    (1) In which year was the most content released? Find the top 5 years using the release_year column.
    (2) In which month was the most content added? (Use the date_added column).
    (3) Who are the top 5 directors with the most movies?
    (4) Who are the top 10 actors who have appeared in the most movies/shows?
    (5) Create a list of all Movies that were produced in India (country = 'India').
    (6) Feature Engineering (Time): From the date_added (datetime) column, create new columns for added_year, added_month,
     and added_day_of_week.
    (7) Old vs. New Content: What is the average time lag between the release_year and the date_added? This tells you how
     old content generally is when it arrives on Netflix.
    (8) Content by Decade: Create a new column called release_decade (e.g., 1990s, 2000s, 2010s). Which decade has produced
     the most content?
    (9) Longest Movies: After cleaning the duration column, find the top 10 longest movies on Netflix.
    (10) Most Seasons in TV Shows: Find the top 10 TV shows with the most seasons.
    (11) Director-Genre Specialization: Find the top director for the "Dramas" genre. Who is the top director for "Comedies"?
    (12) Actor-Director Collaboration: Which actor and director pair has collaborated the most?
    (13) Country-Specific Analysis: What is the most common genre for content produced in 'India'? What about in the 'United
     States'?
    (14) Content Targeting: Which rating is most common for Movies? Is it different for TV Shows?
    (15) Text Analysis: Analyze the description column. What are the most common words found in the descriptions of 'Horror'
     Movies'?

In [16]:
# 1. In which year was the most content released? Find the top 5 years using the release_year column.

In [17]:
# 2. In which month was the most content added? (Use the date_added column).

In [18]:
# 3. Who are the top 5 directors with the most movies?

In [19]:
# 4. Who are the top 10 actors who have appeared in the most movies/shows?

In [20]:
# 5. Create a list of all Movies that were produced in India (country = 'India').

In [None]:
# 6. Feature Engineering (Time): From the date_added (datetime) column, create new columns for added_year, added_month, and added_day_of_week.

In [None]:
# 7. Old vs. New Content: What is the average time lag between the release_year and the date_added? This tells you how old content generally is when it arrives on Netflix.

In [None]:
# 8. Content by Decade: Create a new column called release_decade (e.g., 1990s, 2000s, 2010s). Which decade has produced the most content?

In [None]:
# 9. Longest Movies: After cleaning the duration column, find the top 10 longest movies on Netflix.

In [None]:
# 10. Most Seasons in TV Shows: Find the top 10 TV shows with the most seasons.

In [None]:
# 11. Director-Genre Specialization: Find the top director for the "Dramas" genre. Who is the top director for "Comedies"?

In [None]:
# 12. Actor-Director Collaboration: Which actor and director pair has collaborated the most?

In [None]:
# 13. Country-Specific Analysis: What is the most common genre for content produced in 'India'? What about in the 'United States'?

In [None]:
# 14. Content Targeting: Which rating is most common for Movies? Is it different for TV Shows?

In [None]:
# 15. Text Analysis: Analyze the description column. What are the most common words found in the descriptions of 'Horror Movies'?

## Step 4: Visualize the Data (Plotting) 📊
    (1) Create a bar chart to show the count of Movies vs. TV Shows available on Netflix.
    (2) Create a histogram to show the distribution of release_year. (e.g., how many titles were released in the
     1990s, 2000s, etc.).
    (3) Create a bar chart showing the top 10 countries that produce the most content.
    (4) What is the proportion of different ratings (e.g., TV-MA, TV-14, R)? You can create a pie chart for this.
    (5) Create a line chart that shows the number of titles added each year (release_year). This will give you a trend
     of content growth over time.

     Comparisons & Rankings (Bar Charts)
        (11) Content Type Distribution: Create a bar chart comparing the total number of Movies vs. TV Shows.
        (6) Top Content-Producing Countries: Visualize the top 10 countries with the most content using a horizontal bar
         chart.
        (7) Most Common Genres: Create a bar chart showing the top 15 most common genres.
        (8) Peak Addition Months: Create a bar chart showing the number of titles added per month. Which are the busiest
         months?
        (9) Audience Ratings Distribution: Use a bar chart to show the count of content across different ratings.
        (10) Movies vs. TV Shows by Country: Create a stacked bar chart for the top 10 countries, showing the proportion
         of Movies vs. TV Shows within each country.

    Trends Over Time (Line Charts)
        (12) Content Growth Over Years: Plot a line chart showing the number of titles added to Netflix each year.
        (13) Separate Growth Trends: On a single graph, plot two lines: one for the growth of Movies and one for TV Shows
         added over time.
        (14) Cumulative Content Growth: Create a line chart showing the cumulative sum of content added over the years.

    Distributions (Histograms & Box Plots)
        (15) Release Year Distribution: Create a histogram to understand the distribution of content based on
         release_year.
        (16) Movie Duration Analysis: Create a histogram to show the distribution of movie durations (in minutes).
        (17) TV Show Season Analysis: Create a histogram to see the distribution of the number of seasons for TV shows.
        (18) Duration Outliers: Use a box plot to visualize the spread of movie durations and identify outliers.
        (19) Duration by Rating: Create box plots to compare movie durations across different ratings (e.g., 'R',
         'PG-13').

    Proportions & Text (Pie Charts & Word Clouds)
        (20) Content Type Share: Create a pie chart to show the percentage share of Movies vs. TV Shows.
        (21) Common Genre Keywords: Create a word cloud from the listed_in column to visually represent the most frequent
         genre tags.

In [21]:
# 1. Create a bar chart to show the count of Movies vs. TV Shows available on Netflix.

In [22]:
# 2. Create a histogram to show the distribution of release_year. (e.g., how many titles were released in the 1990s, 2000s, etc.).

In [23]:
# 3. Create a bar chart showing the top 10 countries that produce the most content.

In [24]:
# 4. What is the proportion of different ratings (e.g., TV-MA, TV-14, R)? You can create a pie chart for this.

In [25]:
# 5. Create a line chart that shows the number of titles added each year (release_year). This will give you a trend of content growth over time.

In [None]:
# 6. Top Content-Producing Countries: Visualize the top 10 countries with the most content using a horizontal bar chart.

In [None]:
# 7. Most Common Genres: Create a bar chart showing the top 15 most common

In [None]:
# 8. Peak Addition Months: Create a bar chart showing the number of titles added per month. Which are the busiest months?

In [None]:
# 9. Audience Ratings Distribution: Use a bar chart to show the count of content across different ratings.

In [None]:
# 10. Movies vs. TV Shows by Country: Create a stacked bar chart for the top 10 countries, showing the proportion of Movies vs. TV Shows within each country.

In [None]:
# 11. Content Type Distribution: Create a bar chart comparing the total number of Movies vs. TV Shows.

In [None]:
# 12. Content Growth Over Years: Plot a line chart showing the number of titles added to Netflix each year.

In [None]:
# 13. Separate Growth Trends: On a single graph, plot two lines: one for the growth of Movies and one for TV Shows added over time.

In [None]:
# 14. Cumulative Content Growth: Create a line chart showing the cumulative sum of content added over the years.

In [None]:
# 15. Release Year Distribution: Create a histogram to understand the distribution of content based on release_year.

In [None]:
# 16. Movie Duration Analysis: Create a histogram to show the distribution of movie durations (in minutes).

In [None]:
# 17. TV Show Season Analysis: Create a histogram to see the distribution of the number of seasons for TV shows.

In [None]:
# 18. Duration Outliers: Use a box plot to visualize the spread of movie durations and identify outliers.

In [None]:
# 19. Duration by Rating: Create box plots to compare movie durations across different ratings (e.g., 'R', 'PG-13').

In [None]:
# 20. Content Type Share: Create a pie chart to show the percentage share of Movies vs. TV Shows.