In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("netflix_titles.csv",encoding='latin1')

## Step 1: Understand the Data (Exploration)
    (1) How many total rows and columns are in the dataset?
    (2) What is the data type of each column (e.g., number, text, date)?
    (3) Are there any missing values in any column? If yes, which column has the most missing values?
    (4) Is there more content categorized as "Movie" or "TV Show" on Netflix? Show the count for each.
    (5) Initial Preview: Display the first 10 and last 10 rows of the dataset to see how the data is recorded.
    (6) Unique Values: How many unique categories are there in columns like type, rating, and country?
    (7) Value Counts: What are the counts of each category within the type column (i.e., how many Movies vs. TV
     Shows)? What are the top 10 most frequent values in country, director, and listed_in (genre)?
    (8) Numerical Summary: What are the basic descriptive statistics (mean, median, standard deviation, min, max) for numerical columns like release_year?
    (9) Timespan of Data: What is the earliest and latest release_year in the dataset? What about the date_added?

In [4]:
# 1. How many total rows and columns are in the dataset?
print(f"Total rows = {df.shape[0]} , column = {df.shape[1]}")

Total rows = 8807 , column = 12


In [5]:
# 2. What is the data type of each column (e.g., number, text, date)?
print(df.dtypes)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [6]:
# 3. Are there any missing values in any column? If yes, which column has the most missing values?

# print(f"Most missing values column = {df.isnull().sum().index[np.argmax(df.isnull().sum().values)]}")

# best way
print(f"Most missing values column = {df.isnull().sum().idxmax()}")

Most missing values column = director


In [7]:
# 4. Is there more content categorized as "Movie" or "TV Show" on Netflix? Show the count for each.
print(f"there more content categorized as = {df.groupby('type')['show_id'].count().idxmax()}")
print(f"count for each = {df.groupby('type')['show_id'].count()}")

there more content categorized as = Movie
count for each = type
Movie      6131
TV Show    2676
Name: show_id, dtype: int64


In [26]:
# 5. Initial Preview: Display the first 10 and last 10 rows of the dataset to see how the data is recorded.
print(f"first 10 = \n {df.head(10)}")
print(f"lst 10 = \n {df.tail(10)}")

first 10 = 
   show_id     type                             title  \
0      s1    Movie              Dick Johnson Is Dead   
1      s2  TV Show                     Blood & Water   
2      s3  TV Show                         Ganglands   
3      s4  TV Show             Jailbirds New Orleans   
4      s5  TV Show                      Kota Factory   
5      s6  TV Show                     Midnight Mass   
6      s7    Movie  My Little Pony: A New Generation   
7      s8    Movie                           Sankofa   
8      s9  TV Show     The Great British Baking Show   
9     s10    Movie                      The Starling   

                         director  \
0                 Kirsten Johnson   
1                             NaN   
2                 Julien Leclercq   
3                             NaN   
4                             NaN   
5                   Mike Flanagan   
6  Robert Cullen, JosÃ© Luis Ucha   
7                    Haile Gerima   
8                 Andy Devonshire   


In [33]:
# 6. Unique Values: How many unique categories are there in columns like type, rating, and country?

# print(f"type = {df['type'].unique().size}")
# print(f"rating = {df['rating'].unique().size}")
# print(f"country = {df['country'].unique().size}")

# Best Way
print(df[['type','rating','country']].nunique())

type         2
rating      17
country    748
dtype: int64


In [46]:
# 7. Value Counts: What are the counts of each category within the type column (i.e., how many Movies vs. TV Shows)? What are the top 10 most frequent values in country, director, and listed_in (genre)?

# print(f"count for each = {df.groupby('type')['show_id'].count()}")
# print(f"country = {df.groupby('country')['show_id'].count().sort_values(ascending=False).head(10)}")
# print(f"director = {df.groupby('director')['show_id'].count().sort_values(ascending=False).head(10)}")
# print(f'listed in = {df.groupby('listed_in')['show_id'].count().sort_values(ascending=False).head(10)}')

# best way
print("--- Type Counts ---")
print(df['type'].value_counts())
print("\n" + "="*30 + "\n") # For clean separation

print("--- Top 10 Countries ---")
print(df['country'].value_counts().head(10))
print("\n" + "="*30 + "\n")

print("--- Top 10 Directors ---")
print(df['director'].value_counts().head(10))
print("\n" + "="*30 + "\n")

print("--- Top 10 Genres (listed_in) ---")
print(df['listed_in'].value_counts().head(10))

--- Type Counts ---
type
Movie      6131
TV Show    2676
Name: count, dtype: int64


--- Top 10 Countries ---
country
United States     2818
India              972
United Kingdom     419
Japan              245
South Korea        199
Canada             181
Spain              145
France             124
Mexico             110
Egypt              106
Name: count, dtype: int64


--- Top 10 Directors ---
director
Rajiv Chilaka              19
RaÃºl Campos, Jan Suter    18
Suhas Kadav                16
Marcus Raboy               16
Jay Karas                  14
Cathy Garcia-Molina        13
Martin Scorsese            12
Youssef Chahine            12
Jay Chapman                12
Steven Spielberg           11
Name: count, dtype: int64


--- Top 10 Genres (listed_in) ---
listed_in
Dramas, International Movies                        362
Documentaries                                       359
Stand-Up Comedy                                     334
Comedies, Dramas, International Movies            

In [48]:
# 8. Numerical Summary: What are the basic descriptive statistics (mean, median, standard deviation, min, max) for numerical columns like release_year?
print(df.describe())

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000


In [58]:
# 9. Timespan of Data: What is the earliest and latest release_year in the dataset? What about the date_added?
# print(df['release_year'].max())
# print(df['release_year'].min())
# df['date_added']=pd.to_datetime(df['date_added'].str.strip())
print(df['date_added'].max())
print(df['date_added'].min())

2021-09-25 00:00:00
2008-01-01 00:00:00



## Step 2: Clean the Data (Cleaning & Preprocessing)
    (1) How would you handle the missing values in the date_added column? Would you remove them or fill them with a
     default value?
    (2) The rating column also has missing values. Fill them with the most frequently occurring rating (the mode).
    (3)The cast and director columns also have null values. For now, you can leave them, but think about how you might
     handle them.

In [13]:
# 1. How would you handle the missing values in the date_added column? Would you remove them or fill them with a default value?

In [14]:
# 2. The rating column also has missing values. Fill them with the most frequently occurring rating (the mode).

In [15]:
# 3. The cast and director columns also have null values. For now, you can leave them, but think about how you might handle them.

## Step 3: Gain Insights from Data (Analysis & Transformation)
    (1) In which year was the most content released? Find the top 5 years using the release_year column.
    (2) In which month was the most content added? (Use the date_added column).
    (3) Who are the top 5 directors with the most movies?
    (4) Who are the top 10 actors who have appeared in the most movies/shows?
    (5) Create a list of all Movies that were produced in India (country = 'India').

In [16]:
# 1. In which year was the most content released? Find the top 5 years using the release_year column.

In [17]:
# 2. In which month was the most content added? (Use the date_added column).

In [18]:
# 3. Who are the top 5 directors with the most movies?

In [19]:
# 4. Who are the top 10 actors who have appeared in the most movies/shows?

In [20]:
# 5. Create a list of all Movies that were produced in India (country = 'India').

## Step 4: Visualize the Data (Plotting) 📊
    (1) Create a bar chart to show the count of Movies vs. TV Shows available on Netflix.
    (2) Create a histogram to show the distribution of release_year. (e.g., how many titles were released in the
     1990s, 2000s, etc.).
    (3) Create a bar chart showing the top 10 countries that produce the most content.
    (4) What is the proportion of different ratings (e.g., TV-MA, TV-14, R)? You can create a pie chart for this.
    (5) Create a line chart that shows the number of titles added each year (release_year). This will give you a trend
     of content growth over time.

In [21]:
# 1. Create a bar chart to show the count of Movies vs. TV Shows available on Netflix.

In [22]:
# 2. Create a histogram to show the distribution of release_year. (e.g., how many titles were released in the 1990s, 2000s, etc.).

In [23]:
# 3. Create a bar chart showing the top 10 countries that produce the most content.

In [24]:
# 4. What is the proportion of different ratings (e.g., TV-MA, TV-14, R)? You can create a pie chart for this.

In [25]:
# 5. Create a line chart that shows the number of titles added each year (release_year). This will give you a trend of content growth over time.