# Import the required packages and read the data and know about the general info

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw Netflix titles catalogue from the Kaggle input directory.
csv_path = '/kaggle/input/netflix-shows/netflix_titles.csv'
nd = pd.read_csv(csv_path)

In [None]:
# Summarise the freshly loaded frame: column names, dtypes and non-null counts.
nd.info()

In [None]:
# Preview the first rows of the raw data.
nd.head()

# Data Preprocessing

1. Cleaning directors by replacing with unknown
2. converting date_added to date format
3. splitting actors to list of values
4. Cleaning country by replacing with unknown
5. Filling date_added with the median date
6. Dropping further nan value data


This approximately gives a total of 8800 rows by eliminating 7 rows from the actual data

In [None]:
# Replace missing directors with the placeholder 'Unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form warns on pandas 2.x and leaves `nd`
# unchanged once Copy-on-Write is active.
nd['director'] = nd['director'].fillna('Unknown')

In [None]:
# Parse the textual date into datetime64.
# Strip surrounding whitespace first: pandas infers one format from the first
# value, so a single entry with a stray leading space would otherwise fail to parse.
nd['date_added'] = pd.to_datetime(nd['date_added'].str.strip())

In [None]:
# Build a list of actor names per title; a missing cast becomes an empty list.
nd['actors'] = [
    [] if pd.isna(cast_text) else cast_text.split(', ')
    for cast_text in nd['cast']
]

In [None]:
# The raw 'cast' string is superseded by the 'actors' list column; remove it in place.
del nd['cast']

In [None]:
# Re-check dtypes and non-null counts after the first cleaning steps.
nd.info()

In [None]:
# Replace missing countries with the placeholder 'Unknown'.
# Assigned back rather than filled in place on the column selection, which
# warns on pandas 2.x and is a no-op under Copy-on-Write.
nd['country'] = nd['country'].fillna('Unknown')

In [None]:
# Impute missing added-dates with the median date of the column.
# Assigned back rather than filled in place on the column selection, which
# warns on pandas 2.x and is a no-op under Copy-on-Write.
nd['date_added'] = nd['date_added'].fillna(nd['date_added'].median())

In [None]:
# Drop every row that still has a missing value in any column.
# Done in place: later cells add columns to this same `nd` object.
nd.dropna(inplace=True)

In [None]:
# Confirm the row count and that no nulls remain after cleaning.
nd.info()

In [None]:
# Preview the cleaned frame.
nd.head()

# Analysis on year and rating of movies and shows

1. Comparison between time chart of movie/shows released year and added into netflix
2. Finding the No. of movies/shows ratings by year added into netflix

In [None]:
# Derive the calendar year each title was added; later cells reuse this column.
nd['date_added_year'] = nd['date_added'].dt.year

# Left panel: titles by year added. Right panel: titles by release year (2000 onwards).
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

sns.countplot(data=nd, x='date_added_year', hue='type', palette='Set1', ax=axes[0])
axes[0].set_title('Count of Shows/Movies Added by Year')
axes[0].set_xlabel('Year of Addition')
axes[0].set_ylabel('Count')
# Rotate only the x tick labels; without axis='x' the y-axis labels are rotated too.
axes[0].tick_params(axis='x', rotation=45)

released_since_2000 = nd[nd['release_year'] >= 2000]
sns.countplot(data=released_since_2000, x='release_year', hue='type', palette='Set2', ax=axes[1])
axes[1].set_title('Count of Shows/Movies by Release Year')
axes[1].set_xlabel('Year of Release')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

The above charts give an overview of when movies/shows were released and when they were added to Netflix.
We can see that **Netflix started to add more movies/shows only after 2011**


In [None]:

# Count titles per (year added, rating) for 2011 onwards; missing pairs become 0.
added_since_2011 = nd[nd['date_added_year'] >= 2011]
heatmap_data = added_since_2011.pivot_table(
    values='type',
    index='date_added_year',
    columns='rating',
    aggfunc='count',
    fill_value=0,
)
print(heatmap_data)


In [None]:
# Heatmap of title counts: rows are years added, columns are ratings.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap='YlGnBu', cbar=True, ax=ax)
ax.set_title('No. of movies/shows ratings by year added')
ax.set_xlabel('Rating')
ax.set_ylabel('Year added')
plt.show()

From the above heat map we can infer **that TV-MA, TV-14 rated movies/shows are more available.**

# Analysis of Directors

Next we look into the director field, which is a compound (comma-separated) field and therefore needs to be exploded into one row per director
1. Finding the top 10 directors who produced more movies/tv shows
2. Finding the range of rating categories produced by those directors

In [None]:
# Split the comma-separated director string and emit one row per director.
# The split is done on a derived frame (assign) instead of overwriting
# nd['director']: overwriting made this cell fail on a second run, because
# .str.split on a column of lists yields NaN.
nd_exploded = nd.assign(director=nd['director'].str.split(',')).explode('director')
nd_exploded.info()

In [None]:
# Trim the whitespace left around names by the comma split.
# The vectorised .str accessor replaces the per-element lambda and passes
# missing values through instead of raising.
nd_exploded['director'] = nd_exploded['director'].str.strip()

In [None]:
# Share of all (title, director) rows attributed to each director.
director_names = nd_exploded['director']
directors = director_names.value_counts(normalize=True)
print(directors)

# Bar chart of the ten largest shares (the 'Unknown' placeholder is included).
top_director_share = directors.nlargest(10)
top_director_share.plot(kind='bar')

It is evident that the most frequent director value is **unknown, because of missing data**

In [None]:
# Names of the ten directors with the largest share, as a plain Python list.
top_ten_directors = directors.nlargest(10)
dir_list = list(top_ten_directors.index)

In [None]:
# Display the selected director names.
dir_list

In [None]:
# Keep only the rows whose director is one of the selected top ten.
by_top_director = nd_exploded['director'].isin(dir_list)
movies_by_dir = nd_exploded.loc[by_top_director]

In [None]:
# Director x rating table of title counts; combinations with no titles show 0.
mov_pt = movies_by_dir.pivot_table(
    values='date_added_year',
    index='director',
    columns='rating',
    aggfunc='count',
    fill_value=0,
)
mov_pt

This pivot table gives a detailed summary of how the titles of the top 10 directors are distributed across ratings. The majority of their work falls into the following rating categories:

*   TV-14
*   TV-G
*   TV-MA
*   TV-PG
*   TV-Y
*   TV-Y7







# Analysis about trend of movies and tv shows in netflix

In [None]:
# Number of titles per (year added, type), flattened into a tidy frame.
titles_per_year_type = nd.groupby(['date_added_year', 'type']).size()
trend_data = titles_per_year_type.reset_index(name='count')

In [None]:
# Scatter of yearly title counts, coloured by type (Movie vs TV Show).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=trend_data,
    x='date_added_year',
    y='count',
    hue='type',
    s=100,
    palette='Set1',
    ax=ax,
)
ax.set_title('Trend of Movies and TV Shows Year-wise')
ax.set_xlabel('Added Year')
ax.set_ylabel('Count')
ax.legend(title='Type', loc='upper left')
plt.show()

`Insights: `

---


In the above chart we can get the movies/shows trending year wise. **Now shows are getting added more**

# Country wise analysis

In [None]:
# Split the comma-separated country string and emit one row per country.
# The split is done on a derived frame (assign) instead of overwriting
# nd['country']: overwriting made this cell fail on a second run, because
# .str.split on a column of lists yields NaN.
nd_cntry_exp = nd.assign(country=nd['country'].str.split(',')).explode('country')

In [None]:
# Trim the whitespace left around names by the comma split (vectorised, NaN-safe).
nd_cntry_exp['country'] = nd_cntry_exp['country'].str.strip()
# A leading or trailing comma in the raw field splits into an empty token;
# drop those so '' is never counted as a country.
nd_cntry_exp = nd_cntry_exp[nd_cntry_exp['country'] != '']


In [None]:
# Preview the one-row-per-country frame.
nd_cntry_exp.head()

In [None]:
# Number of titles per (country, type), flattened into a tidy frame.
titles_per_country_type = nd_cntry_exp.groupby(['country', 'type']).size()
nd_cntry = titles_per_country_type.reset_index(name='count')

In [None]:
# Pick the ten countries with the most titles overall, then keep BOTH their
# Movie and TV Show rows. Taking nlargest over (country, type) rows instead
# returns the ten largest pairs, so some countries lose one of their two bars
# in the grouped chart.
top_countries = nd_cntry.groupby('country')['count'].sum().nlargest(10).index
cntry_bar_data = (
    nd_cntry[nd_cntry['country'].isin(top_countries)]
    .sort_values('count', ascending=False)
)
print(cntry_bar_data)

In [None]:
# Grouped bars: title count per country, split by type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=cntry_bar_data, x='country', y='count', hue="type", ax=ax)

It is clearly seen from the above chart that **US has produced more movies/tv shows compared with other countries**

In [None]:
# Ten most frequent countries; value_counts is already sorted in descending order.
country_counts = nd_cntry_exp['country'].value_counts()
cntry_data = country_counts.iloc[:10]

In [None]:
# Country names of the top ten, as a plain Python list.
labels = list(cntry_data.index)

In [None]:
# Bar chart of the ten most frequent countries.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=cntry_data.index, y=cntry_data.values, ax=ax)
ax.set(title='Top 10 Countries by Count', xlabel='Country', ylabel='Count')
plt.xticks(rotation=45)

In [None]:
# Rows of the per-country frame that belong to the United States.
is_us = nd_cntry_exp["country"] == "United States"
us_data = nd_cntry_exp.loc[is_us]

In [None]:
# Detach from the parent frame (copy() is deep by default).
us_data = us_data.copy()

In [None]:
# One row per (US title, actor); titles with an empty actor list yield a NaN actor.
us_data = us_data.explode('actors')


In [None]:
# Ten most frequent actors among US titles, then the rows featuring them.
us_actor_share = us_data["actors"].value_counts(normalize=True)
top_10_us_actors = us_actor_share.head(10).index.to_list()

features_top_actor = us_data["actors"].isin(top_10_us_actors)
top_10_us_data = us_data[features_top_actor]

In [None]:
# Display the selected US actor names.
top_10_us_actors

In [None]:
# Actor x type table of title counts for the top US actors; empty pairs show 0.
us_data_pvt = top_10_us_data.pivot_table(
    values="rating",
    index="actors",
    columns="type",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# One box per pivot column (type), showing the spread of per-actor title counts.
# The frame is passed as data= explicitly: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases.
sns.boxplot(data=us_data_pvt)

Insights:

---



**US is producing more movies/shows** compared with other countries

# Analysis about actors

In [None]:
# Re-check the columns and dtypes of the main frame before the actor analysis.
nd.info()

In [None]:
# One row per (title, actor); titles with an empty actor list yield a NaN actor.
act_data = nd.explode('actors')

In [None]:
# Titles without cast information get the placeholder actor 'Unknown'.
# Assigned back rather than filled in place on the column selection, which
# warns on pandas 2.x and is a no-op under Copy-on-Write.
act_data['actors'] = act_data['actors'].fillna('Unknown')

In [None]:
# Trim stray whitespace around actor names.
# The vectorised .str accessor replaces the per-element lambda and passes
# missing values through instead of raising.
act_data['actors'] = act_data['actors'].str.strip()

In [None]:
# Inspect the exploded actor frame (row count, dtypes, non-null counts).
act_data.info()

In [None]:
# Appearance count per actor, sorted from most to least frequent.
# NOTE(review): the 'Unknown' placeholder is counted like a real actor here.
actor_counts = act_data['actors'].value_counts()
top_10_actors = actor_counts.head(10).index.to_list()
last_10_actors = actor_counts.tail(10).index.to_list()

In [None]:
# Display the ten most frequent actor names.
top_10_actors

In [None]:
# Display the ten actor names at the tail of the frequency ranking.
last_10_actors

In [None]:
# Rows featuring the most frequent and the least frequent actors respectively.
in_top_10 = act_data['actors'].isin(top_10_actors)
in_last_10 = act_data['actors'].isin(last_10_actors)
top10_act_data = act_data.loc[in_top_10]
last_10_act_data = act_data.loc[in_last_10]

In [None]:
# Title counts per (actor, type) for both actor groups.
actor_type_keys = ['actors', 'type']
grp_data = top10_act_data.groupby(actor_type_keys).size().reset_index(name='count')
last_grp_data = last_10_act_data.groupby(actor_type_keys).size().reset_index(name='count')

In [None]:
# Grouped bars: title count per top actor, split by type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=grp_data, x='actors', y='count', hue='type', ax=ax)
plt.xticks(rotation=90)

The above chart shows that **the most frequently appearing actors are mostly involved in movies** rather than TV shows

In [None]:
# Grouped bars: title count per least-frequent actor, split by type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=last_grp_data, x='actors', y='count', hue='type', ax=ax)
plt.xticks(rotation=90)

In [None]:
# Actor x year-added table of title counts for the top actors; empty pairs show 0.
pivot_table = top10_act_data.pivot_table(
    values='type',
    index='actors',
    columns='date_added_year',
    aggfunc='count',
    fill_value=0,
)

In [None]:
# Show the actor x year count table as plain text.
print(pivot_table)

In [None]:
# Annotated heatmap of the actor x year counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_table,
    cmap='coolwarm',
    annot=True,
    fmt='d',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Counts of movies done by  Actors by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Actors')
plt.xticks(rotation=45)
plt.show()

The above heatmap shows how the titles of the top 10 actors are spread over the years in which they were added


# Analysis of TV shows

In [None]:
# Independent copy of the TV Show rows, so later edits never touch `nd`.
is_tv_show = nd['type'] == 'TV Show'
tv_show_data = nd.loc[is_tv_show].copy()

In [None]:
# Number of TV shows per distinct 'duration' value, most frequent first.
show_data = tv_show_data['duration'].value_counts()

In [None]:
# Bar chart of how many TV shows have each duration value.
fig, ax = plt.subplots(figsize=(10, 6))
show_data.plot(kind='bar', ax=ax)
ax.set_xlabel('Duration of TV shows')
ax.set_ylabel('Count')
ax.set_title("Count of TV show's duration")

TV shows with only one season are the most common

In [None]:
# Turn the comma-separated genre string into a list of genres per show.
genre_lists = tv_show_data["listed_in"].str.split(",")
tv_show_data = tv_show_data.assign(listed_in=genre_lists)

In [None]:
# One row per (show, genre); the original row index is repeated for each genre.
tv_show_data = tv_show_data.explode('listed_in')

In [None]:
# Inspect the per-genre TV show frame.
tv_show_data.info()

In [None]:
# Trim the whitespace left around genre names by the comma split.
# The vectorised .str accessor replaces the per-element lambda and passes
# missing values through instead of raising.
tv_show_data["listed_in"] = tv_show_data["listed_in"].str.strip()

In [None]:
# Number of TV shows per genre.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=tv_show_data, x="listed_in", ax=ax)
plt.xticks(rotation=90)
ax.set_title("TV Shows categories")

In [None]:
# Duration x rating table of TV show counts.
# tv_show_data holds one row per (show, genre) at this point, so counting its
# rows directly would count every show once per genre. explode() repeats the
# original row index, so dropping duplicate index values restores one row per show.
shows_once = tv_show_data[~tv_show_data.index.duplicated(keep='first')]
pivot_tv_show = shows_once.pivot_table(
    index="duration",
    columns="rating",
    values="listed_in",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# Show the duration x rating count table as plain text.
print(pivot_tv_show)

In [None]:
# Annotated heatmap of the duration x rating counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_tv_show,
    cmap='YlGnBu',
    annot=True,
    fmt='d',
    linewidths=0.5,
    ax=ax,
)

In [None]:
# One row per (show, actor).
# The frame is still one row per (show, genre) here; exploding actors on top of
# that would create a genre x actor cross product and inflate every actor's
# count by the number of genres of the show. Collapse back to one row per show
# first (explode() repeated the original index), then explode the actor lists.
# Only the first genre of each show survives in 'listed_in'; the cells below
# do not read that column.
tv_show_data = tv_show_data[~tv_show_data.index.duplicated(keep='first')].explode('actors')

In [None]:
# Inspect the per-actor TV show frame.
tv_show_data.info()

In [None]:
# Ten actors with the most TV show rows; value_counts sorts in descending order.
show_actor_counts = tv_show_data["actors"].value_counts()
top_10_show_act = list(show_actor_counts.index[:10])

In [None]:
# Keep only the rows featuring one of the top TV show actors.
features_top_actor = tv_show_data["actors"].isin(top_10_show_act)
filtered_tv_data = tv_show_data.loc[features_top_actor]

In [None]:
# Actor x duration table of row counts for the top TV show actors; empty pairs show 0.
piv_tab = filtered_tv_data.pivot_table(
    values="rating",
    index="actors",
    columns="duration",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# Show the actor x duration count table as plain text.
print(piv_tab)

In [None]:
# One box per duration column, showing the spread of per-actor counts.
plt.figure(figsize=(10,6))
# The frame is passed as data= explicitly: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases.
sns.boxplot(data=piv_tab)
plt.title("Top 10 tv show actors duration")

In [None]:
# Annotated heatmap of the actor x duration counts.
plt.figure(figsize=(10,6))
# fmt='d' prints the counts as plain integers, matching the other count
# heatmaps in this notebook (the default format abbreviates larger numbers).
sns.heatmap(piv_tab, cmap="coolwarm", annot=True, fmt='d')
plt.title("Top 10 actors pattern")

# Insights from the above analysis

From the above analysis following insights were made

1. TV shows are gaining popularity on Netflix, which can be seen in the movies vs. shows trend data

2. Directors are producing more TV-MA-rated movies/shows in the recent years
3. US is producing more TV Shows/Movies when compared with other countries
4. Top-rated actors are choosing movies over TV Shows
5. TV Shows with only one season are the most common

But developing countries like India have not invested any ideas in TV Shows. So it will be the right time to produce more shows from developing countries as it is in trend.