In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

## Let's start exploring the data

#### We are going to examine how Netflix has been doing so far in terms of movies and TV shows, up to 2023

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Netflix titles CSV into the working DataFrame, decoding the file as Latin-1.
NETFLIX_CSV_PATH = '/kaggle/input/netflix-movies-and-tv-shows/netflix_titles.csv'
df = pd.read_csv(NETFLIX_CSV_PATH, encoding='latin1')

In [None]:
# Preview the first rows of the loaded DataFrame to check that the columns parsed as expected.
df.head()

In [None]:
# Descriptive statistics for the DataFrame. With default arguments pandas
# summarises only the numeric columns of a mixed-dtype frame.
df.describe()

## Is there missing Data?

In [None]:
# Print each column's dtype and non-null count; a non-null count below the
# total number of rows indicates missing values in that column.
df.info()

## Data Visualization Phase

#### We can clearly see that the number of movie titles on Netflix exceeds the number of TV shows. However, we have to take into account that this only counts titles by content type (TV shows have episodes, so they engage viewers for a longer time)






In [None]:
# Distribution of Titles by Type
# Count how many titles fall under each content type and draw one bar per type.
type_counts = df['type'].value_counts()

plt.figure(figsize=(8, 6))
bars = plt.bar(type_counts.index, type_counts.values, color=['blue', 'green'])
plt.title('Distribution of Titles by Type')
plt.ylabel('Count')
plt.xlabel('Type')

# Write the rounded bar height just above the top-centre of every bar.
for rect in bars:
    height = rect.get_height()
    label_x = rect.get_x() + rect.get_width() / 2
    plt.text(label_x, height, round(height), ha='center', va='bottom')

plt.show()

#### The USA and India dominate the chart. We can assume that the populations of these countries have a stronger appetite for movies and TV shows.

In [None]:
# Top 10 countries with most productions
# NOTE(review): assumes titles made in several countries store them as one
# comma-separated string in 'country' - confirm against the data. Each listed
# country is credited separately, so a co-production counts once per country
# instead of forming its own combined "country" label. Values without a comma
# are counted unchanged; missing values and empty tokens are ignored.
country_names = (
    df['country']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
top_countries = country_names[country_names != ''].value_counts().head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis')
plt.xlabel('Count')
plt.ylabel('Country')
plt.title('Top 10 Countries with Most Productions')
plt.show()

#### Recently, Netflix has been trying to strategize and keep up with emerging platforms such as YouTube, which we can infer from the decline in the bars below towards 2020. Can they keep up? Only time will tell.

In [None]:
# Release year distribution
# Histogram of the 'release_year' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['release_year'], bins=30, color='orange', edgecolor='black')
ax.set_title('Release Year Distribution')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
plt.show()

#### The platform's catalogue is dominated by TV-MA content. However, content aimed at a younger audience (TV-14) is also remarkably common. Perhaps this could be a point of debate regarding viewer discretion when engaging with the platform.

In [None]:
# Rating distribution
# One bar per rating value, ordered from the most to the least frequent rating.
rating_order = df['rating'].value_counts().index

plt.figure(figsize=(10, 6))
sns.countplot(x='rating', data=df, order=rating_order, palette='magma')
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.xticks(rotation=90)  # rating labels are drawn vertically
plt.show()

In [None]:
# Duration distribution for movies
# Extract the first run of digits from the 'duration' text of every movie row.
# - The pattern is a raw string so that "\d" is not an invalid string escape.
# - expand=False makes str.extract return a Series (one value per movie)
#   rather than a one-column DataFrame.
# - dropna() removes rows whose duration is missing or contains no digits, so
#   only real numbers reach the histogram.
movie_durations = (
    df.loc[df['type'] == 'Movie', 'duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .dropna()
)

plt.figure(figsize=(10, 6))
plt.hist(movie_durations, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Duration (minutes)')
plt.ylabel('Count')
plt.title('Duration Distribution for Movies')
plt.show()

#### TV shows that have around 10 seasons seem to be a favorite among Netflix viewers. Binge-worthy TV shows like "Friends" are examples that fall into this category.

In [None]:
# Extracting season count from TV shows
# Work on an explicit copy so that adding a column does not write into a
# filtered view of `df` (chained assignment).
tv_shows = df[df['type'] == 'TV Show'].copy()

# The first run of digits in 'duration' is taken as the number of seasons.
# The pattern is a raw string so that "\d" is not an invalid string escape;
# expand=False returns a Series that can be assigned as a single column.
tv_shows['season_count'] = (
    tv_shows['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)

# Number of TV shows for each season count, in ascending season order.
# The bar height is a count of titles; rows without a parsable season count
# are left out.
season_totals = (
    tv_shows['season_count']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()
)

plt.figure(figsize=(10, 6))
# Season counts are passed as strings so they are treated as categories and
# keep the sorted order above.
sns.barplot(x=season_totals.index.astype(str), y=season_totals.values, palette='coolwarm')
plt.xlabel('Season Count')
plt.ylabel('Number of TV Shows')
plt.title('Number of TV Shows by Season Count')
plt.show()

#### Comparison of the directors credited on Netflix content

In [None]:
# Top 10 directors by title count
# NOTE(review): assumes co-directed titles store their directors as one
# comma-separated string in 'director' - confirm against the data. Each listed
# name is credited separately, so a directing team is not counted as a single
# combined "director". Values without a comma are counted unchanged; missing
# values and empty tokens are ignored.
director_names = (
    df['director']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
top_directors = director_names[director_names != ''].value_counts().head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_directors.values, y=top_directors.index, palette='Set2')
plt.xlabel('Count')
plt.ylabel('Director')
plt.title('Top 10 Directors by Title Count')
plt.show()

#### Korean and other binge-worthy dramas, international movies, comedy and action have been keeping Netflix viewers occupied. However, this graph also shows that Netflix could add more content in emerging genres such as international TV shows.

In [None]:
from collections import Counter

# Splitting and counting genres
# Each 'listed_in' value is split on commas and every piece is stripped of
# surrounding whitespace. The vectorised string methods skip missing entries,
# whereas calling x.split on a NaN (a float) would raise AttributeError.
genres = (
    df['listed_in']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .tolist()
)
# Empty tokens (e.g. from a trailing comma) are not counted as a genre.
genre_counts = Counter(genre for genre in genres if genre)

# barh draws its first entry at the bottom, so sort ascending to put the most
# common genre at the top of the chart.
genre_totals = pd.Series(genre_counts, dtype='int64').sort_values()

plt.figure(figsize=(8, 12))
plt.barh(genre_totals.index, genre_totals.values, color='salmon')
plt.xlabel('Count')
plt.ylabel('Genre')
plt.title('Distribution of Genres')
plt.show()