In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the movie dataset from a CSV file located in the working directory.
# NOTE(review): lineterminator='\n' restricts row breaks to a bare newline,
# presumably because some text fields contain other line-break characters — confirm.
df=pd.read_csv("Netflix_Movies_Database.csv",lineterminator='\n')
# Display the loaded frame as the cell output.
df

In [None]:
df.info()

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

# Data Preprocessing & Cleaning

### Exploration Summary
* We have a dataframe consisting of 9827 rows and 9 columns.
* Our dataset looks a bit tidy with no NaNs and no duplicated values.
* The Release_Date column needs to be cast to datetime, and then only the year value extracted.
* Overview, Original_Language and Poster_Url won't be useful during analysis, so we'll drop them.
* There are noticeable outliers in the Popularity column.
* Vote_Average should be categorised for proper analysis.
* The Genre column has comma-separated values with white space that need to be handled, and it should be cast to category.

### Casting Release Date Column to Datetime type

In [None]:
# Parse Release_Date into pandas datetimes so the .dt accessor can be used on it.
df['Release_Date']=pd.to_datetime(df['Release_Date'])
# Confirm the conversion by printing the column's new dtype.
print(df['Release_Date'].dtypes)

In [None]:
# Overwrite Release_Date with just the year component of each date.
# This cell is not re-runnable on its own: once it has run, the column holds
# plain years rather than datetimes, so the .dt accessor is no longer available.
df['Release_Date']=df['Release_Date'].dt.year
df

### Dropping the Columns

In [None]:
# Remove the three columns that the analysis does not use. inplace=True mutates
# `df` directly, so running this cell a second time raises KeyError because the
# columns are already gone.
df.drop(columns=['Overview','Original_Language','Poster_Url'],inplace=True)

In [None]:
df

### Categorizing Vote_Average into 4 Categories: `Popular` ,`Average` ,`Below_Avg`, `Not_Popular` for analysis

In [None]:
def categorize_col(df,col,labels):
    """Bucket a numeric column into quartile-based categories, in place.

    The bin edges are the column's min, 25%, 50%, 75% and max as reported by
    ``Series.describe()``, so each label covers one quartile range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[col]`` is overwritten with the categorical result.
    col : str
        Name of the numeric column to categorize.
    labels : list
        Category names ordered from the lowest bin to the highest bin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    If two of the edges coincide, ``duplicates='drop'`` removes the repeated
    edge, which leaves fewer bins than labels; pandas rejects that mismatch
    with a ``ValueError``.
    """
    # Compute the summary statistics once and pick the five edges from them.
    stats = df[col].describe()
    edges = [stats[key] for key in ('min', '25%', '50%', '75%', 'max')]
    # pd.cut bins are open on the left by default, which would leave rows equal
    # to the column minimum outside every bin (NaN). include_lowest=True closes
    # the first bin on the left so those rows receive the lowest label.
    df[col] = pd.cut(df[col], edges, labels=labels,
                     include_lowest=True, duplicates='drop')
    return df

In [None]:
# Labels are listed from the lowest bin to the highest, matching the ascending
# order of the edges built inside categorize_col.
labels=['Not_Popular','Below_Avg','Average','Popular']
# categorize_col overwrites df['Vote_Average'] and returns the same frame,
# which is displayed here as the cell output.
categorize_col(df,'Vote_Average',labels)

In [None]:
df['Vote_Average'].unique()

In [None]:
df['Vote_Average'].value_counts()

In [None]:
# Drop every row that has a missing value in any column. Any rows whose
# Vote_Average did not fall into a bin during categorization are NaN at this
# point, so they are removed here as well.
df.dropna(inplace=True)
# Verify that no missing values remain, column by column.
df.isna().sum()

### We'll split genres into a list and then explode our dataframe to have only one genre per row for each movie

In [None]:
# Turn each genre string into a list, splitting on the exact separator ", "
# (a comma followed by a single space).
df['Genre']=df['Genre'].str.split(', ')

In [None]:
# explode() emits one row per list element, copying the other columns, so from
# here on `df` has one row per (movie, genre) pair rather than one row per movie.
# reset_index(drop=True) replaces the repeated index labels with a fresh 0..n-1 range.
df=df.explode("Genre").reset_index(drop=True)

In [None]:
df.head(10)

### Casting Genre Column into category

In [None]:
df['Genre']=df['Genre'].astype("category")

In [None]:
df['Genre'].dtype

In [None]:
df.info()

# Data Analysis

In [None]:
sns.set_style("whitegrid")

### 1. What is the most frequent genre of movies released on Netflix?

In [None]:
df['Genre'].describe()

In [None]:
df['Genre'].value_counts()

In [None]:
# Horizontal count plot of genres; value_counts() sorts descending, so the
# bars run from the most common genre to the least common one.
sns.catplot(
    data=df,
    y='Genre',
    kind='count',
    order=df['Genre'].value_counts().index,
    color='#4287f5',
)
plt.title("Genre Distribution")
plt.show()

### 2. Which vote category dominates the Netflix movie dataset?

In [None]:
df.head()

In [None]:
# After the explode step `df` has one row per (movie, genre) pair, so counting
# its rows would weight every movie by its number of genres. Drop the Genre
# column and de-duplicate to get back to one row per movie before counting.
# NOTE(review): assumes no two distinct movies agree on every remaining column.
movies_once = df.drop(columns='Genre').drop_duplicates()
sns.catplot(data=movies_once, y='Vote_Average', kind='count',
            order=movies_once['Vote_Average'].value_counts().index,
            color='#4287f5')
plt.title("Votes Distribution")
plt.show()

### 3. What movie got the highest popularity? What's its genre?

In [None]:
# Select the title and genre of every row whose popularity equals the maximum.
df.loc[df['Popularity'].eq(df['Popularity'].max()), ['Title', 'Genre']]

#### Spider-Man: No Way Home, with genres Action, Adventure and Science Fiction, is the most popular.

### 4. What movie got the lowest popularity? What's its genre?

In [None]:
# Select the title and genre of every row whose popularity equals the minimum.
df.loc[df['Popularity'].eq(df['Popularity'].min()), ['Title', 'Genre']]

#### The United States vs. Billie Holiday and Threads, with genres Music, Drama and History and War, Drama and Science Fiction respectively, are the least popular.

### 5. In which year were the most movies released?

In [None]:
# After the explode step `df` has one row per (movie, genre) pair, so counting
# its rows directly would count a multi-genre movie several times. Drop the
# Genre column and de-duplicate to get back to one row per movie first.
# NOTE(review): assumes no two distinct movies agree on every remaining column.
movies_by_release = df.drop(columns='Genre').drop_duplicates()

# Count movies per release year. One bar per year is needed to answer
# "which year": a handful of wide histogram bins cannot single out a year.
releases_per_year = movies_by_release['Release_Date'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(releases_per_year.index, releases_per_year.values)
ax.set_title("Release Date Distribution")
ax.set_xlabel("Release year")
ax.set_ylabel("Number of movies")
plt.show()

# Year with the most releases (the earliest such year if several tie).
releases_per_year.idxmax()

#### 2020 was the year in which the most movies were released.