In [None]:
# Data handling and static plotting libraries.
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

import plotly.offline as pyo

# Switch plotly's offline module into notebook mode before any figure is shown.
pyo.init_notebook_mode()

import plotly.express as px

import plotly.io as pio
# Use the 'notebook_connected' renderer by default for plotly figures.
pio.renderers.default = 'notebook_connected'

# IPython magic: display matplotlib figures inline in the notebook output.
%matplotlib inline

Importing Data

In [None]:
# Load the streaming-movies dataset into a DataFrame.
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as '\C' or '\P'.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to a configurable data directory.
df = pd.read_csv(r'D:\Coding\Fuul Stack Data Scientist\Project 1 - Top Movie Streaming\moviestreams.csv')

# Preview the first rows.
df.head()

In [None]:
# Keep the column labels as a plain Python list for quick reference.
column_index = df.columns
cols = column_index.tolist()

cols

Cleaning Data

1. Dropping unnecessary columns

In [None]:
# Drop the two columns this notebook treats as unnecessary.
# Re-binding `df` (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: running it a second time no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')

2. Checking missing values

In [None]:
# Count the missing entries in every column.
df.isnull().sum()

In [None]:
# Derive a numeric 'Age Group' column from the 'Age' labels.

# Remove '+' from the labels. regex=False treats '+' as a literal character;
# the default of `regex` differs between pandas versions and '+' on its own
# is a regex metacharacter, so the intent is spelled out explicitly.
age_labels = df['Age'].str.replace('+', '', regex=False)

# Map the label 'all' (any casing) to '0'. This is Series.replace, so only
# entries that are exactly 'all' after lower-casing are swapped.
age_labels = age_labels.str.lower().replace('all', '0')

# Store the result as floats; missing labels stay NaN.
df['Age Group'] = age_labels.astype(float)

df['Age Group']

In [None]:
# Remove the percent sign from the Rotten Tomatoes text and keep the score
# as a float column.
rt_text = df['Rotten Tomatoes'].str.replace('%', '')
df['New_Rotten_Tomatoes'] = rt_text.astype(float)

df['New_Rotten_Tomatoes']

Visualisations

1. Number of movies for each Age Group

In [None]:
# Number of titles for each numeric age group.
age_group_column = df['Age Group']
age_group_column.value_counts()

2. Top 10 languages in Streaming Services

In [None]:
# Number of titles for each value of the 'Language' column.
df['Language'].value_counts()

In [None]:
# Ten most frequent values of the 'Language' column.
language = df['Language'].value_counts().head(10)

# Draw them as a seaborn bar chart on an explicitly created 15x8 axes.
lang_fig, lang_ax = plt.subplots(figsize=(15, 8))
lang_ax.set_title('Top 10 Languages in Streaming Services')

sns.barplot(x=language.index, y=language.values, ax=lang_ax)

In [None]:
# Pie chart of the same top-10 language counts computed in the previous cell.
pie_options = dict(
    names=language.index,
    values=language.values,
    title='Top 10 Languages in Streaming Services',
    height=600,
)
fig = px.pie(df, **pie_options)

fig.show()

3. Number of Movies in specific age group in All services

In [None]:
# Count titles per 'Age' label once and reuse the result for the bar
# heights and the bar labels.
all_age_counts = df['Age'].value_counts()

fig = px.bar(
    df,
    x=all_age_counts.index,
    y=all_age_counts,
    text=all_age_counts,
    title='Number of Movies in specific age group in All services',
    height=600,
)

# Show each count above its bar, formatted with d3's '.2s' specifier.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()

4. Number of Movies in specific age group in Netflix

In [None]:
# Rows flagged as available on Netflix.
netflix_df = df.loc[df['Netflix'].eq(1)]

# Titles per 'Age' label within that subset.
netflix_age_counts = netflix_df['Age'].value_counts()

fig = px.bar(
    netflix_df,
    x=netflix_age_counts.index,
    y=netflix_age_counts,
    text=netflix_age_counts,
    title='Number of Movies in specific age group in Netflix services',
    height=600,
)

# Show each count above its bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()

5. Number of Movies in specific age group in Prime Video

In [None]:
# Rows flagged as available on Prime Video.
prime_df = df.loc[df['Prime Video'].eq(1)]

# Titles per 'Age' label within that subset.
prime_age_counts = prime_df['Age'].value_counts()

fig = px.bar(
    prime_df,
    x=prime_age_counts.index,
    y=prime_age_counts,
    text=prime_age_counts,
    title='Number of Movies in specific age group in Prime Video services',
    height=600,
)

# Show each count above its bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()

6. Number of Movies in specific age group in Disney+

In [None]:
# Rows flagged as available on Disney+.
disney_df = df[df['Disney+']==1]

# Titles per 'Age' label within that subset, computed once and reused.
disney_age_counts = disney_df['Age'].value_counts()

# The title previously said "Prime Video" (copy-paste slip); it now names
# the service that is actually plotted.
fig = px.bar(disney_df,
    y=disney_age_counts,
    x=disney_age_counts.index,
    title='Number of Movies in specific age group in Disney+ services',
    text=disney_age_counts,
    height=600)

# Show each count above its bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

7. Number of Movies in specific age group in Hulu

In [None]:
# Rows flagged as available on Hulu.
hulu_df = df[df['Hulu']==1]

# Titles per 'Age' label within that subset, computed once and reused.
hulu_age_counts = hulu_df['Age'].value_counts()

# The title previously said "Prime Video" (copy-paste slip); it now names
# the service that is actually plotted.
fig = px.bar(hulu_df,
    y=hulu_age_counts,
    x=hulu_age_counts.index,
    title='Number of Movies in specific age group in Hulu services',
    text=hulu_age_counts,
    height=600)

# Show each count above its bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

7. Overall Rotten Tomato ratings for all services

In [None]:
# How many titles carry each Rotten Tomatoes rating label, across all services.
rt_label_counts = df['Rotten Tomatoes'].value_counts()

fig = px.bar(
    df,
    x=rt_label_counts.index,
    y=rt_label_counts,
    text=rt_label_counts,
    title='Overall Rotten Tomato Ratings',
    height=600,
)

# Show each count above its bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()

8. Rotten Tomato ratings for each service

In [None]:
# Number of Netflix titles that have a Rotten Tomatoes rating
# (missing ratings are not counted).
netflix_df['Rotten Tomatoes'].notna().sum()

In [None]:
# Number of titles rated exactly '100%' on Rotten Tomatoes, per service.
#
# The '100%' label is counted explicitly. The earlier value_counts()[0]
#   * returned the count of whichever rating happened to be most frequent
#     in that service, which is not necessarily '100%', and
#   * used positional [] access on a label-indexed Series, which recent
#     pandas versions deprecate.
# A service with no '100%' titles now yields 0 instead of a misleading count.
service_frames = {
    'Netflix': netflix_df,
    'Prime Video': prime_df,
    'Disney+': disney_df,
    'Hulu': hulu_df,
}

rt_scores = pd.DataFrame({
    'Streaming Service': list(service_frames),
    'Rotten Tomatoes Score': [
        (frame['Rotten Tomatoes'] == '100%').sum()
        for frame in service_frames.values()
    ],
})

rt_scores.head()

In [None]:
# Bar chart of the per-service values stored in rt_scores.
service_names = rt_scores['Streaming Service']
service_scores = rt_scores['Rotten Tomatoes Score']

fig = px.bar(
    rt_scores,
    x=service_names,
    y=service_scores,
    text=service_scores,
    title='100% Rotten Tomato Ratings for each service',
    height=600,
)

# Purple bars with the value printed above each one.
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2s}',
    marker_color='purple',
)
fig.show()

9. Overall IMDb rating for all services

In [None]:
# How many titles have each IMDb rating value, across all services.
imdb_value_counts = df['IMDb'].value_counts()

fig = px.bar(
    df,
    x=imdb_value_counts.index,
    y=imdb_value_counts,
    text=imdb_value_counts,
    title='Overall IMDb Ratings',
    height=600,
)

# Show each count above its bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()

10. Count of Runtime of Movies

In [None]:
# Ten most frequent runtimes and the number of titles with each.
#
# .head(10) is always positional. The earlier `[:10]` slice is label-based
# when the Series index is float-typed, so it would select "up to label 10"
# (or raise) instead of the first ten rows.
# NOTE(review): assumes 'Runtime' may be stored as float — confirm the dtype.
top_runtimes = (
    df['Runtime']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

RuntimeCount = pd.DataFrame({
    'Runtime': top_runtimes.index,
    'Count': top_runtimes.values,
})

RuntimeCount

In [None]:
# Bar chart of the most common runtimes held in RuntimeCount.
# Two copy-paste slips are corrected here:
#   * the title said 'Overall IMDb Ratings' although the chart shows runtimes;
#   * the bar labels repeated the x value (Runtime) instead of the bar height
#     (Count), unlike every other chart in this notebook.
fig = px.bar(RuntimeCount,
    x=RuntimeCount['Runtime'],
    y=RuntimeCount['Count'],
    title='Top 10 Most Common Movie Runtimes',
    text=RuntimeCount['Count'],
    height=600)

# Show each count above its bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

11. Directors and their count of movies they have directed

In [None]:
# Map each distinct 'Directors' string (possibly several comma-separated
# names) to the number of titles carrying exactly that string.
direc_data = dict(df['Directors'].value_counts())

# Accumulate a per-director total by splitting every credit string on ','.
directors_count = dict()

for credit, title_count in direc_data.items():
    for name in credit.split(","):
        # Add the full number of titles for this credit string. The earlier
        # version added only 1 for every appearance after the first, which
        # undercounted directors who appear in more than one credit string.
        directors_count[name] = directors_count.get(name, 0) + title_count

In [None]:
# Tabulate the per-director totals, largest count first.
direc_movie = (
    pd.DataFrame(list(directors_count.items()), columns=['Director', 'Count'])
    .sort_values(by='Count', ascending=False)
)

direc_movie

In [None]:
# Bar chart with one bar per director in direc_movie.
director_names = direc_movie['Director']
director_totals = direc_movie['Count']

fig = px.bar(
    direc_movie,
    x=director_names,
    y=director_totals,
    text=director_totals,
    title='Director and their movies count',
    height=600,
)

# Purple bars with the count printed above each one.
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2s}',
    marker_color='purple',
)
fig.show()

12. Exploring Genres

In [None]:
# Map each distinct 'Genres' string (possibly several comma-separated
# genres) to the number of titles carrying exactly that string.
genre_data = dict(df['Genres'].value_counts())

# Accumulate a per-genre total by splitting every genre string on ','.
genre_count = dict()

for combo, title_count in genre_data.items():
    for name in combo.split(','):
        # Add the full number of titles for this combination. The earlier
        # version added only 1 for every appearance after the first, which
        # undercounted genres that occur in more than one combination.
        genre_count[name] = genre_count.get(name, 0) + title_count

In [None]:
# Tabulate the per-genre totals, largest count first.
genre_df = (
    pd.DataFrame(list(genre_count.items()), columns=['Genre', 'Count'])
    .sort_values(by='Count', ascending=False)
)

genre_df

In [None]:
# Bar chart with one bar per genre in genre_df.
genre_names = genre_df['Genre']
genre_totals = genre_df['Count']

fig = px.bar(
    genre_df,
    x=genre_names,
    y=genre_totals,
    text=genre_totals,
    title='Genre and their movies count',
    height=600,
)

# Purple bars with the count printed above each one.
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2s}',
    marker_color='purple',
)
fig.show()

13. Top Movies on Netflix

In [None]:
# Netflix titles whose IMDb rating is strictly above 8.5.
# (A broader variant considered earlier would also admit titles whose
# New_Rotten_Tomatoes value exceeds 85.)
high_imdb_mask = netflix_df['IMDb'].gt(8.5)
top_movie_netflix = netflix_df.loc[high_imdb_mask]

top_movie_netflix

In [None]:
# Bar chart of the selected Netflix titles against their IMDb rating.
top_titles = top_movie_netflix['Title']
top_ratings = top_movie_netflix['IMDb']

fig = px.bar(
    top_movie_netflix,
    x=top_titles,
    y=top_ratings,
    text=top_ratings,
    title='Top Movies on Netflix',
    height=600,
)

# Purple bars with the rating printed above each one.
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2s}',
    marker_color='purple',
)
fig.show()