In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Read the movie catalogue from the CSV that sits next to the notebook and
# preview it (a bare last expression is rendered as a rich table).
df = pd.read_csv(filepath_or_buffer="movie_data.csv")
df

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16739,16740,The Ghosts of Buxley Hall,1980,,6.2,,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16740,16741,The Poof Point,2001,7+,4.7,,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
16741,16742,Sharks of Lost Island,2013,,5.7,,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16742,16743,Man Among Cheetahs,2017,,6.6,,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Movie count of different platforms

In [44]:
# Platforms: count how many titles each streaming service carries.
# Each platform column is compared against 1 to select the titles it offers.
Platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
netflix = len(df[df['Netflix'] == 1])
hulu = len(df[df['Hulu'] == 1])
prime = len(df[df['Prime Video'] == 1])
disney = len(df[df['Disney+'] == 1])
Count = [netflix, hulu, prime, disney]

fig = px.pie(names=Platform,
             values=Count,
             title='Movie Count Of Different Platforms',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('Movie_count.png')

<Figure size 640x480 with 0 Axes>

## Platforms with IMDB 8.5+ movies

In [45]:
# Build one dataframe per platform, dropping the other platforms' flag
# columns and the 'Type' column.
netflix_movies = df.loc[df['Netflix'] == 1].drop(['Hulu', 'Prime Video', 'Disney+', 'Type'], axis=1)
hulu_movies = df.loc[df['Hulu'] == 1].drop(['Netflix', 'Prime Video', 'Disney+', 'Type'], axis=1)
prime_video_movies = df.loc[df['Prime Video'] == 1].drop(['Netflix', 'Hulu', 'Disney+', 'Type'], axis=1)
disney_movies = df.loc[df['Disney+'] == 1].drop(['Netflix', 'Hulu', 'Prime Video', 'Type'], axis=1)

# Minimum IMDb rating a movie needs to count as "8.5+"
min_rating = 8.5

# List of platform names, in the same order as the dataframes below
platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
platform_frames = [netflix_movies, hulu_movies, prime_video_movies, disney_movies]

# Number of movies per platform rated at or above the minimum. The comparison
# is inclusive so that a movie rated exactly 8.5 counts as "8.5+".
# Rows with a missing IMDb score compare as False and are not counted.
count_imdb = [int((movies['IMDb'] >= min_rating).sum()) for movies in platform_frames]

# Dataset visualized below
top_rated = pd.DataFrame({'Platforms': platform, 'Count': count_imdb})

# Generate the bar chart
fig = px.bar(top_rated,
             x='Platforms',
             y='Count',
             color='Count',
             color_continuous_scale='Rainbow',
             title='IMDB 8.5+ Movies on different Platforms')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('IMDb8.5+moviecount.png')


<Figure size 640x480 with 0 Axes>

## Percentage of movies rated IMDb 8.5+ on each platform

In [46]:
# Percentage of each platform's movies rated at or above the minimum rating.
# The mean of the boolean mask is the qualifying share; it is multiplied by
# 100 so the values really are percentages, as the axis and title say.
# The comparison is inclusive so that a movie rated exactly 8.5 counts as "8.5+".
# Rows with a missing IMDb score compare as False but stay in the denominator.
percent_imdb = [float((movies['IMDb'] >= min_rating).mean() * 100)
                for movies in (netflix_movies, hulu_movies, prime_video_movies, disney_movies)]
# List of platform names, in the same order as the dataframes above
platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']

# Dataset visualized below
top_rated = pd.DataFrame({'Platforms': platform, 'percentage': percent_imdb})

# Generate the bar chart
fig = px.bar(top_rated,
             x='Platforms',
             y='percentage',
             color='percentage',
             color_continuous_scale='Rainbow',
             title='percentage of IMDB 8.5+ Movies on different Platforms')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('IMDb8.5+moviepercentage.png')

<Figure size 640x480 with 0 Axes>

## Display top 10 movies on each platform

In [47]:
# Ten highest IMDb-rated movies on each platform
n = netflix_movies.sort_values('IMDb', ascending=False).head(10)
h = hulu_movies.sort_values('IMDb', ascending=False).head(10)
p = prime_video_movies.sort_values('IMDb', ascending=False).head(10)
d = disney_movies.sort_values('IMDb', ascending=False).head(10)

fig = make_subplots(rows=4, cols=1,
                    subplot_titles=("Top 10 Movies on Netflix", "Top 10 Movies on Hulu",
                                    "Top 10 Movies on Prime Video", "Top 10 Movies on Disney+"))

# One horizontal bar chart per platform, stacked vertically. Every trace is
# bound to the same "coloraxis" so all four panels share one colour scale.
for row, top in enumerate((n, h, p, d), start=1):
    fig.add_trace(go.Bar(y=top['Title'], x=top['IMDb'], orientation='h',
                         marker=dict(color=top['IMDb'], coloraxis="coloraxis")),
                  row=row, col=1)

fig.update_layout(height=1300, width=1000,
                  title_text="Top Movies on Different Platforms based on IMDB Rating",
                  coloraxis=dict(colorscale='Rainbow'), showlegend=False)
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('topratingmovies.png')

<Figure size 640x480 with 0 Axes>

## Number of movies produced each year that are available on each platform

In [48]:
# Year: total number of titles per production year, plus how many of those
# titles each platform carries (sum of the platform flag columns).
year_count = df.groupby('Year')['Title'].count()
year_movie = df.groupby('Year')[["Prime Video", 'Netflix', "Hulu", "Disney+"]].sum()
year_data = (pd.concat([year_count, year_movie], axis=1)
             .reset_index()
             .rename(columns={'Title': 'Movie Count'}))

# Generate the plot: one bar series per platform
fig = px.bar(year_data,
             x='Year',
             y=['Netflix', 'Prime Video', "Hulu", "Disney+"],
             title='Movie Count By produced Year across platform')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('moviecountbyproducedyear.png')

<Figure size 640x480 with 0 Axes>

## Top 10 genres on each of the 4 platforms (bar charts)

In [49]:
# Per 'Genres' value: total number of titles and number available on Netflix
gen_count = df.groupby('Genres')['Title'].count()
gen_movie = df.groupby('Genres')[['Netflix']].sum()
gen_data = (pd.concat([gen_count, gen_movie], axis=1)
            .reset_index()
            .rename(columns={'Title': 'Movie Count'}))
# Keep the ten genres with the most Netflix titles
gen_data = gen_data.sort_values('Netflix', ascending=False)[:10]

fig = px.bar(gen_data,
             x='Genres',
             y='Netflix',
             hover_data=['Netflix'],
             color='Netflix',
             color_continuous_scale='Rainbow',
             title='Top 10 Genres Movie Count on Netflix')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('topgenreonNetflix.png')

<Figure size 640x480 with 0 Axes>

In [14]:
# Per 'Genres' value: total number of titles and number available on Prime Video
gen_count = df.groupby('Genres')['Title'].count()
gen_movie = df.groupby('Genres')[['Prime Video']].sum()
gen_data = (pd.concat([gen_count, gen_movie], axis=1)
            .reset_index()
            .rename(columns={'Title': 'Movie Count'}))
# Keep the ten genres with the most Prime Video titles
gen_data = gen_data.sort_values('Prime Video', ascending=False)[:10]

fig = px.bar(gen_data,
             x='Genres',
             y='Prime Video',
             hover_data=['Prime Video'],
             color='Prime Video',
             color_continuous_scale='Rainbow',
             title='Top 10 Genres Movie Count on Prime Video')
fig.show()
# Save next to the notebook under a name that matches the chart, instead of a
# machine-specific absolute path with an unrelated filename. `fig` is a Plotly
# figure, so matplotlib's plt.savefig() would only write an empty canvas;
# export through Plotly instead (static export needs `kaleido`).
fig.write_image('topgenreonprime.png')

In [50]:
# Per 'Genres' value: total number of titles and number available on Hulu
gen_count = df.groupby('Genres')['Title'].count()
gen_movie = df.groupby('Genres')[['Hulu']].sum()
gen_data = (pd.concat([gen_count, gen_movie], axis=1)
            .reset_index()
            .rename(columns={'Title': 'Movie Count'}))
# Keep the ten genres with the most Hulu titles
gen_data = gen_data.sort_values('Hulu', ascending=False)[:10]

fig = px.bar(gen_data,
             x='Genres',
             y='Hulu',
             hover_data=['Hulu'],
             color='Hulu',
             color_continuous_scale='Rainbow',
             title='Top 10 Genres Movie Count on Hulu')
fig.show()
# This is the Hulu chart, so it is saved under the Hulu name (it used to be
# written to the Prime Video filename). `fig` is a Plotly figure, so
# matplotlib's plt.savefig() would only write an empty canvas; export through
# Plotly instead (static export needs `kaleido`).
fig.write_image('topgenreonhulu.png')

<Figure size 640x480 with 0 Axes>

In [51]:
# Per 'Genres' value: total number of titles and number available on Disney+
gen_count = df.groupby('Genres')['Title'].count()
gen_movie = df.groupby('Genres')[['Disney+']].sum()
gen_data = (pd.concat([gen_count, gen_movie], axis=1)
            .reset_index()
            .rename(columns={'Title': 'Movie Count'}))
# Keep the ten genres with the most Disney+ titles
gen_data = gen_data.sort_values('Disney+', ascending=False)[:10]

fig = px.bar(gen_data,
             x='Genres',
             y='Disney+',
             hover_data=['Disney+'],
             color='Disney+',
             color_continuous_scale='Rainbow',
             title='Top 10 Genres Movie Count on Disney+')
fig.show()
# This is the Disney+ chart, so it is saved under a Disney name (it used to be
# written to the Hulu filename). `fig` is a Plotly figure, so matplotlib's
# plt.savefig() would only write an empty canvas; export through Plotly
# instead (static export needs `kaleido`).
fig.write_image('topgenreondisney.png')

<Figure size 640x480 with 0 Axes>

## Where are the best documentaries available (top 50 by IMDb rating)

In [54]:
# The 50 highest IMDb-rated titles flagged as Documentary
topdoc = df[df["Documentary"] == 1].sort_values("IMDb", ascending=False)[:50]
# Number of those titles on each platform (sum of the platform flag columns)
net = topdoc["Netflix"].sum()
prime = topdoc["Prime Video"].sum()
hu = topdoc["Hulu"].sum()
dis = topdoc["Disney+"].sum()
Platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
Count = [net, hu, prime, dis]

fig = px.pie(names=Platform,
             values=Count,
             title='Top 50 documentaries Count Of Different Platforms',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('top50doc.png')

<Figure size 640x480 with 0 Axes>

## Where are the best action movies available

In [55]:
# The 50 highest IMDb-rated titles flagged as Action
# (the `topdoc` name is kept from the documentary cell)
topdoc = df[df["Action"] == 1].sort_values("IMDb", ascending=False)[:50]
# Number of those titles on each platform (sum of the platform flag columns)
net = topdoc["Netflix"].sum()
prime = topdoc["Prime Video"].sum()
hu = topdoc["Hulu"].sum()
dis = topdoc["Disney+"].sum()
Platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
Count = [net, hu, prime, dis]

fig = px.pie(names=Platform,
             values=Count,
             title='Top 50 action movies Count Of Different Platforms',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('top50action.png')

<Figure size 640x480 with 0 Axes>

## Where are the best animation movies available

In [56]:
# The 50 highest IMDb-rated titles flagged as Animation
# (the `topdoc` name is kept from the documentary cell)
topdoc = df[df["Animation"] == 1].sort_values("IMDb", ascending=False)[:50]
# Number of those titles on each platform (sum of the platform flag columns)
net = topdoc["Netflix"].sum()
prime = topdoc["Prime Video"].sum()
hu = topdoc["Hulu"].sum()
dis = topdoc["Disney+"].sum()
Platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
Count = [net, hu, prime, dis]

fig = px.pie(names=Platform,
             values=Count,
             title='Top 50 animation Count Of Different Platforms',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('topanimation.png')

<Figure size 640x480 with 0 Axes>

## Where are the best adventure movies available

In [60]:
# The 50 highest IMDb-rated titles flagged as Adventure
# (the `topdoc` name is kept from the documentary cell)
topdoc = df[df["Adventure"] == 1].sort_values("IMDb", ascending=False)[:50]
# Number of those titles on each platform (sum of the platform flag columns)
net = topdoc["Netflix"].sum()
prime = topdoc["Prime Video"].sum()
hu = topdoc["Hulu"].sum()
dis = topdoc["Disney+"].sum()
Platform = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
Count = [net, hu, prime, dis]

fig = px.pie(names=Platform,
             values=Count,
             title='Top 50 adventure Count Of Different Platforms',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# `fig` is a Plotly figure, so matplotlib's plt.savefig() would only write an
# empty canvas. Export through Plotly instead (static export needs `kaleido`).
fig.write_image('top50adventure.png')

<Figure size 640x480 with 0 Axes>

## Exclusive content

In [62]:
# Share of Netflix titles that none of the other three platforms carries.
net = df[df["Netflix"] == 1]
leng = len(net)  # total number of Netflix titles
# Keep only rows whose other platform flags are all 0, then count them.
net = len(net[(net["Prime Video"] == 0)
              & (net["Hulu"] == 0)
              & (net["Disney+"] == 0)])
per = round(net / leng * 100, 3)
print(f"{per}% of movies on Netflix are exclusive to the platform.")

89.551% of movies on Netflix are exclusive to the platform.


In [63]:
# Share of Prime Video titles that none of the other three platforms carries.
net = df[df["Prime Video"] == 1]
leng = len(net)  # total number of Prime Video titles
# Keep only rows whose other platform flags are all 0, then count them.
net = len(net[(net["Netflix"] == 0)
              & (net["Hulu"] == 0)
              & (net["Disney+"] == 0)])
per = round(net / leng * 100, 3)
print(f"{per}% of movies on Prime video are exclusive to the platform.")

95.176% of movies on Prime video are exclusive to the platform.


In [65]:
# Share of Hulu titles that none of the other three platforms carries.
net = df[df["Hulu"] == 1]
leng = len(net)  # total number of Hulu titles
# Keep only rows whose other platform flags are all 0, then count them.
net = len(net[(net["Prime Video"] == 0)
              & (net["Netflix"] == 0)
              & (net["Disney+"] == 0)])
per = round(net / leng * 100, 3)
print(f"{per}% of movies on Hulu are exclusive to the platform.")

70.764% of movies on Hulu are exclusive to the platform.


In [66]:
# Share of Disney+ titles that none of the other three platforms carries.
net = df[df["Disney+"] == 1]
leng = len(net)  # total number of Disney+ titles
# Keep only rows whose other platform flags are all 0, then count them.
net = len(net[(net["Prime Video"] == 0)
              & (net["Hulu"] == 0)
              & (net["Netflix"] == 0)])
per = round(net / leng * 100, 3)
print(f"{per}% of movies on Disney+ are exclusive to the platform.")

94.326% of movies on Disney+ are exclusive to the platform.
