In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)



## NETFLIX


In [2]:
dfnetflix=pd.read_csv("./data/Datasets/netflix_titles.csv")

netflix=dfnetflix[["type","title","date_added"]]
netflix=netflix[netflix["date_added"].isnull()==False] #10 null (from 8k)

#To datetime
netflix.date_added=pd.to_datetime(netflix.date_added) 

#Dataset with valid values from 2018
netflix=netflix.loc[netflix["date_added"]>"2018"]

print(netflix.shape)
netflix.head(6)


(6987, 3)


Unnamed: 0,type,title,date_added
0,Movie,Dick Johnson Is Dead,2021-09-25
1,TV Show,Blood & Water,2021-09-24
2,TV Show,Ganglands,2021-09-24
3,TV Show,Jailbirds New Orleans,2021-09-24
4,TV Show,Kota Factory,2021-09-24
5,TV Show,Midnight Mass,2021-09-24


Netflix merge Imdb Top 250

In [3]:
#Para evaluar cuantas peliculas o series son top, haremos un merge con el dataframe que hemos obtenido  mediante web scrapping.

IMDBP=pd.read_csv("./data/Datasets/IMDBP", sep= "\t", index_col=0)

IMDBS=pd.read_csv("./data/Datasets/IMDBS", sep= "\t",index_col=0)

Net_=pd.merge(netflix,IMDBP, how="outer")
flix_=pd.merge(netflix,IMDBS, how="outer") 

#Delete values that are in IMBD and not in Netflix (I could also have done a left join)
Netflix_=flix_[flix_["date_added"].notna()]


Netflix_

Unnamed: 0,type,title,date_added,Rates,NVotes
0,Movie,Dick Johnson Is Dead,2021-09-25,,
1,TV Show,Blood & Water,2021-09-24,,
2,TV Show,Ganglands,2021-09-24,,
3,TV Show,Jailbirds New Orleans,2021-09-24,,
4,TV Show,Kota Factory,2021-09-24,8.69,71680.0
...,...,...,...,...,...
6982,Movie,Zodiac,2019-11-20,,
6983,TV Show,Zombie Dumb,2019-07-01,,
6984,Movie,Zombieland,2019-11-01,,
6985,Movie,Zoom,2020-01-11,,


In [4]:
#GroupBy in 15 days
date15nethistoric=Netflix_.groupby(pd.Grouper(key="date_added", freq="SMS"))


#Visualization
df=date15nethistoric.count()

trace1 = go.Scatter(
                    x = df.index,
                    y = df['type'],
                    name = 'Netflix',
                    mode= 'lines',
                    marker = dict(color = 'rgba(200, 2, 2, 0.8)'))
fig=go.Figure(trace1)
iplot(fig)

In [5]:
#GroupBy Visuals

Netflixf=Netflix_[Netflix_["type"]=="Movie"]

groupfilm=Netflixf.groupby(pd.Grouper(key="date_added", freq="SMS"))
gf=groupfilm.count()

filmbar = go.Bar(
                    x = gf.index,
                    y = gf['type'],
                    name = 'Netflix',
                  
                    marker = dict(color = 'rgba(200, 2, 2, 0.8)'))
filmfig=go.Figure(filmbar)
iplot(filmfig)

In [6]:

Netflixf=Netflix_[Netflix_["type"]=="TV Show"]

groupfilm=Netflixf.groupby(pd.Grouper(key="date_added", freq="SMS"))
gf=groupfilm.count()

filmbar = go.Bar(
                    x = gf.index,
                    y = gf['type'],
                    name = 'Netflix',
                  
                    marker = dict(color = 'rgba(200, 2, 2, 0.8)'))
filmfig=go.Figure(filmbar)
iplot(filmfig)

In [7]:


netflix21=Netflix_.loc[Netflix_["date_added"]>"2020"]

date15_21net=netflix21.groupby(pd.Grouper(key="date_added", freq="SMS"))

df=date15_21net.count()

trace1 = go.Scatter(
                    x = df.index,
                    y = df['title'],
                    name = 'Netflix',
                    mode= 'lines',
                    marker = dict(color = 'rgba(200, 2, 2, 0.8)'))
fig=go.Figure(trace1)
iplot(fig)



In [8]:
#Dataset has too many nulls, I wont use it

dfamazon=pd.read_csv("./data/Datasets/amazon_prime_titles.csv")
amazon=dfamazon[["type","title","date_added"]]
amazon=amazon[amazon["date_added"].isnull()]  #Tiene muchos valores NaN (prescindo)
amazon


Unnamed: 0,type,title,date_added
16,Movie,Zoombies,
17,TV Show,Zoo Babies,
18,TV Show,Zoë Coombs Marr: Bossy Bottom,
19,Movie,Zoe,
20,TV Show,Zoboomafoo,
...,...,...,...
9663,Movie,Pride Of The Bowery,
9664,TV Show,Planet Patrol,
9665,Movie,Outpost,
9666,TV Show,Maradona: Blessed Dream,


## HULU

In [9]:
#Same that with Netflix

dfhulu=pd.read_csv("./data/Datasets/hulu_titles.csv")
hulu=dfhulu[["type","title","date_added"]]
hulu=hulu[hulu["date_added"].isnull()==False] #30 null values 
hulu.date_added=pd.to_datetime(hulu.date_added)
hulu=hulu.loc[hulu["date_added"]>"2020"]
hu_=pd.merge(hulu,IMDBP, how="outer")
lu_=pd.merge(hulu,IMDBS, how="outer")
Hulu_=lu_[lu_["date_added"].notna()]


date15hul=hulu.groupby(pd.Grouper(key="date_added", freq="SM"))



In [10]:
date15hul=Hulu_.groupby(pd.Grouper(key="date_added", freq="SMS"))


dfe=date15hul.count()

trace2 = go.Scatter(
                    x = dfe.index,
                    y = dfe['title'],
                    name = 'Hulu',
                    mode= 'lines',
                    marker = dict(color = 'rgba(45, 11, 26, 0.8)'))
fig=go.Figure(trace2)
iplot(fig)

In [11]:
#HULU & NETFLIX

data=[trace1,trace2]
fig=go.Figure(data=data)
iplot(fig)

## DISNEY +

In [12]:
#Disney has lots of values in 2019, when the platform launched, so it was decided to start the analysis in 2020 to see launching patterns

dfdisney=pd.read_csv("./data/Datasets/disney_plus_titles.csv")
disney=dfdisney[["type","title","date_added"]]
disney=disney[disney["date_added"].isnull()==False] #3 null values
disney.date_added=pd.to_datetime(disney.date_added) 



disney=disney.loc[disney["date_added"]>"2020"]


dis_=pd.merge(disney,IMDBP, how="outer")
ney_=pd.merge(disney,IMDBS, how="outer")

Disney_=ney_[ney_["date_added"].notna()]

Disney_[Disney_["Rates"].notna()]


Unnamed: 0,type,title,date_added,Rates,NVotes
4,TV Show,The Beatles: Get Back,2021-11-25,8.87,22311.0
92,TV Show,The Simpsons,2021-09-29,8.66,404732.0
377,TV Show,The Mandalorian,2020-10-30,8.68,469587.0
474,TV Show,Cosmos: A Spacetime Odyssey,2020-07-10,9.2,121731.0


In [13]:
date15dis=Disney_.groupby(pd.Grouper(key="date_added", freq="SMS"))

dfd=date15dis.count()

trace3 = go.Scatter(
                    x = dfd.index,
                    y = dfd['title'],
                    name = "Disney",
                    mode= 'lines',
                    marker = dict(color = 'rgba(2, 11, 172, 0.8)'))


In [14]:
#HULU NETFLIX DISNEY+
data=[trace1,trace2,trace3]
fig=go.Figure(data=data)
iplot(fig)

In [161]:
#In order to show top films launched:

trace11 = go.Bar(
                    x = df.index,
                    y = df['Rates'],
                    name = 'Netflix',
                    
                    marker = dict(color = 'rgba(200, 2, 2, 0.8)'))
trace21 = go.Bar(
                    x = dfe.index,
                    y = dfe['Rates'],
                    name = 'Hulu',
                   
                    marker = dict(color = 'rgba(45, 11, 26, 0.8)'))
trace31 = go.Bar(
                    x = dfd.index,
                    y = dfd['Rates'],
                    name = "Disney",
                   
                    marker = dict(color = 'rgba(2, 11, 172, 0.8)'))

datatop=[trace11,trace21,trace31]
fig=go.Figure(datatop)
iplot(fig)