# Dataset Disney+ Shows and Movies

## Verificación del Dataset

In [None]:
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import pandas as pd 

# Load the raw Disney+ catalogue.
df = pd.read_csv("disney_plus_titles.csv")

# Derive date features. Unparseable dates become NaT (and NaN year/month)
# instead of aborting the cell, matching the coercion used further down.
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce")
df["year_added"] = df["date_added"].dt.year
df["month_added"] = df["date_added"].dt.month

# Split the raw `duration` text into two columns:
#   season_count -> leading token when the text mentions "Season", else ""
#   duration     -> leading token otherwise, else ""
# Vectorised string ops replace the row-wise apply, and a missing duration is
# treated as "" rather than raising on `"Season" in NaN`.
raw_duration = df["duration"].fillna("").astype(str)
mentions_season = raw_duration.str.contains("Season", regex=False)
leading_token = raw_duration.str.split(" ").str[0]
df["season_count"] = leading_token.where(mentions_season, "")
df["duration"] = leading_token.where(~mentions_season, "")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,2021-11-26,2016,TV-G,23.0,"Animation, Family",Join Mickey and the gang as they duck the halls!,2021.0,11.0,
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,2021-11-26,1988,PG,91.0,Comedy,Santa Claus passes his magic bag to a new St. ...,2021.0,11.0,
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,2021-11-26,2011,TV-G,23.0,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.,2021.0,11.0,
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",,2021-11-26,2021,TV-PG,41.0,Musical,"This is real life, not just fantasy!",2021.0,11.0,
4,s5,TV Show,The Beatles: Get Back,,"John Lennon, Paul McCartney, George Harrison, ...",,2021-11-25,2021,,,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...,2021.0,11.0,1.0


## Columnas disponibles

In [8]:
# Show which columns the frame currently carries.
available_columns = df.columns
print("Columnas disponibles:", available_columns)

Columnas disponibles: Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'year_added', 'month_added', 'season_count'],
      dtype='object')


## Contenido de Type en Disney+

In [None]:
# Count how many titles fall under each value of `type`.
col = "type"
grouped = df[col].value_counts().rename_axis(col).reset_index(name="count")

# Echo the table and the exact series handed to the chart.
print(grouped)
print("Labels:", grouped[col])
print("Values:", grouped['count'])

init_notebook_mode(connected=True)

# Pie chart with the first slice pulled slightly out of the circle.
trace = go.Pie(
    labels=grouped[col],
    values=grouped['count'],
    pull=[0.05, 0],
    marker={"colors": ["#6ad49b", "#a678de"]},
)
layout = go.Layout(title="", height=400, legend={"x": 0.1, "y": 1.1})
fig = go.Figure(data=[trace], layout=layout)

# Render the chart.
iplot(fig)

      type  count
0    Movie   1052
1  TV Show    398
Labels: 0      Movie
1    TV Show
Name: type, dtype: object
Values: 0    1052
1     398
Name: count, dtype: int64


In [None]:
# Re-parse the date column; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(df['date_added'], errors='coerce')
df["date_added"] = parsed_dates
df['year_added'] = parsed_dates.dt.year

In [12]:
# Keep only the rows that have a value in "year_added".
df = df[df["year_added"].notna()]

In [13]:
# Store "year_added" as a nullable integer column.
numeric_year = pd.to_numeric(df["year_added"], errors='coerce')
df["year_added"] = numeric_year.astype('Int64')

In [14]:
# List the distinct years present after cleaning.
years_present = df["year_added"].unique()
print(years_present)

<IntegerArray>
[2021, 2020, 2019]
Length: 3, dtype: Int64



## 2. Growth in content over the years 

In [12]:
def _count_by(frame, column):
    """Count the rows of `frame` per distinct value of `column`.

    Returns a frame with three columns -- `column`, "count" and "percent"
    (share of the total, 0-100) -- sorted by `column`.
    """
    counts = frame[column].value_counts().rename_axis(column).reset_index(name="count")
    counts["percent"] = counts["count"] / counts["count"].sum() * 100
    return counts.sort_values(column)


# Split the catalogue by content type.
d1 = df[df["type"] == "TV Show"]
d2 = df[df["type"] == "Movie"]

col = "year_added"

# Same aggregation for both subsets (previously two copy-pasted blocks).
vc1 = _count_by(d1, col)
vc2 = _count_by(d2, col)

# One bar series per content type.
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(
    title="Contenido publicado a lo largo de los años",
    # Axis titles so the figure can be read on its own, like the other bar charts here.
    xaxis=dict(title="Año"),
    yaxis=dict(title="Numero"),
    legend=dict(x=0.1, y=1.1, orientation="h"),
)

fig = go.Figure(data, layout=layout)

# Render the chart.
fig.show()


 
## 3. Año de estreno de las películas

In [26]:
col = 'month_added'

# Count TV shows (d1) per value of `month_added`.
vc1 = d1[col].value_counts().reset_index()
vc1.columns = [col, "count"]
vc1 = (
    vc1.assign(count=pd.to_numeric(vc1["count"], errors="coerce"))
    .dropna()
    .astype({"count": int})
)
vc1 = vc1.assign(percent=vc1["count"] / vc1["count"].sum() * 100)
vc1 = vc1.sort_values(col)  # order by month

# Single bar series built from the TV-show counts.
trace1 = go.Bar(
    x=vc1[col],
    y=vc1["count"],
    name="TV Shows",
    marker={"color": "#a678de"},
)
data = [trace1]
layout = go.Layout(
    title="En que més se añadió más contenido?",
    xaxis={"title": "Mes"},
    yaxis={"title": "Numero"},
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)

fig = go.Figure(data, layout=layout)

# Render the chart.
fig.show()


Películas más antiguas de Disney+

In [14]:
# Sort by release year, keep the rows with a non-empty `duration`,
# and show the first 15 titles.
small = (
    df.sort_values("release_year", ascending = True)
    .loc[lambda frame: frame['duration'] != ""]
)
small[['title', "release_year"]].head(15)

Unnamed: 0,title,release_year
1220,Steamboat Willie,1928
1178,Santa's Workshop,1932
893,Flowers and Trees,1932
736,Babes in the Woods,1932
1418,Ye Olden Days,1933
1362,Three Little Pigs,1933
1317,The Pied Piper,1933
1254,The Big Bad Wolf,1934
1356,The Wise Little Hen,1934
922,Grasshopper and the Ants,1934


Series más antiguas de Disney+

In [15]:
# Sort by release year, keep the rows with a non-empty `season_count`,
# and show the first 15 titles.
small = (
    df.sort_values("release_year", ascending = True)
    .loc[lambda frame: frame['season_count'] != ""]
)
small[['title', "release_year"]].head(15)

Unnamed: 0,title,release_year
1071,Mickey Mouse Club (1955-59),1955
1244,The Adventures of Spin and Marty,1955
221,Schoolhouse Rock!,1973
299,The Muppet Show,1976
1207,Spider-Woman,1979
1205,Spider-Man and His Amazing Friends,1982
195,Star Wars Vintage: Droids,1985
841,Disney's Adventures Of The Gummi Bears,1985
267,Star Wars: Ewoks,1985
1312,The New Adventures of Winnie the Pooh,1988


The platform includes movies released as far back as 1928 (Steamboat Willie) and TV shows dating back to 1955.

## 4. Content from different Countries

In [16]:
# Lower-case country name -> three-letter code used as a choropleth location.
# Every key must be lower-case: lookups are done with `name.lower()`.
# Fixes relative to the previous table:
#   * 'congo republic' was spelled 'Congo republic' and could never match.
#   * 'bahamas' used 'BHM'; the ISO 3166-1 alpha-3 code is 'BHS'.
country_codes = {'afghanistan': 'AFG',
 'albania': 'ALB',
 'algeria': 'DZA',
 'american samoa': 'ASM',
 'andorra': 'AND',
 'angola': 'AGO',
 'anguilla': 'AIA',
 'antigua and barbuda': 'ATG',
 'argentina': 'ARG',
 'armenia': 'ARM',
 'aruba': 'ABW',
 'australia': 'AUS',
 'austria': 'AUT',
 'azerbaijan': 'AZE',
 'bahamas': 'BHS',
 'bahrain': 'BHR',
 'bangladesh': 'BGD',
 'barbados': 'BRB',
 'belarus': 'BLR',
 'belgium': 'BEL',
 'belize': 'BLZ',
 'benin': 'BEN',
 'bermuda': 'BMU',
 'bhutan': 'BTN',
 'bolivia': 'BOL',
 'bosnia and herzegovina': 'BIH',
 'botswana': 'BWA',
 'brazil': 'BRA',
 'british virgin islands': 'VGB',
 'brunei': 'BRN',
 'bulgaria': 'BGR',
 'burkina faso': 'BFA',
 'burma': 'MMR',
 'burundi': 'BDI',
 'cabo verde': 'CPV',
 'cambodia': 'KHM',
 'cameroon': 'CMR',
 'canada': 'CAN',
 'cayman islands': 'CYM',
 'central african republic': 'CAF',
 'chad': 'TCD',
 'chile': 'CHL',
 'china': 'CHN',
 'colombia': 'COL',
 'comoros': 'COM',
 'congo democratic': 'COD',
 'congo republic': 'COG',
 'cook islands': 'COK',
 'costa rica': 'CRI',
 "cote d'ivoire": 'CIV',
 'croatia': 'HRV',
 'cuba': 'CUB',
 'curacao': 'CUW',
 'cyprus': 'CYP',
 'czech republic': 'CZE',
 'denmark': 'DNK',
 'djibouti': 'DJI',
 'dominica': 'DMA',
 'dominican republic': 'DOM',
 'ecuador': 'ECU',
 'egypt': 'EGY',
 'el salvador': 'SLV',
 'equatorial guinea': 'GNQ',
 'eritrea': 'ERI',
 'estonia': 'EST',
 'ethiopia': 'ETH',
 'falkland islands': 'FLK',
 'faroe islands': 'FRO',
 'fiji': 'FJI',
 'finland': 'FIN',
 'france': 'FRA',
 'french polynesia': 'PYF',
 'gabon': 'GAB',
 'gambia, the': 'GMB',
 'georgia': 'GEO',
 'germany': 'DEU',
 'ghana': 'GHA',
 'gibraltar': 'GIB',
 'greece': 'GRC',
 'greenland': 'GRL',
 'grenada': 'GRD',
 'guam': 'GUM',
 'guatemala': 'GTM',
 'guernsey': 'GGY',
 'guinea-bissau': 'GNB',
 'guinea': 'GIN',
 'guyana': 'GUY',
 'haiti': 'HTI',
 'honduras': 'HND',
 'hong kong': 'HKG',
 'hungary': 'HUN',
 'iceland': 'ISL',
 'india': 'IND',
 'indonesia': 'IDN',
 'iran': 'IRN',
 'iraq': 'IRQ',
 'ireland': 'IRL',
 'isle of man': 'IMN',
 'israel': 'ISR',
 'italy': 'ITA',
 'jamaica': 'JAM',
 'japan': 'JPN',
 'jersey': 'JEY',
 'jordan': 'JOR',
 'kazakhstan': 'KAZ',
 'kenya': 'KEN',
 'kiribati': 'KIR',
 'north korea': 'PRK',
 'south korea': 'KOR',
 'kosovo': 'KSV',
 'kuwait': 'KWT',
 'kyrgyzstan': 'KGZ',
 'laos': 'LAO',
 'latvia': 'LVA',
 'lebanon': 'LBN',
 'lesotho': 'LSO',
 'liberia': 'LBR',
 'libya': 'LBY',
 'liechtenstein': 'LIE',
 'lithuania': 'LTU',
 'luxembourg': 'LUX',
 'macau': 'MAC',
 'macedonia': 'MKD',
 'madagascar': 'MDG',
 'malawi': 'MWI',
 'malaysia': 'MYS',
 'maldives': 'MDV',
 'mali': 'MLI',
 'malta': 'MLT',
 'marshall islands': 'MHL',
 'mauritania': 'MRT',
 'mauritius': 'MUS',
 'mexico': 'MEX',
 'micronesia': 'FSM',
 'moldova': 'MDA',
 'monaco': 'MCO',
 'mongolia': 'MNG',
 'montenegro': 'MNE',
 'morocco': 'MAR',
 'mozambique': 'MOZ',
 'namibia': 'NAM',
 'nepal': 'NPL',
 'netherlands': 'NLD',
 'new caledonia': 'NCL',
 'new zealand': 'NZL',
 'nicaragua': 'NIC',
 'nigeria': 'NGA',
 'niger': 'NER',
 'niue': 'NIU',
 'northern mariana islands': 'MNP',
 'norway': 'NOR',
 'oman': 'OMN',
 'pakistan': 'PAK',
 'palau': 'PLW',
 'panama': 'PAN',
 'papua new guinea': 'PNG',
 'paraguay': 'PRY',
 'peru': 'PER',
 'philippines': 'PHL',
 'poland': 'POL',
 'portugal': 'PRT',
 'puerto rico': 'PRI',
 'qatar': 'QAT',
 'romania': 'ROU',
 'russia': 'RUS',
 'rwanda': 'RWA',
 'saint kitts and nevis': 'KNA',
 'saint lucia': 'LCA',
 'saint martin': 'MAF',
 'saint pierre and miquelon': 'SPM',
 'saint vincent and the grenadines': 'VCT',
 'samoa': 'WSM',
 'san marino': 'SMR',
 'sao tome and principe': 'STP',
 'saudi arabia': 'SAU',
 'senegal': 'SEN',
 'serbia': 'SRB',
 'seychelles': 'SYC',
 'sierra leone': 'SLE',
 'singapore': 'SGP',
 'sint maarten': 'SXM',
 'slovakia': 'SVK',
 'slovenia': 'SVN',
 'solomon islands': 'SLB',
 'somalia': 'SOM',
 'south africa': 'ZAF',
 'south sudan': 'SSD',
 'spain': 'ESP',
 'sri lanka': 'LKA',
 'sudan': 'SDN',
 'suriname': 'SUR',
 'swaziland': 'SWZ',
 'sweden': 'SWE',
 'switzerland': 'CHE',
 'syria': 'SYR',
 'taiwan': 'TWN',
 'tajikistan': 'TJK',
 'tanzania': 'TZA',
 'thailand': 'THA',
 'timor-leste': 'TLS',
 'togo': 'TGO',
 'tonga': 'TON',
 'trinidad and tobago': 'TTO',
 'tunisia': 'TUN',
 'turkey': 'TUR',
 'turkmenistan': 'TKM',
 'tuvalu': 'TUV',
 'uganda': 'UGA',
 'ukraine': 'UKR',
 'united arab emirates': 'ARE',
 'united kingdom': 'GBR',
 'united states': 'USA',
 'uruguay': 'URY',
 'uzbekistan': 'UZB',
 'vanuatu': 'VUT',
 'venezuela': 'VEN',
 'vietnam': 'VNM',
 # NOTE(review): same code as 'british virgin islands'; confirm whether the
 # US Virgin Islands ('VIR') was intended.
 'virgin islands': 'VGB',
 'west bank': 'WBG',
 'yemen': 'YEM',
 'zambia': 'ZMB',
 'zimbabwe': 'ZWE'}

## countries 
from collections import Counter
# Light-to-dark blue palette (17 steps).
# NOTE(review): not referenced by any code visible in this notebook.
colorscale = [
    "#f7fbff",
    "#ebf3fb",
    "#deebf7",
    "#d2e3f3",
    "#c6dbef",
    "#b3d2e9",
    "#9ecae1",
    "#85bcdb",
    "#6baed6",
    "#57a0ce",
    "#4292c6",
    "#3082be",
    "#2171b5",
    "#1361a9",
    "#08519c",
    "#0b4083",
    "#08306b",
]
    
def geoplot(ddf):
    """Draw a world choropleth of title counts per country and return the counts.

    Parameters
    ----------
    ddf : DataFrame
        Must have a `country` column whose entries are comma-separated
        country names; missing entries are ignored.

    Returns
    -------
    dict
        Country name (as written in the data, whitespace-trimmed) -> number
        of rows that list it. Includes names that have no entry in
        `country_codes`; those are only left off the map.
    """
    # Explode the comma-separated lists. Trimming each name keeps irregular
    # spacing or a trailing comma from creating spurious or empty "countries".
    names = (
        name.strip()
        for entry in ddf['country'].dropna()
        for name in str(entry).split(",")
    )
    country = dict(Counter(name for name in names if name))

    # Aggregate per map code. Names without a known code are skipped instead
    # of being written to a bogus "" location (where each one overwrote the
    # previous), and names that share a code are summed rather than overwritten.
    country_with_code = Counter()
    for name, n_titles in country.items():
        code = country_codes.get(name.lower())
        if code:
            country_with_code[code] += n_titles

    data = [dict(
            type = 'choropleth',
            locations = list(country_with_code.keys()),
            z = list(country_with_code.values()),
            colorscale = [[0,"rgb(5, 10, 172)"],[0.65,"rgb(40, 60, 190)"],[0.75,"rgb(70, 100, 245)"],
                        [0.80,"rgb(90, 120, 245)"],[0.9,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
            autocolorscale = False,
            reversescale = True,
            marker = dict(
                line = dict (
                    color = 'gray',
                    width = 0.5
                ) ),
            # The obsolete `autotick` colorbar attribute was dropped.
            colorbar = dict(
                title = ''),
          ) ]

    layout = dict(
        title = '',
        geo = dict(
            showframe = False,
            showcoastlines = False,
            projection = dict(
                # Projection type names are lower-case in plotly.
                type = 'mercator'
            )
        )
    )

    fig = dict( data=data, layout=layout )
    iplot( fig, validate=False, filename='d3-world-map' )
    return country

# Render the map and keep the per-country counts it returns.
country_vals = geoplot(df)
tabs = Counter(country_vals).most_common(25)

# Reverse the ranking so labels and values run from smallest to largest.
labels = [name for name, _ in reversed(tabs)]
values = [total for _, total in reversed(tabs)]
trace1 = go.Bar(
    y=labels,
    x=values,
    orientation="h",
    name="",
    marker={"color": "#a678de"},
)

data = [trace1]
layout = go.Layout(
    title="Countries with most content",
    height=700,
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)
fig = go.Figure(data, layout=layout)
fig.show()

## 5. Distribution of Movie Duration

In [None]:
import plotly.figure_factory as ff
import numpy as np
import pandas as pd
import scipy  

# NOTE: this reloads the CSV, so `df` no longer carries the columns and row
# filtering produced by the earlier cells.
df = pd.read_csv("disney_plus_titles.csv")
df['duration'] = df['duration'].astype(str)

# Movies only, with the first run of digits in `duration` parsed as a number.
d2 = df[df["type"] == "Movie"].copy()
digits = d2['duration'].str.extract(r'(\d+)', expand=False)
d2['duration'] = pd.to_numeric(digits, errors='coerce')

x1 = d2['duration'].dropna()

if x1.empty:
    print("No hay datos válidos para graficar la distribución.")
else:
    # Histogram (bin size 5) with a fitted normal curve.
    fig = ff.create_distplot([x1], ['Duración'], bin_size=5, curve_type='normal', colors=["#6ad49b"])
    fig.update_layout(title_text='Distribución de la duración de las Películas')
    fig.show()


## 6. TV Shows con muchas temporadas

In [None]:
import pandas as pd
import plotly.graph_objects as go

# NOTE: reloads the CSV again; `df` is replaced by the raw file contents.
df = pd.read_csv("disney_plus_titles.csv")

# First run of digits in `duration`, parsed as a number, for every row.
leading_digits = df['duration'].astype(str).str.extract(r'(\d+)', expand=False)
df['season_count'] = pd.to_numeric(leading_digits, errors='coerce')

# Keep the TV shows only.
d1 = df[df["type"] == "TV Show"].copy()

# Number of shows per season count, ordered by season count.
vc1 = d1['season_count'].value_counts().reset_index()
vc1.columns = ['season_count', "count"]
vc1 = vc1.dropna().astype({'season_count': int}).sort_values('season_count')

# Bar chart of the distribution.
trace1 = go.Bar(
    x=vc1['season_count'],
    y=vc1["count"],
    name="TV Shows",
    marker={"color": "#a678de"},
)
data = [trace1]
layout = go.Layout(
    title="Number of Seasons in TV Shows",
    xaxis={"title": "Seasons"},
    yaxis={"title": "Count"},
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)

fig = go.Figure(data, layout=layout)

# Render the chart.
fig.show()


## 7. The ratings of the content ? 

In [None]:
col = "rating"

# Build the same count table for TV shows (d1) and movies (d2).
rating_tables = []
for subset in (d1, d2):
    table = subset[col].value_counts().reset_index()
    table.columns = [col, "count"]
    table["count"] = pd.to_numeric(table["count"], errors="coerce")
    table = table.dropna().astype({"count": int})
    table = table.assign(percent=table["count"] / table["count"].sum() * 100)
    rating_tables.append(table.sort_values(col))
vc1, vc2 = rating_tables

# One bar series per content type.
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker={"color": "#a678de"})
trace2 = go.Bar(x=vc2[col], y=vc2["count"], name="Movies", marker={"color": "#6ad49b"})
data = [trace1, trace2]

layout = go.Layout(
    title="Distribución del Rating en TV Shows y Movies",
    xaxis={"title": "Rating"},
    yaxis={"title": "Count"},
    legend={"x": 0.1, "y": 1.1, "orientation": "h"},
)

fig = go.Figure(data, layout=layout)

# Render the chart.
fig.show()


## 8. What are the top Categories ?

In [22]:
col = "listed_in"

# d2 holds the Movie rows, so this ranks movie categories. Missing values are
# dropped before joining (str.join raises on NaN) and blank tokens are ignored.
category_lists = d2[col].dropna()
categories = [name for name in ", ".join(category_lists).split(", ") if name]
counter_list = Counter(categories).most_common(50)

# Reverse the ranking so labels and values run from smallest to largest.
labels = [name for name, _ in reversed(counter_list)]
values = [total for _, total in reversed(counter_list)]

# The trace name and title previously read "TV Shows" / "Content added over
# the years", left over from other charts; they now describe this one.
trace1 = go.Bar(y=labels, x=values, orientation="h", name="Movies", marker=dict(color="#a678de"))

data = [trace1]
layout = go.Layout(title="Top categories in Movies", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

## 9. Top Actors on Disney+ with Most Movies

In [23]:
def country_trace(country, flag = "movie"):
    """Build a horizontal bar trace of the most frequent cast members for a country.

    Parameters
    ----------
    country : str
        Country name; a row matches when its `country` text contains this
        name, case-insensitively.
    flag : str
        "movie" restricts to rows whose `type` is "Movie"; any other value
        restricts to rows whose `type` is "TV Show".

    Returns
    -------
    plotly.graph_objects.Bar
        Up to 25 cast names with their title counts.

    Notes
    -----
    Reads the notebook-level `df`. Unlike the earlier version it does not
    add a `from_us` helper column to `df`.
    """
    # Case-insensitive substring match on the country text.
    in_country = (
        df['country'].fillna("").str.lower().str.contains(country.lower(), regex=False)
    )

    # Filter on `type` directly. The earlier `duration != ""` /
    # `season_count != ""` checks depended on sentinel values that are gone
    # once `df` has been reloaded from the CSV, so both flags selected every row.
    wanted_type = "Movie" if flag == "movie" else "TV Show"
    small = df[in_country & (df["type"] == wanted_type)]

    # Drop blank names before ranking so they cannot take one of the 25 slots.
    cast = ", ".join(small['cast'].fillna("")).split(", ")
    tags = Counter(name.strip() for name in cast if name.strip()).most_common(25)

    # Trailing spaces pad the labels away from the axis.
    labels, values = [name + "  " for name, _ in tags], [total for _, total in tags]
    trace = go.Bar(y=labels[::-1], x=values[::-1], orientation="h", name="", marker=dict(color="#a678de"))
    return trace

from plotly.subplots import make_subplots

# Ten grid cells in row-major order; "" marks a cell left without a chart.
titles = ["United States", "","Australia","", "United Kingdom", "Canada","", "Spain","", "Japan"]
traces = [country_trace(name) for name in titles if name != ""]

fig = make_subplots(rows=2, cols=5, subplot_titles=titles)

# Place the six traces in columns 1, 3 and 5 of each row.
grid_slots = [(1, 1), (1, 3), (1, 5), (2, 1), (2, 3), (2, 5)]
for panel, (grid_row, grid_column) in zip(traces, grid_slots):
    fig.add_trace(panel, grid_row, grid_column)

fig.update_layout(height=1200, showlegend=False)
fig.show()

## 10. Top Actors on Disney+ with Most TV Shows

In [28]:
# Three grid cells; "" marks the middle cell, which gets no chart.
titles = ["United States","", "United Kingdom"]
traces = [country_trace(name, flag="tv_shows") for name in titles if name != ""]

fig = make_subplots(rows=1, cols=3, subplot_titles=titles)

# The two traces go in columns 1 and 3.
for panel, grid_column in zip(traces, (1, 3)):
    fig.add_trace(panel, 1, grid_column)

fig.update_layout(height=600, showlegend=False)
fig.show()

In [26]:
# Movies whose `country` is exactly "United States".
small = df[df["type"] == "Movie"]
small = small[small["country"] == "United States"]

col = "director"

# Drop blank names before ranking so a missing director cannot take one of
# the 12 slots (previously it was filtered out after the cut).
categories = ", ".join(small[col].fillna("")).split(", ")
counter_list = Counter(name for name in categories if name != "").most_common(12)

# Reverse the ranking so labels and values run from smallest to largest.
labels = [name for name, _ in reversed(counter_list)]
values = [total for _, total in reversed(counter_list)]

# The trace covers movies; it was previously named "TV Shows".
trace1 = go.Bar(y=labels, x=values, orientation="h", name="Movies", marker=dict(color="orange"))

data = [trace1]
layout = go.Layout(title="Directores de cine en EE.UU. con mas contenido", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()