# Limpieza del dataframe disney_plus_titles.csv
## Junio 2025
## by Francis

### Importar funciones generales

In [23]:
import sys
sys.path.append(r"C:\Users\FRANCIS\Documents\Proyecto_Streaming\scripts") 
import utils
import pandas as pd

### Carga del Dataframe

In [24]:
df_disney = utils.cargar_datos(r"C:\Users\FRANCIS\Documents\Proyecto_Streaming\data\disney_plus_titles.csv")

### Inspección del DataFrame

In [4]:
df_disney.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!
1,s2,Movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...
2,s3,Movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.
3,s4,Movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",,"November 26, 2021",2021,TV-PG,41 min,Musical,"This is real life, not just fantasy!"
4,s5,TV Show,The Beatles: Get Back,,"John Lennon, Paul McCartney, George Harrison, ...",,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...


In [6]:
df_disney.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1450 non-null   object
 1   type          1450 non-null   object
 2   title         1450 non-null   object
 3   director      977 non-null    object
 4   cast          1260 non-null   object
 5   country       1231 non-null   object
 6   date_added    1447 non-null   object
 7   release_year  1450 non-null   int64 
 8   rating        1447 non-null   object
 9   duration      1450 non-null   object
 10  listed_in     1450 non-null   object
 11  description   1450 non-null   object
dtypes: int64(1), object(11)
memory usage: 136.1+ KB


### Convertir columna con fechas a datetime

In [25]:
df_disney= utils.convertir_a_fecha(df_disney, ['date_added'])

In [8]:
df_disney.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

### Ver porcentaje de valores nulos

In [9]:
utils.resumen_nulos(df_disney)

Unnamed: 0,cantidad_nulos,porcentaje_nulos
director,473,32.62069
country,219,15.103448
cast,190,13.103448
date_added,3,0.206897
rating,3,0.206897
show_id,0,0.0
type,0,0.0
title,0,0.0
release_year,0,0.0
duration,0,0.0


### Evaluamos en las columnas que no son importantes para los objetivos de los análisis y que tienen valores nulos 

In [26]:
evaluar_director = utils.evaluar_fila(df_disney, "director", ["date_added", "rating", "show_id","listed_in", "country", "type"])
evaluar_director.to_string() #sí hay suficientes nulos para eliminar filas (70%< es not null)

'    show_id     type                     title director                                                       cast country date_added  release_year rating  duration                      listed_in                                                                                        description\n4        s5  TV Show     The Beatles: Get Back      NaN  John Lennon, Paul McCartney, George Harrison, Ringo Starr     NaN 2021-11-25          2021    NaN  1 Season  Docuseries, Historical, Music  A three-part documentary from Peter Jackson capturing a moment in music history with The Beatles.\n280    s281  TV Show  Marvel Studios ASSEMBLED      NaN                                                        NaN     NaN 2021-03-12          2021    NaN  1 Season          Anthology, Docuseries             ASSEMBLED is an immersive series of docu-specials examining the next phase of the MCU.'

In [27]:
df_disney = df_disney.drop(evaluar_director.index)

In [28]:
evaluar_cast = utils.evaluar_fila(df_disney, "cast", ["date_added", "rating", "show_id", "listed_in", "country", "type"])
evaluar_cast.to_string() #no hay suficientes nulos para eliminar filas (>70% es not null)

'Empty DataFrame\nColumns: [show_id, type, title, director, cast, country, date_added, release_year, rating, duration, listed_in, description]\nIndex: []'

### Reemplazamos en todas las columnas object del DataFrame los datos nulos por "Desconocido"

In [29]:
columnas_object= ['country', 'rating',"director", "cast"]
df_disney[columnas_object]= df_disney[columnas_object].fillna("Desconocido")

### Se convierte date_added a int para poder calcular la moda y mediana dentro de sus datos

In [30]:
df_disney['date_added']= df_disney['date_added'].astype('int64')

In [15]:
df_disney.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added       int64
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

### Se aplica la función que reemplaza los valores nulos en date_added por la mediana o moda de la agrupación de datos 

In [None]:
df_disney = utils.elegir_imputacion(df_disney, columna="date_added", agrupacion=["listed_in", "type", "country"], umbral_dispersion=60)

### Se verifica si hay valores extraños o valores nulos después del reemplazo

In [None]:
print(df_disney["date_added"].unique())  
print(df_disney["date_added"].isnull().sum())  

### Los valores están en formato int con nanosegundos: se hace la conversión a segundos con timestamp para convertir a datetime

In [33]:
# Filtrar valores extremos antes de la conversión
df_disney = df_disney[df_disney["date_added"] // 10**9 > 0]  
df_disney = df_disney[df_disney["date_added"] // 10**9 < 1893456000] 

df_disney["date_added"] = pd.to_datetime(df_disney["date_added"] // 10**9, unit="s", errors="coerce")


### Se verifica aleatoriamente los datos, se confirma si hay valores nulos todavía y si la conversión a datetime fue realizada

In [None]:
print(df_disney[['date_added']].sample(10))
print(df_disney['date_added'].isnull().sum())
df_disney.dtypes

### Se crean las columnas derivadas de date_added y se convierten a tipo int

In [35]:
df_disney['year_added']= df_disney['date_added'].dt.year.astype("Int64")
df_disney['month_added']= df_disney['date_added'].dt.month.astype("Int64")

In [None]:
print(df_disney["date_added"].isnull().sum())  # ¿Quedan valores nulos después de todo?
print(df_disney[["date_added", "month_added", "year_added"]].sample(10))  # Revisión rápida

In [37]:
utils.resumen_nulos(df_disney)

Unnamed: 0,cantidad_nulos,porcentaje_nulos
show_id,0,0.0
type,0,0.0
title,0,0.0
director,0,0.0
cast,0,0.0
country,0,0.0
date_added,0,0.0
release_year,0,0.0
rating,0,0.0
duration,0,0.0


### Datos duplicados

In [None]:
df_disney.duplicated().sum() #No hay datos duplicados

0

### Estandarizar los datos según la columna

In [39]:
df_disney = utils.limpiar_texto(df_disney, ["type", "country", "rating", "listed_in","title", "director", "cast","duration", "description"], conservar_mayusculas=["title", "director", "cast","duration", "description"]) 

In [40]:
df_disney.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added
0,s1,movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",desconocido,2021-11-26,2016,tv-g,23 min,"animation, family",Join Mickey and the gang as they duck the halls!,2021,11
1,s2,movie,Ernest Saves Christmas,John Cherry,"Jim Varney, Noelle Parker, Douglas Seale",desconocido,2021-11-26,1988,pg,91 min,comedy,Santa Claus passes his magic bag to a new St. ...,2021,11
2,s3,movie,Ice Age: A Mammoth Christmas,Karen Disher,"Raymond Albert Romano, John Leguizamo, Denis L...",united states,2021-11-26,2011,tv-g,23 min,"animation, comedy, family",Sid the Sloth is on Santa's naughty list.,2021,11
3,s4,movie,The Queen Family Singalong,Hamish Hamilton,"Darren Criss, Adam Lambert, Derek Hough, Alexa...",desconocido,2021-11-26,2021,tv-pg,41 min,musical,"This is real life, not just fantasy!",2021,11
5,s6,movie,Becoming Cousteau,Liz Garbus,"Jacques Yves Cousteau, Vincent Cassel",united states,2021-11-24,2021,pg-13,94 min,"biographical, documentary",An inside look at the legendary life of advent...,2021,11


In [41]:
df_disney.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
year_added               Int64
month_added              Int64
dtype: object

### Guardar el DataFrame limpio

In [42]:
df_disney.to_csv("disney_limpio.csv", index=False)  
df_disney.to_parquet("disney_limpio.parquet")  