<a href="https://colab.research.google.com/github/Davidarr96/Pryecto_MLOps/blob/main/ETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
 # Vinculamos google colab con google drive para importar el dataset
 from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**ANALISIS EXPLORATORIOS DE LOS DATOS**







In [3]:
# Importamos librerias
import pandas as pd


In [4]:
# Ingresamos los datasets y guardamos cada uno con una variable distinta
amazon_df = pd.read_csv('/content/drive/MyDrive/plataforma/amazon_prime_titles.csv', sep=',', encoding='latin-1')
disney_df = pd.read_csv('/content/drive/MyDrive/plataforma/disney_plus_titles.csv', sep=',', encoding='latin-1')
hulu_df = pd.read_csv('/content/drive/MyDrive/plataforma/hulu_titles.csv', sep=',', encoding='latin-1')
netflix_df = pd.read_csv('/content/drive/MyDrive/plataforma/netflix_titles.csv', sep=',', encoding='latin-1')



# **Transformaciones requeridas:**

-Generar campo id: Cada id se compondrá de la primera letra del nombre 
de la plataforma, seguido del show_id ya presente en los datasets (ejemplo para títulos de Amazon = as123)

-Los valores nulos del campo rating deberán reemplazarse por el string “G” (corresponde al maturity rating: “general for all audiences”

-De haber fechas, deberán tener el formato AAAA-mm-dd

-Los campos de texto deberán estar en minúsculas, sin excepciones

-El campo duration debe convertirse en dos campos: duration_int y duration_type. El primero será un integer y el segundo un string indicando la unidad de medición de duración: min (minutos) o season (temporadas)




In [5]:
#Generamos el campo ID
amazon_df["Id"] = "a" + amazon_df["show_id"]
disney_df["Id"] = 'd' + disney_df["show_id"]
hulu_df["Id"] = 'h' + hulu_df["show_id"]
netflix_df["Id"] = 'n' + netflix_df["show_id"]

In [6]:
#Reemplazamos valores nulos del campo Rating por "g"
amazon_df["rating"].fillna(value ="g", inplace = True)
disney_df["rating"].fillna(value ="g", inplace = True)
hulu_df["rating"].fillna(value ="g", inplace = True)
netflix_df["rating"].fillna(value ="g", inplace = True)

In [7]:
#Convertimos la columna data_added al formato adecuado (AAAA-mm-dd)
amazon_df["date_added"] = pd.to_datetime(amazon_df.date_added)
disney_df["date_added"] = pd.to_datetime(disney_df.date_added)
hulu_df["date_added"] = pd.to_datetime(hulu_df.date_added)
netflix_df["date_added"] = pd.to_datetime(netflix_df.date_added)

In [8]:
#Convertimos los campos de textos minusculas, vamos a utilizar el metodo "applymap"
amazon_df = amazon_df.applymap(lambda s: s.lower() if type(s) == str else s)
disney_df = disney_df.applymap(lambda s: s.lower() if type(s) == str else s)
hulu_df = hulu_df.applymap(lambda s: s.lower() if type(s) == str else s)
netflix_df = netflix_df.applymap(lambda s: s.lower() if type(s) == str else s)

In [9]:
#Separamos la columnas "duration" en dos: ("duration_int" y "duration_type")
amazon_df["duration_int"] = amazon_df["duration"].str.split(" ").str.get(0)
amazon_df["duration_type"] = amazon_df["duration"].str.split(" ").str.get(1)
disney_df["duration_int"] = disney_df["duration"].str.split(" ").str.get(0)
disney_df["duration_type"] = disney_df["duration"].str.split(" ").str.get(1)
hulu_df["duration_int"] = hulu_df["duration"].str.split(" ").str.get(0)
hulu_df["duration_type"] = hulu_df["duration"].str.split(" ").str.get(1)
netflix_df["duration_int"] = netflix_df["duration"].str.split(" ").str.get(0)
netflix_df["duration_type"] = netflix_df["duration"].str.split(" ").str.get(1)

In [10]:
#En nueva columna "duration_type" unificar season y seasons en "season":
amazon_df["duration_type"] = amazon_df["duration_type"].replace(["Seasons", "seasons"], "season")
disney_df["duration_type"] = disney_df["duration_type"].replace(["Seasons", "seasons"], "season")
hulu_df["duration_type"] = hulu_df["duration_type"].replace(["Seasons", "seasons"], "season")
netflix_df["duration_type"] = netflix_df["duration_type"].replace(["Seasons", "seasons"], "season")

In [11]:
#Unificamos las 4 plataformas a través de la función “concat” en un dataframe único "plataformas_df", facilitando el desarrollo de las consultas
plataformas_df = pd.concat([amazon_df, disney_df, hulu_df, netflix_df])

In [12]:
#Chequeamos como queda nuestro datasets unificado:
plataformas_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Id,duration_int,duration_type
0,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,113 min,"comedy, drama",a small fishing village must procure a local d...,as1,113,min
1,s2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,110 min,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min
2,s3,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,74 min,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min
3,s4,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,69 min,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min
4,s5,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,45 min,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min


In [13]:
# Cambio tipo de dato "duration_int" a integer
plataformas_df["duration_int"] = plataformas_df["duration_int"].astype("Int64")
# Eliminamos las columnas sobrantes
plataformas_df.drop(['show_id'], axis=1, inplace=True)
plataformas_df.drop(['duration'], axis=1, inplace=True)
plataformas_df.head()

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,Id,duration_int,duration_type
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min


In [14]:
# Verificamos que campos tienen valores nulos

plataformas_df.isna().sum()

type                 0
title                0
director          8259
cast              5321
country          11499
date_added        9554
release_year         0
rating               0
listed_in            0
description          4
Id                   0
duration_int       482
duration_type      482
dtype: int64

In [15]:
# Completamos los valores nulos del campo "duration_int" con 0

plataformas_df['duration_int'].fillna(0, inplace=True)
plataformas_df['duration_int'].isna().sum()

0

In [16]:
# Llenamos los campos vacios con tipo de dato string con la leyenda "sin dato"

plataformas_df['director'].fillna('sin dato', inplace=True)
plataformas_df['cast'].fillna('sin dato', inplace=True)
plataformas_df['country'].fillna('sin dato', inplace=True)
plataformas_df['duration_type'].fillna('sin dato', inplace=True)
plataformas_df
     

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,Id,duration_int,duration_type
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",ns8803,158,min
8803,tv show,zombie dumb,sin dato,sin dato,sin dato,2019-07-01,2018,tv-y7,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",ns8804,2,season
8804,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,"comedies, horror movies",looking to survive in a world taken over by zo...,ns8805,88,min
8805,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,"children & family movies, comedies","dragged from civilian life, a former superhero...",ns8806,88,min


In [17]:
# Verificamos si en duration_type, que aparezcan valores unicos.

plataformas_df_type = plataformas_df['duration_type'].unique()
plataformas_df_type

array(['min', 'season', 'sin dato'], dtype=object)

In [18]:
# Unificamos los valores de seasons

plataformas_df['duration_type'].replace({'season': 'seasons'}, inplace=True)
plataformas_df 

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,Id,duration_int,duration_type
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",ns8803,158,min
8803,tv show,zombie dumb,sin dato,sin dato,sin dato,2019-07-01,2018,tv-y7,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",ns8804,2,seasons
8804,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,"comedies, horror movies",looking to survive in a world taken over by zo...,ns8805,88,min
8805,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,"children & family movies, comedies","dragged from civilian life, a former superhero...",ns8806,88,min


In [19]:

# Verificamos que los valores únicos del campo 'duration_time' sean correctos

plataformas_df_type = plataformas_df['duration_type'].unique()
plataformas_df_type

array(['min', 'seasons', 'sin dato'], dtype=object)

In [20]:
# Verificamos que no hayan quedado valores nulos, salvo los campos con formato fecha (date)

plataformas_df.isna().sum()
     

type                0
title               0
director            0
cast                0
country             0
date_added       9554
release_year        0
rating              0
listed_in           0
description         4
Id                  0
duration_int        0
duration_type       0
dtype: int64

In [21]:

# Levantamos cada csv con los rating de las peliculas
rating1 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/1.csv', sep=',', encoding='latin-1')
rating2 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/2.csv', sep=',', encoding='latin-1')
rating3 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/3.csv', sep=',', encoding='latin-1')
rating4 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/4.csv', sep=',', encoding='latin-1')
rating5 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/5.csv', sep=',', encoding='latin-1')
rating6 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/6.csv', sep=',', encoding='latin-1')
rating7 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/7.csv', sep=',', encoding='latin-1')
rating8 = pd.read_csv('/content/drive/MyDrive/plataforma/rating/8.csv', sep=',', encoding='latin-1') 

print(rating1.shape)
print(rating2.shape)
print(rating3.shape)
print(rating4.shape)
print(rating5.shape)
print(rating6.shape)
print(rating7.shape)
print(rating8.shape)


(1500000, 4)
(1500000, 4)
(1500000, 4)
(1500000, 4)
(1500000, 4)
(1500000, 4)
(524289, 4)
(1500000, 4)


In [22]:
# Unimos los dataset de ratings
data_score = pd.concat([rating1, rating2, rating3, rating4, rating5, rating6, rating7, rating8])
data_score.shape

(11024289, 4)

In [23]:
# Verificamos si tienen valores nulos o faltantes

data_score.isna().sum()

userId       0
rating       0
timestamp    0
movieId      0
dtype: int64

In [24]:
# Creamos un nuevo dataset para agrupar el rating promedio por pelicula

data_score_prom = data_score.groupby(['movieId']).mean()
data_score_prom

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
as1,99178.623506,3.467131,1.173241e+09
as10,92915.883041,3.439571,1.176385e+09
as100,83483.262791,3.609302,1.165458e+09
as1000,90606.197938,3.556701,1.168386e+09
as1001,92349.085288,3.585288,1.179517e+09
...,...,...,...
ns995,86366.176339,3.515625,1.176070e+09
ns996,94741.060729,3.626518,1.159029e+09
ns997,86417.856842,3.530526,1.162038e+09
ns998,94409.502066,3.582645,1.173053e+09


In [25]:
# Unimos ambos dataset, mientras que el id de cada uno coincida

data_plataformas_prom = pd.merge(left=plataformas_df, right=data_score_prom, how='left', left_on='Id', right_on='movieId')
data_plataformas_prom.head()

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating_x,listed_in,description,Id,duration_int,duration_type,userId,rating_y,timestamp
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min,99178.623506,3.467131,1173241000.0
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min,93837.476673,3.548682,1168607000.0
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min,84095.315166,3.5,1176941000.0
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min,88177.389006,3.538055,1167142000.0
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min,83381.165966,3.478992,1167345000.0


In [26]:
# Eliminamos las columnas que nos sobran (userId y timestamp)

data_plataformas_prom.drop(['userId'], axis=1, inplace=True)
data_plataformas_prom.drop(['timestamp'], axis=1, inplace=True)
data_plataformas_prom

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating_x,listed_in,description,Id,duration_int,duration_type,rating_y
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min,3.467131
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min,3.548682
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min,3.500000
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min,3.538055
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min,3.478992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22993,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",ns8803,158,min,3.438998
22994,tv show,zombie dumb,sin dato,sin dato,sin dato,2019-07-01,2018,tv-y7,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",ns8804,2,seasons,3.515947
22995,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,"comedies, horror movies",looking to survive in a world taken over by zo...,ns8805,88,min,3.420945
22996,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,"children & family movies, comedies","dragged from civilian life, a former superhero...",ns8806,88,min,3.588050


In [27]:
# Cambiamos el nombre de la columna rating por score

data_plataformas_prom.rename(columns={'rating_y':'score'}, inplace=True)
data_plataformas_prom

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating_x,listed_in,description,Id,duration_int,duration_type,score
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min,3.467131
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min,3.548682
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min,3.500000
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min,3.538055
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min,3.478992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22993,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",ns8803,158,min,3.438998
22994,tv show,zombie dumb,sin dato,sin dato,sin dato,2019-07-01,2018,tv-y7,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",ns8804,2,seasons,3.515947
22995,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,"comedies, horror movies",looking to survive in a world taken over by zo...,ns8805,88,min,3.420945
22996,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,"children & family movies, comedies","dragged from civilian life, a former superhero...",ns8806,88,min,3.588050


In [28]:
# Redondeamos los campos float a 1 solo decimal

data_plataformas_prom = data_plataformas_prom.round(1)
data_plataformas_prom

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating_x,listed_in,description,Id,duration_int,duration_type,score
0,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,"comedy, drama",a small fishing village must procure a local d...,as1,113,min,3.5
1,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,"drama, international",a metro family decides to fight a cyber crimin...,as2,110,min,3.5
2,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,2021-03-30,2017,g,"action, drama, suspense",after a man discovers his wife is cheating on ...,as3,74,min,3.5
3,movie,pink: staying true,sonia anderson,"interviews with: pink, adele, beyoncã©, britne...",united states,2021-03-30,2014,g,documentary,"pink breaks the mold once again, bringing her ...",as4,69,min,3.5
4,movie,monster maker,giles foster,"harry dean stanton, kieran o'brien, george cos...",united kingdom,2021-03-30,1989,g,"drama, fantasy",teenage matt banting wants to work with a famo...,as5,45,min,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22993,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",ns8803,158,min,3.4
22994,tv show,zombie dumb,sin dato,sin dato,sin dato,2019-07-01,2018,tv-y7,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",ns8804,2,seasons,3.5
22995,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,"comedies, horror movies",looking to survive in a world taken over by zo...,ns8805,88,min,3.4
22996,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,"children & family movies, comedies","dragged from civilian life, a former superhero...",ns8806,88,min,3.6


In [29]:
# Info general del dataset final

data_plataformas_prom.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22998 entries, 0 to 22997
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   type           22998 non-null  object        
 1   title          22998 non-null  object        
 2   director       22998 non-null  object        
 3   cast           22998 non-null  object        
 4   country        22998 non-null  object        
 5   date_added     13444 non-null  datetime64[ns]
 6   release_year   22998 non-null  int64         
 7   rating_x       22998 non-null  object        
 8   listed_in      22998 non-null  object        
 9   description    22994 non-null  object        
 10  Id             22998 non-null  object        
 11  duration_int   22998 non-null  Int64         
 12  duration_type  22998 non-null  object        
 13  score          22998 non-null  float64       
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(10)
memor

In [30]:
# Generamos el csv que trabajamos con los datasets de las plataformas y el score promedio para usarlo en la Api

data_plataformas_prom.to_csv('/content/drive/MyDrive/plataforma/datasets/plataformas_prom.csv', index=False)

In [31]:
# Ahora debemos dejar listo el dataset con los score, sin filtrar datos, con el formato correcto

data_score.head()

Unnamed: 0,userId,rating,timestamp,movieId
0,1,1.0,1425941529,as680
1,1,4.5,1425942435,ns2186
2,1,5.0,1425941523,hs2381
3,1,5.0,1425941546,ns3663
4,1,5.0,1425941556,as9500


In [32]:
# Pasamos a cambiar el formato fecha unix a formato fecha aaaa-mm-dd

data_score['timestamp'] = pd.to_datetime(data_score['timestamp'], unit='s').dt.strftime('%Y-%m-%d')
data_score

Unnamed: 0,userId,rating,timestamp,movieId
0,1,1.0,2015-03-09,as680
1,1,4.5,2015-03-09,ns2186
2,1,5.0,2015-03-09,hs2381
3,1,5.0,2015-03-09,ns3663
4,1,5.0,2015-03-09,as9500
...,...,...,...,...
1499995,124380,4.5,2007-12-04,ns5272
1499996,124380,2.5,2007-12-04,ns5492
1499997,124380,3.5,2007-12-04,hs305
1499998,124380,4.5,2007-12-04,ns7881


In [33]:
# Verificamos si existen registros duplicados

data_score.duplicated().sum()

10466

In [34]:
# Eliminamos los valores duplicados

data_score.drop_duplicates(inplace=True)

In [35]:
# Verificamos que la cant de registros se hayan reducido en 10466

print("Nueva cant registros:", data_score.shape[0])

Nueva cant registros: 11013823


In [36]:
# Verificamos el tipo de dato de cada columna, y el espacio que ocupará el dataset en memoria

data_score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11013823 entries, 0 to 1499999
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   rating     float64
 2   timestamp  object 
 3   movieId    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 420.1+ MB


In [37]:
# Verificamos que existan valores unicos y de un solo tipo de dato en la columna rating

data_score['rating'].unique()

array([1. , 4.5, 5. , 4. , 3.5, 2.5, 0.5, 3. , 2. , 1.5])

In [38]:
# Generamos el csv que trabajamos con los datasets de ratings

data_score.to_csv('/content/drive/MyDrive/plataforma/datasets/score.csv', index=False)