# ETL PI MLOps

In [269]:
# Se importan las librerias a usar 
import pandas as pd
import numpy as np

In [270]:
# Se crea el dataset, a partir de los csv
df_peliculas = pd.read_csv('movies_dataset.csv')
df_creditos = pd.read_csv('credits.csv')

  df_peliculas = pd.read_csv('movies_dataset.csv')


# Transformación

1.- Algunos campos, como belongs_to_collection, production_companies y otros (ver diccionario de datos) están anidados, esto es o bien tienen un diccionario o una lista como valores en cada fila, ¡deberán desanidarlos para poder y unirlos al dataset de nuevo hacer alguna de las consultas de la API! O bien buscar la manera de acceder a esos datos sin desanidarlos.

In [271]:
# Desanidamos belongs to collection
# Se rellenar valores vacios para poder trabajar mejor
df_peliculas['belongs_to_collection'] = df_peliculas['belongs_to_collection'].fillna('{}')
df_peliculas.reset_index(drop=True, inplace=True) 
# Se estructura y organizan los datos desanidados
df_collection = pd.json_normalize(df_peliculas['belongs_to_collection'].map(eval))

# Renombramos la columna 'name' como 'belongs_to_collection'
df_collection.rename(columns={'name': 'belongs_to_collection'}, inplace=True)

# Obtenemos solo la columna belongs_to_collection
df_collection = df_collection['belongs_to_collection'].copy()

# Visualimos el nuevo DataFrame
df_collection

0                  Toy Story Collection
1                                   NaN
2             Grumpy Old Men Collection
3                                   NaN
4        Father of the Bride Collection
                      ...              
45461                               NaN
45462                               NaN
45463                               NaN
45464                               NaN
45465                               NaN
Name: belongs_to_collection, Length: 45466, dtype: object

In [272]:
# Desanidamos genres
# Se estructura y organizan los datos desanidados
df_genres = pd.json_normalize(df_peliculas['genres'].map(eval))

# Se obtiene la columna de genres a partir de los datos de name
df_genres['genres'] = df_genres.apply(lambda x: ','.join([genre['name'] for genre in x if pd.notna(genre)]), axis=1)

# Se crea un nuevo DataFrame solo con la columna de genres
df_genres = df_genres[['genres']].copy()

# Visualizamos el nuevo DataFrame
df_genres

Unnamed: 0,genres
0,"Animation,Comedy,Family"
1,"Adventure,Fantasy,Family"
2,"Romance,Comedy"
3,"Comedy,Drama,Romance"
4,Comedy
...,...
45461,"Drama,Family"
45462,Drama
45463,"Action,Drama,Thriller"
45464,


In [273]:
# Desanidamos production companies
# Se estructura y organizan los datos desanidados
df_companies = pd.json_normalize(df_peliculas['production_companies'].map(eval))

# Se obtiene la columna de production countries
df_companies['production_companies'] = df_companies.apply(lambda x: ','.join([country['name'] for country in x if pd.notna(country)]), axis=1)

# Se crea un nuevo DataFrame solo con la columna
df_companies = df_companies[['production_companies']].copy()

# Visualizamos el nuevo DataFrame
df_companies

TypeError: eval() arg 1 must be a string, bytes or code object

In [274]:
# Desanidamos production countries
# Se estructura y organizan los datos desanidados
df_countries = pd.json_normalize(df_peliculas['production_countries'].map(eval))

# Se obtiene la columna de production countries
df_countries['production_countries'] = df_countries.apply(lambda x: ','.join([country['name'] for country in x if pd.notna(country)]), axis=1)

# Se crea un nuevo DataFrame solo con la columna
df_countries = df_countries[['production_countries']].copy()

# Visualizamos el nuevo DataFrame
df_countries

TypeError: eval() arg 1 must be a string, bytes or code object

In [275]:
# Desanidamos spoken languages
# Se estructura y organizan los datos desanidados
df_languages = pd.json_normalize(df_peliculas['spoken_languages'].map(eval))

# Se obtiene la columna de spoken languages
df_languages['spoken_languages'] = df_languages.apply(lambda x: ','.join([country['name'] for country in x if pd.notna(country)]), axis=1)

# Se crea un nuevo DataFrame solo con la columna
df_languages = df_languages[['spoken_languages']].copy()

# Visualizamos el nuevo DataFrame
df_languages

TypeError: eval() arg 1 must be a string, bytes or code object

In [276]:
# Eliminamos las columnas anidadas, para posterior sustituirlas por las mismas columnas ya desanidadas
df_peliculas = df_peliculas.drop(columns=['belongs_to_collection','spoken_languages', 'production_countries', 'genres', 'production_companies'])
df_completo = pd.concat([df_peliculas, df_collection, df_genres, df_companies, df_countries, df_languages], axis=1)
df_completo

Unnamed: 0,adult,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,tagline,title,video,vote_average,vote_count,belongs_to_collection,genres,production_companies,production_countries,spoken_languages
0,False,30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,,Toy Story,False,7.7,5415.0,Toy Story Collection,"Animation,Comedy,Family",Pixar Animation Studios,United States of America,English
1,False,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,,"Adventure,Fantasy,Family","TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,"English,Français"
2,False,0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Grumpy Old Men Collection,"Romance,Comedy","Warner Bros.,Lancaster Gate",United States of America,English
3,False,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,,"Comedy,Drama,Romance",Twentieth Century Fox Film Corporation,United States of America,English
4,False,0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Father of the Bride Collection,Comedy,"Sandollar Productions,Touchstone Pictures",United States of America,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,0,http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,...,Rising and falling between a man and woman,Subdue,False,4.0,1.0,,"Drama,Family",,Iran,فارسی
45462,False,0,,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,...,,Century of Birthing,False,9.0,3.0,,Drama,Sine Olivia,Philippines,
45463,False,0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,...,A deadly game of wits.,Betrayal,False,3.8,6.0,,"Action,Drama,Thriller",American World Pictures,United States of America,English
45464,False,0,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,...,,Satan Triumphant,False,0.0,0.0,,,Yermoliev,Russia,


2.- Los valores nulos de los campos revenue, budget deben ser rellenados por el número 0.

In [277]:
# Se rellenan los valores nulos
df_completo['budget'] = df_completo['budget'].fillna(0)
df_completo['revenue'] = df_completo['revenue'].fillna(0)

3.- Los valores nulos del campo release date deben eliminarse.

In [278]:
# Contamos los datos nulos en release date
df_completo['release_date'].isnull().sum()

87

In [279]:
# Eliminamos los datos nulos y contamos nuevamente para asegurarnos que fueron eliminados
df_completo = df_completo.dropna(subset=['release_date'])
df_completo['release_date'].isnull().sum()

0

4.- De haber fechas, deberán tener el formato AAAA-mm-dd, además deberán crear la columna release_year donde extraerán el año de la fecha de estreno.

In [287]:
# Se convierte la columna de fechas a tipo datetime
df_completo['release_date'] = pd.to_datetime(df_completo['release_date'], errors='coerce')

# Se extrae el año de la fecha de lanzamiento
df_completo['release_year'] = df_completo['release_date'].dt.year

# Verificamos que se haya agregado correctamente
df_completo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completo['release_date'] = pd.to_datetime(df_completo['release_date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completo['release_year'] = df_completo['release_date'].dt.year


Unnamed: 0,adult,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,video,vote_average,vote_count,belongs_to_collection,genres,production_companies,production_countries,spoken_languages,release_year,return
0,False,30000000.0,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,False,7.7,5415.0,Toy Story Collection,"Animation,Comedy,Family",Pixar Animation Studios,United States of America,English,1995.0,12.45
1,False,65000000.0,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,False,6.9,2413.0,,"Adventure,Fantasy,Family","TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,"English,Français",1995.0,4.04
2,False,0.0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,False,6.5,92.0,Grumpy Old Men Collection,"Romance,Comedy","Warner Bros.,Lancaster Gate",United States of America,English,1995.0,0.00
3,False,16000000.0,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,False,6.1,34.0,,"Comedy,Drama,Romance",Twentieth Century Fox Film Corporation,United States of America,English,1995.0,5.09
4,False,0.0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,False,5.7,173.0,Father of the Bride Collection,Comedy,"Sandollar Productions,Touchstone Pictures",United States of America,English,1995.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,False,0.0,,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",5.683753,/fQC46NglNiEMZBv5XHoyLuOWoN5.jpg,...,False,5.7,26.0,,"Drama,Action,Romance","Westdeutscher Rundfunk (WDR),Working Title Fil...","Canada,Germany,United Kingdom,United States of...",English,1991.0,0.00
45462,False,0.0,,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,...,False,9.0,3.0,,Drama,Sine Olivia,Philippines,,2011.0,0.00
45463,False,0.0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,...,False,3.8,6.0,,"Action,Drama,Thriller",American World Pictures,United States of America,English,2003.0,0.00
45464,False,0.0,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,...,False,0.0,0.0,,,Yermoliev,Russia,,2017.0,0.00


5.- Crear la columna con el retorno de inversión, llamada return con los campos revenue y budget, dividiendo estas dos últimas revenue / budget, cuando no hay datos disponibles para calcularlo, deberá tomar el valor 0.

In [288]:
# Se convierten las columnas 'revenue' y 'budget' a tipo numérico
df_completo['revenue'] = pd.to_numeric(df_completo['revenue'], errors='coerce')
df_completo['budget'] = pd.to_numeric(df_completo['budget'], errors='coerce')

# Se crea la columna 'return' y se calcula el retorno de inversión
df_completo['return'] = df_completo['revenue'].div(df_completo['budget'], fill_value=0)

# Se establece 0 en los casos donde budget sea 0 o haya valores faltantes en revenue o budget
faltantes = (df_completo['budget'] == 0) | (df_completo[['revenue', 'budget']].isnull().any(axis=1))
df_completo.loc[faltantes, 'return'] = 0

# Se visualiza para asegurarse que los datos estan correctos
df_completo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completo['revenue'] = pd.to_numeric(df_completo['revenue'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completo['budget'] = pd.to_numeric(df_completo['budget'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completo['return'] = df_completo['revenue'].

Unnamed: 0,adult,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,video,vote_average,vote_count,belongs_to_collection,genres,production_companies,production_countries,spoken_languages,release_year,return
0,False,30000000.0,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,False,7.7,5415.0,Toy Story Collection,"Animation,Comedy,Family",Pixar Animation Studios,United States of America,English,1995.0,12.451801
1,False,65000000.0,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,False,6.9,2413.0,,"Adventure,Fantasy,Family","TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,"English,Français",1995.0,4.043035
2,False,0.0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,False,6.5,92.0,Grumpy Old Men Collection,"Romance,Comedy","Warner Bros.,Lancaster Gate",United States of America,English,1995.0,0.000000
3,False,16000000.0,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,False,6.1,34.0,,"Comedy,Drama,Romance",Twentieth Century Fox Film Corporation,United States of America,English,1995.0,5.090760
4,False,0.0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,False,5.7,173.0,Father of the Bride Collection,Comedy,"Sandollar Productions,Touchstone Pictures",United States of America,English,1995.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,False,0.0,,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",5.683753,/fQC46NglNiEMZBv5XHoyLuOWoN5.jpg,...,False,5.7,26.0,,"Drama,Action,Romance","Westdeutscher Rundfunk (WDR),Working Title Fil...","Canada,Germany,United Kingdom,United States of...",English,1991.0,0.000000
45462,False,0.0,,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,...,False,9.0,3.0,,Drama,Sine Olivia,Philippines,,2011.0,0.000000
45463,False,0.0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,...,False,3.8,6.0,,"Action,Drama,Thriller",American World Pictures,United States of America,English,2003.0,0.000000
45464,False,0.0,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,...,False,0.0,0.0,,,Yermoliev,Russia,,2017.0,0.000000


6.- Eliminar las columnas que no serán utilizadas, video,imdb_id,adult,original_title,poster_path y homepage.