### ETL de la columna `genres` del archivo en crudo `movies_dataset.parquet`

In [45]:
import pandas as pd
import ast
import os

In [46]:
# Location of the raw movies dataset, served directly from the GitHub repository.
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

# Read the whole parquet file into a DataFrame using the fastparquet engine.
movies_dataset_df = pd.read_parquet(path=url, engine="fastparquet")

# Show the first record to get a feel for the available columns.
movies_dataset_df.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [47]:
# Keep only the genre payload and the movie key, on an independent copy so the
# raw frame is never modified by later cells.
genres_df = movies_dataset_df.loc[:, ['genres', 'id']].copy()

genres_df.iloc[0]

genres    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
id                                                      862
Name: 0, dtype: object

In [48]:
# Name the key `movie_id` so it cannot be mistaken for the genre `id` values
# nested inside the `genres` column.
genres_df = genres_df.rename(columns={'id': 'movie_id'})

genres_df.iloc[0]

genres      [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
movie_id                                                  862
Name: 0, dtype: object

In [49]:
# Look at one raw value: the genres are stored as the text of a list of dicts,
# not as real Python objects, so they must be parsed before use.
genres_df['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [50]:
# Check row count, non-null counts and dtypes before parsing the `genres` column.
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genres    45466 non-null  object
 1   movie_id  45466 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB


In [51]:
# Flatten the data into one record per (movie, genre) pair.
# Each `genres` value is the text of a list of dicts, so it is parsed with
# ast.literal_eval (which only evaluates literals, never arbitrary code) and
# every resulting genre dict is tagged with the id of the movie it came from.
# The two columns are walked in lockstep with zip() instead of iterrows():
# iterrows() builds a Series for every row, which is pure overhead here.
genres_list_dict = [
    {**genre, 'movie_id': movie_id}
    for raw_genres, movie_id in zip(genres_df['genres'], genres_df['movie_id'])
    for genre in ast.literal_eval(raw_genres)
]

In [52]:
# Build a table from the flattened records: one row per movie/genre pair.
modified_genres_df = pd.DataFrame(data=genres_list_dict)

modified_genres_df.iloc[0]

id                 16
name        Animation
movie_id          862
Name: 0, dtype: object

In [53]:
# Give the genre columns self-describing names so they read clearly next to
# `movie_id`.
genre_column_names = {'id': 'genre_id', 'name': 'genre_name'}
modified_genres_df = modified_genres_df.rename(columns=genre_column_names)

modified_genres_df.iloc[0]

genre_id             16
genre_name    Animation
movie_id            862
Name: 0, dtype: object

In [54]:
# Inspect the flattened table: row count, non-null counts and inferred dtypes.
modified_genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91106 entries, 0 to 91105
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_id    91106 non-null  int64 
 1   genre_name  91106 non-null  object
 2   movie_id    91106 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.1+ MB


In [55]:
# Store the genre identifier as text rather than as an integer.
genre_id_as_text = modified_genres_df['genre_id'].astype(str)
modified_genres_df['genre_id'] = genre_id_as_text

# Verify the dtype change.
modified_genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91106 entries, 0 to 91105
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_id    91106 non-null  object
 1   genre_name  91106 non-null  object
 2   movie_id    91106 non-null  object
dtypes: object(3)
memory usage: 2.1+ MB


In [56]:
# Working directory of the running kernel; the output path is derived from it
# in the following cells.
# NOTE(review): this relies on the kernel having been started in the notebook's
# own folder — confirm before running from elsewhere.
current_dir = os.getcwd()

current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset'

In [57]:
# Climb three directory levels from the working directory to reach the project root.
# NOTE(review): the intermediate names mirror the notebooks/ETL/movies_dataset
# layout seen when this notebook was run — confirm if the notebook is moved.
etl_dir = os.path.dirname(current_dir)
notebooks_dir = os.path.dirname(etl_dir)
base_dir = os.path.dirname(notebooks_dir)

base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [58]:
# Destination of the processed genres table inside the project's data folder.
output_dir = os.path.join(base_dir, 'data', 'ETL_data', 'movies_dataset')
file_path = os.path.join(output_dir, 'genres.parquet')

file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\genres.parquet'

In [59]:
# Persist the flattened genres table as parquet.
# The write fails if the destination folder is missing (for example on a fresh
# clone), so create it first; exist_ok=True makes re-running the cell harmless.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
modified_genres_df.to_parquet(file_path)