### ETL del archivo en crudo `movies_dataset.parquet` de la columna `belongs_to_collection`

In [1]:
import pandas as pd
import ast
import os

In [2]:
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

movies_dataset = pd.read_parquet(url, 
                                 engine="fastparquet")

movies_dataset.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [3]:
belongs_to_collection = movies_dataset[['belongs_to_collection', 
                                        'id']].copy()

belongs_to_collection.iloc[0]

belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
id                                                                     862
Name: 0, dtype: object

In [4]:
belongs_to_collection.rename(columns={'id': 'movie_id'}, 
                             inplace=True)

belongs_to_collection.iloc[0]

belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
movie_id                                                               862
Name: 0, dtype: object

In [5]:
belongs_to_collection.iloc[0]['belongs_to_collection']

"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}"

In [6]:
belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   belongs_to_collection  4494 non-null   object
 1   movie_id               45466 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB


In [7]:
belongs_to_collection.dropna(subset='belongs_to_collection', 
                             inplace=True)

belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4494 entries, 0 to 45382
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   belongs_to_collection  4494 non-null   object
 1   movie_id               4494 non-null   object
dtypes: object(2)
memory usage: 105.3+ KB


In [8]:
dict_series = belongs_to_collection['belongs_to_collection'].apply(ast.literal_eval)

dict_series.iloc[0]

{'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}

In [9]:
only_belongs_to_collection = dict_series.apply(pd.Series)

only_belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4494 entries, 0 to 45382
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4491 non-null   float64
 1   name           4491 non-null   object 
 2   poster_path    3948 non-null   object 
 3   backdrop_path  3263 non-null   object 
 4   0              3 non-null      float64
dtypes: float64(2), object(3)
memory usage: 210.7+ KB


In [10]:
belongs_to_collection = pd.concat([only_belongs_to_collection, 
                                   belongs_to_collection['movie_id']], 
                                   axis=1)

belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4494 entries, 0 to 45382
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4491 non-null   float64
 1   name           4491 non-null   object 
 2   poster_path    3948 non-null   object 
 3   backdrop_path  3263 non-null   object 
 4   0              3 non-null      float64
 5   movie_id       4494 non-null   object 
dtypes: float64(2), object(4)
memory usage: 245.8+ KB


In [11]:
belongs_to_collection.drop(columns=[0, 'poster_path'], 
                           inplace=True)

belongs_to_collection.iloc[0]

id                                        10194.0
name                         Toy Story Collection
backdrop_path    /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
movie_id                                      862
Name: 0, dtype: object

In [12]:
belongs_to_collection.dropna(subset='id', 
                             inplace=True)

belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4491 entries, 0 to 45382
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4491 non-null   float64
 1   name           4491 non-null   object 
 2   backdrop_path  3263 non-null   object 
 3   movie_id       4491 non-null   object 
dtypes: float64(1), object(3)
memory usage: 175.4+ KB


In [13]:
belongs_to_collection.reset_index(drop=True, 
                                  inplace=True)

belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4491 entries, 0 to 4490
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4491 non-null   float64
 1   name           4491 non-null   object 
 2   backdrop_path  3263 non-null   object 
 3   movie_id       4491 non-null   object 
dtypes: float64(1), object(3)
memory usage: 140.5+ KB


In [14]:
belongs_to_collection['id'] = belongs_to_collection['id'].astype('str')

belongs_to_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4491 entries, 0 to 4490
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             4491 non-null   object
 1   name           4491 non-null   object
 2   backdrop_path  3263 non-null   object
 3   movie_id       4491 non-null   object
dtypes: object(4)
memory usage: 140.5+ KB


In [15]:
belongs_to_collection.rename(columns={'id':'collection_id', 
                                      'name':'collection_name'}, 
                                      inplace=True)

belongs_to_collection.iloc[0]

collection_id                               10194.0
collection_name                Toy Story Collection
backdrop_path      /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
movie_id                                        862
Name: 0, dtype: object

In [16]:
current_dir = os.getcwd()

current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset\\nested_data'

In [17]:
base_dir = os.path.dirname(
    os.path.dirname(
        os.path.dirname(
            os.path.dirname(current_dir))))

base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [18]:
file_path = os.path.join(base_dir, 
                         'data', 
                         'ETL_data', 
                         'movies_dataset', 
                         'previous_nested_data', 
                         'belongs_to_collection.parquet')

file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\previous_nested_data\\belongs_to_collection.parquet'

In [19]:
belongs_to_collection.to_parquet(file_path)