### ETL de la columna `production_countries` del archivo en crudo `movies_dataset.parquet`

In [21]:
import pandas as pd
import ast
import os

In [22]:
# Location of the raw movies dataset (a Parquet file hosted on GitHub).
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

# Read the remote file into a DataFrame using the fastparquet engine.
movies_dataset_df = pd.read_parquet(path=url, engine="fastparquet")

# Preview the first record to see which columns are available.
movies_dataset_df.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [23]:
# Keep only the country payload and the movie identifier, on an independent
# copy so later edits never touch `movies_dataset_df`.
production_countries_df = movies_dataset_df.loc[:, ['production_countries', 'id']].copy()

# Preview the first record of the narrowed frame.
production_countries_df.iloc[0]

production_countries    [{'iso_3166_1': 'US', 'name': 'United States o...
id                                                                    862
Name: 0, dtype: object

In [24]:
# Rename `id` to `movie_id`; the name is rebound to the renamed frame
# rather than mutating the existing object in place.
production_countries_df = production_countries_df.rename(columns={'id': 'movie_id'})

# Confirm the new column name on the first record.
production_countries_df.iloc[0]

production_countries    [{'iso_3166_1': 'US', 'name': 'United States o...
movie_id                                                              862
Name: 0, dtype: object

In [25]:
# Look at the raw value stored in the first row: it is a string holding the
# textual form of a list of dicts, not an actual list.
production_countries_df['production_countries'].iloc[0]

"[{'iso_3166_1': 'US', 'name': 'United States of America'}]"

In [26]:
# Summarise dtypes and non-null counts before any cleaning is applied.
production_countries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_countries  45463 non-null  object
 1   movie_id              45466 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB


In [27]:
# Discard rows whose `production_countries` value is missing; the name is
# rebound to the filtered frame instead of mutating in place.
production_countries_df = production_countries_df.dropna(subset=['production_countries'])

# Re-check the non-null counts after the drop.
production_countries_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45463 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_countries  45463 non-null  object
 1   movie_id              45463 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [28]:
# First attempt at flattening: parse each row's string with `ast.literal_eval`
# and emit one record per country dict, tagged with the owning `movie_id`.
# Any failure is caught and printed so the error message can be inspected;
# when that happens `production_countries_list_dict` is NOT assigned.
try:
    production_countries_list_dict = [
        {**country, 'movie_id': record.movie_id}
        for record in production_countries_df.itertuples(index=False)
        for country in ast.literal_eval(record.production_countries)
    ]
except Exception as ex:
    print("Ha habido una excepción", ex)

Ha habido una excepción 'float' object is not iterable


In [29]:
def is_float(object):
    """Return whether ``float()`` accepts ``object``.

    Parameters
    ----------
    object : Any
        Value to probe. The parameter keeps its original name (it shadows the
        ``object`` builtin only inside this function) so that existing
        keyword callers keep working.

    Returns
    -------
    bool
        ``True`` when ``float(object)`` succeeds; ``False`` when the
        conversion is rejected, whatever the reason.
    """
    try:
        float(object)
    except (TypeError, ValueError, OverflowError):
        # ValueError: text that is not a number.
        # TypeError: unsupported types such as None, list or dict.
        # OverflowError: integers too large to be represented as a float.
        return False
    return True


# Keep only the rows whose `production_countries` value is NOT accepted by
# `float()`; such float-like values are not lists of country dicts.
production_countries_df = production_countries_df.loc[
    ~production_countries_df['production_countries'].apply(is_float)
]

# Re-check the row count after the filter.
production_countries_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_countries  45460 non-null  object
 1   movie_id              45460 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [30]:
# Flatten the cleaned frame into one record per (movie, country) pair:
# each row's string is parsed with `ast.literal_eval` and every resulting
# dict is extended with the `movie_id` of the row it came from.
# The two columns are zipped directly, which avoids building a Series per
# row as `iterrows()` does.
try:
    production_countries_list_dict = [
        {**country, 'movie_id': movie_id}
        for movie_id, raw_countries in zip(
            production_countries_df['movie_id'],
            production_countries_df['production_countries'],
        )
        for country in ast.literal_eval(raw_countries)
    ]
except Exception as ex:
    print("Ha habido una excepción", ex)
    # Re-raise after reporting: swallowing the error would leave
    # `production_countries_list_dict` undefined (or stale from a previous
    # run), and the following cells would then fail or use old data.
    raise

In [31]:
# Materialise the flat list of records as a DataFrame (one row per record).
modified_production_countries_df = pd.DataFrame(data=production_countries_list_dict)

# Preview the first flattened record.
modified_production_countries_df.iloc[0]

iso_3166_1                          US
name          United States of America
movie_id                           862
Name: 0, dtype: object

In [32]:
# Rename `name` to `country_name`; the name is rebound to the renamed frame
# rather than mutating the existing object in place.
modified_production_countries_df = modified_production_countries_df.rename(
    columns={'name': 'country_name'}
)

# Confirm the new column name on the first record.
modified_production_countries_df.iloc[0]

iso_3166_1                            US
country_name    United States of America
movie_id                             862
Name: 0, dtype: object

In [33]:
# Summarise dtypes and non-null counts of the flattened result.
modified_production_countries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49423 entries, 0 to 49422
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   iso_3166_1    49423 non-null  object
 1   country_name  49423 non-null  object
 2   movie_id      49423 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [34]:
# Working directory of the running kernel; the output path below is derived
# from it, so the result depends on where the notebook is launched from.
current_dir = os.getcwd()

# Display the directory for reference.
current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset'

In [35]:
# Walk three directory levels up from the working directory; the `..`
# components are collapsed lexically, without touching the filesystem.
base_dir = os.path.normpath(
    os.path.join(current_dir, os.pardir, os.pardir, os.pardir)
)

# Display the resolved base directory.
base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [36]:
# Destination of the cleaned table: the target folder under `base_dir`,
# followed by the output file name.
file_path = os.path.join(
    os.path.join(base_dir, 'data', 'ETL_data', 'movies_dataset'),
    'production_countries.parquet',
)

# Display the full output path.
file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\production_countries.parquet'

In [37]:
# Persist the flattened table as a Parquet file at `file_path`.
modified_production_countries_df.to_parquet(path=file_path)