### ETL de la columna `spoken_languages` del archivo en crudo `movies_dataset.parquet`

In [1]:
import pandas as pd
import ast
import os

In [2]:
# Raw dataset hosted on GitHub; `?raw=true` makes the link serve the file itself.
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

# Load the full raw table with the fastparquet engine.
movies_dataset = pd.read_parquet(path=url, engine="fastparquet")

# Show the first record to get a feel for the available columns.
movies_dataset.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [3]:
# Keep only the nested column and the movie identifier, as an independent
# copy so later edits never touch `movies_dataset`.
spoken_languages = movies_dataset.loc[:, ['spoken_languages', 'id']].copy()

spoken_languages.iloc[0]

spoken_languages    [{'iso_639_1': 'en', 'name': 'English'}]
id                                                       862
Name: 0, dtype: object

In [4]:
# Give the identifier an explicit name so it stays unambiguous once the
# language records are flattened next to it.
spoken_languages = spoken_languages.rename(columns={'id': 'movie_id'})

spoken_languages.iloc[0]

spoken_languages    [{'iso_639_1': 'en', 'name': 'English'}]
movie_id                                                 862
Name: 0, dtype: object

In [5]:
# Peek at one raw cell: the value is a string holding a Python-literal list,
# not an actual list, so it has to be parsed before use.
spoken_languages['spoken_languages'].iloc[0]

"[{'iso_639_1': 'en', 'name': 'English'}]"

In [6]:
# Inspect dtypes and non-null counts to spot missing `spoken_languages` values.
spoken_languages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   spoken_languages  45460 non-null  object
 1   movie_id          45466 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB


In [7]:
# Rows without a `spoken_languages` value cannot be parsed, so discard them.
spoken_languages = spoken_languages.dropna(subset=['spoken_languages'])

spoken_languages.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   spoken_languages  45460 non-null  object
 1   movie_id          45460 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [8]:
# Flatten the nested data into one record per (movie, language) pair.
# Each `spoken_languages` cell is a string containing a Python-literal list of
# dicts; `ast.literal_eval` parses it safely (no arbitrary code execution).
# The two columns are zipped directly instead of using `iterrows()`, which
# would build a throwaway Series for every row just to read two fields.
list_dict = [
    {**language, 'movie_id': movie_id}
    for movie_id, raw_languages in zip(spoken_languages['movie_id'],
                                       spoken_languages['spoken_languages'])
    for language in ast.literal_eval(raw_languages)
]

In [9]:
# Build the flattened table from the list of per-language records.
# NOTE: this rebinds `spoken_languages` from the raw two-column frame to the
# exploded frame.
spoken_languages = pd.DataFrame(data=list_dict)

spoken_languages.iloc[0]

iso_639_1         en
name         English
movie_id         862
Name: 0, dtype: object

In [10]:
# `name` is too generic once this table is joined with others; make it explicit.
spoken_languages = spoken_languages.rename(columns={'name': 'spoken_language'})

spoken_languages.iloc[0]

iso_639_1               en
spoken_language    English
movie_id               862
Name: 0, dtype: object

In [11]:
# Summarise the flattened frame: row count, dtypes and non-null counts.
spoken_languages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53300 entries, 0 to 53299
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   iso_639_1        53300 non-null  object
 1   spoken_language  53300 non-null  object
 2   movie_id         53300 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [12]:
# Kernel's current working directory. The cells below derive the project root
# from it, so they assume the kernel was started in this notebook's folder.
current_dir = os.getcwd()

current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset\\nested_data'

In [13]:
# Walk four levels up from the working directory to reach the project root.
base_dir = current_dir
for _step in range(4):
    base_dir = os.path.dirname(base_dir)

base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [14]:
# Destination of the flattened table inside the project's data tree:
# <project root>/<output folder>/<file name>.
file_path = os.path.join(
    base_dir,
    os.path.join('data', 'ETL_data', 'movies_dataset', 'previous_nested_data'),
    'spoken_languages.parquet',
)

file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\previous_nested_data\\spoken_languages.parquet'

In [15]:
# Create the destination folder first: writing to a local path whose parent
# directory is missing fails, which would break the notebook on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the flattened table (one row per movie/language pair).
spoken_languages.to_parquet(file_path)