In [33]:
import pandas as pd
import ast
import os

In [34]:
# Remote parquet snapshot of the movies dataset (GitHub raw-download link).
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"
# Read it straight from the URL, explicitly selecting the fastparquet engine.
movies_dataset_df = pd.read_parquet(path=url, engine="fastparquet")

In [35]:
# Peek at the first record to see which columns exist and how their values look.
movies_dataset_df.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [36]:
# Keep only the raw companies column and the movie identifier.
production_companies_df = movies_dataset_df.loc[:, ['production_companies', 'id']]

In [37]:
# Rename the movie's `id` column to `movie_id`.
production_companies_df = production_companies_df.rename({'id': 'movie_id'}, axis='columns')

In [38]:
# Show the raw value stored for the first movie; the recorded output is a string
# containing a Python-literal list of dicts, not an already-parsed list.
production_companies_df.iloc[0]['production_companies']

"[{'name': 'Pixar Animation Studios', 'id': 3}]"

In [39]:
# Check row count, non-null counts and dtypes before any cleaning.
production_companies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_companies  45463 non-null  object
 1   movie_id              45466 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB


In [40]:
# Discard the movies whose production_companies value is missing.
production_companies_df = production_companies_df[production_companies_df['production_companies'].notna()]

In [41]:
# Re-check the counts after dropping rows with a missing production_companies value.
production_companies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45463 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_companies  45463 non-null  object
 1   movie_id              45463 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [42]:
# Probe: try to expand every stringified list of companies into one record per
# (company, movie) pair, to find out whether all rows can actually be parsed.
try:
    production_companies_list_dict = [
        {**company, 'movie_id': movie_id}
        # Walk the two columns directly; iterrows() would build a Series per row.
        for raw_companies, movie_id in zip(
            production_companies_df['production_companies'],
            production_companies_df['movie_id'],
        )
        for company in ast.literal_eval(raw_companies)
    ]
# Handle only the failures that parsing/expanding can produce: literal_eval
# raises ValueError or SyntaxError on malformed input, and iterating or unpacking
# a parsed value that is not a list of dicts raises TypeError. Anything else
# (for example a wrong column name) propagates with its full traceback instead
# of being reduced to a type name.
except (TypeError, ValueError, SyntaxError) as ex:
    print("Ha habido una excepción", type(ex))

Ha habido una excepción <class 'TypeError'>


In [43]:
# Remove the rows whose raw value is the text 'True' or 'False'.
production_companies_df = production_companies_df.loc[
    lambda frame: ~frame['production_companies'].isin(['True', 'False'])
]

In [44]:
# Re-check the row count after removing the 'True'/'False' rows.
production_companies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   production_companies  45460 non-null  object
 1   movie_id              45460 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [45]:
# Expand each movie's stringified list of companies into one record per
# (company, movie) pair: the company's own keys plus the owning movie_id.
# The two columns are zipped directly because iterrows() would construct a
# Series for every row just to read these two fields.
production_companies_list_dict = [
    {**company, 'movie_id': movie_id}
    for raw_companies, movie_id in zip(
        production_companies_df['production_companies'],
        production_companies_df['movie_id'],
    )
    for company in ast.literal_eval(raw_companies)
]

In [46]:
# Turn the list of per-company records into a DataFrame.
production_companies_modified_df = pd.DataFrame(data=production_companies_list_dict)

In [47]:
# Inspect the expanded frame: columns, non-null counts and dtypes.
production_companies_modified_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70545 entries, 0 to 70544
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      70545 non-null  object
 1   id        70545 non-null  int64 
 2   movie_id  70545 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [48]:
# Store the company `id` as text rather than as an integer.
production_companies_modified_df = production_companies_modified_df.astype({'id': 'str'})

In [49]:
# Confirm the dtypes after casting `id` to string.
production_companies_modified_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70545 entries, 0 to 70544
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      70545 non-null  object
 1   id        70545 non-null  object
 2   movie_id  70545 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


In [50]:
# Rename the company's `name` column to `company_name`.
production_companies_modified_df = production_companies_modified_df.rename({'name': 'company_name'}, axis='columns')

In [51]:
# The output location is resolved relative to the working directory.
current_dir = os.getcwd()
# Go two directory levels up from the working directory.
base_dir = os.path.split(os.path.split(current_dir)[0])[0]
# <base_dir>/data/movies_dataset/production_companies_ETL.parquet
file_path = os.path.join(
    base_dir,
    'data',
    'movies_dataset',
    'production_companies_ETL.parquet',
)

In [52]:
# Persist the cleaned company/movie table as a parquet file.
production_companies_modified_df.to_parquet(path=file_path)