### ETL del archivo crudo `movies_dataset.parquet`, conservando solo las columnas de datos simples (no anidadas). El dataset resultante se llama `movies`.

In [18]:
import pandas as pd
import os

In [19]:
# Location of the raw parquet file in the GitHub repository.
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

# Read the raw dataset directly from the URL using the fastparquet engine.
movies_dataset_df = pd.read_parquet(url, engine="fastparquet")

# Show the first record to get a feel for the available columns.
movies_dataset_df.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [20]:
# Columns that are not carried over into the cleaned `movies` table.
unnecessary = [
    "video",
    "imdb_id",
    "adult",
    "original_title",
    "poster_path",
    "homepage",
]

# Work on an independent copy so the raw frame stays untouched.
movies_df = movies_dataset_df.drop(unnecessary, axis="columns").copy()

# Preview the first record after removing those columns.
movies_df.iloc[0]

belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
id                                                                     862
original_language                                                       en
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date                                                    1995-10-30
revenue                                                          373554033
runtime                                                               81.0
spoken_languages                  [{'iso_639_1': 'en', 'name': 'English'}]
status                   

In [21]:
# Columns whose values are nested structures (dicts / lists of dicts);
# they are removed here so that only flat columns remain.
nested_tables = [
    "belongs_to_collection",
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
]

movies_df = movies_df.drop(nested_tables, axis="columns")

# Preview the first record: only simple columns are left.
movies_df.iloc[0]

budget                                                        30000000
id                                                                 862
original_language                                                   en
overview             Led by Woody, Andy's toys live happily in his ...
popularity                                                   21.946943
release_date                                                1995-10-30
revenue                                                      373554033
runtime                                                           81.0
status                                                        Released
tagline                                                           None
title                                                        Toy Story
vote_average                                                       7.7
vote_count                                                        5415
Name: 0, dtype: object

In [22]:
# Inspect dtypes and non-null counts of the remaining columns before any cleaning.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   budget             45466 non-null  object
 1   id                 45466 non-null  object
 2   original_language  45455 non-null  object
 3   overview           44512 non-null  object
 4   popularity         45461 non-null  object
 5   release_date       45379 non-null  object
 6   revenue            45460 non-null  object
 7   runtime            45203 non-null  object
 8   status             45379 non-null  object
 9   tagline            20412 non-null  object
 10  title              45460 non-null  object
 11  vote_average       45460 non-null  object
 12  vote_count         45460 non-null  object
dtypes: object(13)
memory usage: 4.5+ MB


In [23]:
# Discard every record that has no title.
movies_df = movies_df.dropna(axis="index", subset=["title"])

# Re-check the non-null counts after the removal.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   budget             45460 non-null  object
 1   id                 45460 non-null  object
 2   original_language  45449 non-null  object
 3   overview           44506 non-null  object
 4   popularity         45460 non-null  object
 5   release_date       45376 non-null  object
 6   revenue            45460 non-null  object
 7   runtime            45203 non-null  object
 8   status             45379 non-null  object
 9   tagline            20412 non-null  object
 10  title              45460 non-null  object
 11  vote_average       45460 non-null  object
 12  vote_count         45460 non-null  object
dtypes: object(13)
memory usage: 4.9+ MB


In [24]:
# Cast the numeric and date columns from `object` to their proper dtypes
# in a single pass; columns not listed here keep their current dtype.
movies_df = movies_df.astype(
    {
        'budget': 'int64',
        'popularity': 'float64',
        'release_date': 'datetime64[ns]',
        'revenue': 'int64',
        'runtime': 'float64',
        'vote_average': 'float64',
        'vote_count': 'int64',
    }
)

In [25]:
# Confirm the new dtypes after the casts.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45460 non-null  int64         
 1   id                 45460 non-null  object        
 2   original_language  45449 non-null  object        
 3   overview           44506 non-null  object        
 4   popularity         45460 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45460 non-null  int64         
 7   runtime            45203 non-null  float64       
 8   status             45379 non-null  object        
 9   tagline            20412 non-null  object        
 10  title              45460 non-null  object        
 11  vote_average       45460 non-null  float64       
 12  vote_count         45460 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory usage

In [26]:
# Keep only the records that have a release date.
movies_df = movies_df.dropna(axis="index", subset=['release_date'])

# Re-check the non-null counts after the removal.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory usage

In [27]:
# Replace missing revenue / budget values with 0.
# NOTE(review): both columns were already cast to int64 earlier in the notebook,
# so there should be nothing left to fill here; kept as a safeguard.
movies_df = movies_df.fillna({'revenue': 0, 'budget': 0})

movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory usage

In [28]:
# Derive the release year from the release date and store it as text.
movies_df['release_year'] = movies_df['release_date'].dt.year.astype(str)

# The frame now has one extra column.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45465
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
dtypes: datetime

In [29]:
# Return on investment: revenue / budget, rounded to two decimals.
# The ratio is only meaningful when both figures are strictly positive;
# every other record gets a return of 0.
#
# This is computed with vectorized column operations instead of a row-wise
# `apply`, which avoids running Python code once per row and guarantees the
# resulting column is always float64.
has_financials = movies_df['revenue'].gt(0) & movies_df['budget'].gt(0)

movies_df['return'] = (
    movies_df['revenue']
    # Mask out non-qualifying budgets first so no division by zero ever happens.
    .div(movies_df['budget'].where(has_financials))
    # Rows that did not qualify are NaN at this point; set them to 0.
    .where(has_financials, 0.0)
    .round(2)
)

movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45465
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
 14  return     

In [30]:
# Rows were dropped above, so rebuild a gap-free 0..n-1 index
# and discard the old index values.
movies_df = movies_df.reset_index(drop=True)

movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45376 entries, 0 to 45375
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
 14  return

In [31]:
# Give the identifier column an explicit name.
movies_df = movies_df.rename(columns={'id': 'movie_id'})

# Preview the first record with the final set of columns.
movies_df.iloc[0]

budget                                                        30000000
movie_id                                                           862
original_language                                                   en
overview             Led by Woody, Andy's toys live happily in his ...
popularity                                                   21.946943
release_date                                       1995-10-30 00:00:00
revenue                                                      373554033
runtime                                                           81.0
status                                                        Released
tagline                                                           None
title                                                        Toy Story
vote_average                                                       7.7
vote_count                                                        5415
release_year                                                      1995
return

In [32]:
# Directory the kernel is currently running in; the output path below is built
# relative to it.
# NOTE(review): assumes the kernel was started from this notebook's folder — confirm.
current_dir = os.getcwd()

current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset'

In [33]:
# Climb three directory levels from the working directory, one step at a time,
# to reach the project root.
etl_dir = os.path.dirname(current_dir)
notebooks_dir = os.path.dirname(etl_dir)
base_dir = os.path.dirname(notebooks_dir)

base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [34]:
# Destination of the cleaned dataset: <base_dir>/data/ETL_data/movies_dataset/movies.parquet
file_path = os.path.join(base_dir, 'data', 'ETL_data', 'movies_dataset', 'movies.parquet')

file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\movies.parquet'

In [35]:
# Make sure the destination folder exists: writing the parquet file fails
# when a parent directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the cleaned table.
movies_df.to_parquet(file_path)