### ETL del archivo en crudo `movies_dataset.parquet`, conservando solo las columnas de datos simples. El nuevo dataset se va a llamar `movies` (en este notebook se guarda como `simple_data.parquet`).

In [1]:
import pandas as pd
import os

In [2]:
# Location of the raw parquet file; `?raw=true` makes GitHub serve the file
# itself instead of the HTML page that wraps it.
url = "https://github.com/FranciscoHugoLezik/Movies_data/blob/main/movies_dataset.parquet?raw=true"

# Read the raw dataset straight from the URL with the fastparquet engine.
movies_dataset = pd.read_parquet(path=url, engine="fastparquet")

# Show the first record as a quick look at the available columns.
movies_dataset.iloc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [3]:
# Columns that are discarded up front and never used later in this notebook.
unnecessary = ["video", "imdb_id", "adult", "original_title", "poster_path", "homepage"]

# NOTE: this rebinds `movies_dataset`, so re-running the cell on its own raises
# KeyError because the columns are already gone; re-run the load cell first.
movies_dataset = movies_dataset.drop(unnecessary, axis="columns").copy()

movies_dataset.iloc[0]

belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
id                                                                     862
original_language                                                       en
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date                                                    1995-10-30
revenue                                                          373554033
runtime                                                               81.0
spoken_languages                  [{'iso_639_1': 'en', 'name': 'English'}]
status                   

In [4]:
# Columns whose cells hold dicts / lists of dicts (see the sample row printed
# by the previous cell). They are left out of the "simple" dataset.
nested_tables = ["belongs_to_collection", "genres", "production_companies", "production_countries", "spoken_languages"]

# `movies_dataset` is left untouched; `simple_data` is a new frame holding
# only the scalar-valued columns.
simple_data = movies_dataset.drop(nested_tables, axis="columns")

simple_data.iloc[0]

budget                                                        30000000
id                                                                 862
original_language                                                   en
overview             Led by Woody, Andy's toys live happily in his ...
popularity                                                   21.946943
release_date                                                1995-10-30
revenue                                                      373554033
runtime                                                           81.0
status                                                        Released
tagline                                                           None
title                                                        Toy Story
vote_average                                                       7.7
vote_count                                                        5415
Name: 0, dtype: object

In [5]:
# Inspect dtypes and non-null counts of the simple columns before any cleaning.
simple_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   budget             45466 non-null  object
 1   id                 45466 non-null  object
 2   original_language  45455 non-null  object
 3   overview           44512 non-null  object
 4   popularity         45461 non-null  object
 5   release_date       45379 non-null  object
 6   revenue            45460 non-null  object
 7   runtime            45203 non-null  object
 8   status             45379 non-null  object
 9   tagline            20412 non-null  object
 10  title              45460 non-null  object
 11  vote_average       45460 non-null  object
 12  vote_count         45460 non-null  object
dtypes: object(13)
memory usage: 4.5+ MB


In [6]:
# Discard rows that have no title.
# `subset` is passed as a list (same form as the release_date cell below):
# a bare string is only accepted by pandas >= 1.5, the list works everywhere.
# The result is rebound instead of using `inplace=True`, which brings no
# performance benefit and prevents method chaining.
simple_data = simple_data.dropna(subset=['title'])

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45460 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   budget             45460 non-null  object
 1   id                 45460 non-null  object
 2   original_language  45449 non-null  object
 3   overview           44506 non-null  object
 4   popularity         45460 non-null  object
 5   release_date       45376 non-null  object
 6   revenue            45460 non-null  object
 7   runtime            45203 non-null  object
 8   status             45379 non-null  object
 9   tagline            20412 non-null  object
 10  title              45460 non-null  object
 11  vote_average       45460 non-null  object
 12  vote_count         45460 non-null  object
dtypes: object(13)
memory usage: 4.9+ MB


In [7]:
# Every column arrives as `object`; cast the numeric and date columns to
# concrete dtypes in a single pass. Columns not listed here keep their dtype.
# `astype` raises if any value cannot be converted.
simple_data = simple_data.astype({
    'budget': 'int64',
    'popularity': 'float64',
    'release_date': 'datetime64[ns]',
    'revenue': 'int64',
    'runtime': 'float64',
    'vote_average': 'float64',
    'vote_count': 'int64',
})

In [8]:
# Confirm the dtypes produced by the casts in the previous cell.
simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45460 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45460 non-null  int64         
 1   id                 45460 non-null  object        
 2   original_language  45449 non-null  object        
 3   overview           44506 non-null  object        
 4   popularity         45460 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45460 non-null  int64         
 7   runtime            45203 non-null  float64       
 8   status             45379 non-null  object        
 9   tagline            20412 non-null  object        
 10  title              45460 non-null  object        
 11  vote_average       45460 non-null  float64       
 12  vote_count         45460 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory 

In [9]:
# Keep only the rows that have a release date.
simple_data = simple_data.dropna(subset=['release_date'])

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory 

In [10]:
# Replace missing revenue / budget values with 0.
# NOTE(review): both columns were already cast to int64 in an earlier cell, and
# int64 cannot hold NaN, so at this point the fill changes nothing. It would
# only matter if it ran before that cast.
simple_data = simple_data.fillna({'revenue': 0, 'budget': 0})

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory 

In [11]:
# Year of release, extracted from the datetime column and stored as text.
simple_data['release_year'] = simple_data['release_date'].dt.year.astype(str)

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
dtypes: dat

In [12]:
def return_movie(row):
    """Return revenue divided by budget for one row.

    `row` is anything indexable by the keys 'revenue' and 'budget'.
    The ratio is only computed when both values are strictly positive;
    in every other case the result is 0.
    """
    if row['revenue'] > 0 and row['budget'] > 0:
        return row['revenue'] / row['budget']
    return 0

# Return on investment = revenue / budget, rounded to 2 decimals, and 0 when
# either amount is not strictly positive.
# Computed column-wise instead of with a row-by-row `apply(axis=1)`: same
# values, but without running a Python callback once per row.
has_both_amounts = simple_data['revenue'].gt(0) & simple_data['budget'].gt(0)

simple_data['return'] = (
    simple_data['revenue']
    .div(simple_data['budget'])      # zero budgets give inf/NaN here, no exception...
    .where(has_both_amounts, 0)      # ...and those rows are replaced by 0
    .round(2)
)

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
 14  return

In [13]:
# The dropna steps left gaps in the index; renumber rows 0..n-1 and discard
# the old index instead of keeping it as a column.
simple_data = simple_data.reset_index(drop=True)

simple_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45376 entries, 0 to 45375
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  int64         
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  float64       
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  int64         
 7   runtime            45130 non-null  float64       
 8   status             45296 non-null  object        
 9   tagline            20398 non-null  object        
 10  title              45376 non-null  object        
 11  vote_average       45376 non-null  float64       
 12  vote_count         45376 non-null  int64         
 13  release_year       45376 non-null  object        
 14  return

In [14]:
# Give the identifier column an explicit name.
simple_data = simple_data.rename(columns={'id': 'movie_id'})

simple_data.iloc[0]

budget                                                        30000000
movie_id                                                           862
original_language                                                   en
overview             Led by Woody, Andy's toys live happily in his ...
popularity                                                   21.946943
release_date                                       1995-10-30 00:00:00
revenue                                                      373554033
runtime                                                           81.0
status                                                        Released
tagline                                                           None
title                                                        Toy Story
vote_average                                                       7.7
vote_count                                                        5415
release_year                                                      1995
return

In [15]:
# Working directory of the running kernel; the output path built in the
# following cells is derived from it, so it depends on where the notebook runs.
current_dir = os.getcwd()

current_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\notebooks\\ETL\\movies_dataset'

In [16]:
# Walk three levels up from the working directory, one step at a time.
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
base_dir = os.path.dirname(grandparent_dir)

base_dir

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas'

In [17]:
# Destination of the cleaned dataset, expressed relative to `base_dir`.
relative_parts = ('data', 'ETL_data', 'movies_dataset', 'simple_data.parquet')
file_path = os.path.join(base_dir, *relative_parts)

file_path

'c:\\Users\\franc\\Desktop\\Proyecto_Peliculas\\data\\ETL_data\\movies_dataset\\simple_data.parquet'

In [18]:
# Make sure the destination folder exists before writing; `to_parquet` does not
# create missing directories and would fail on a fresh checkout.
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Persist the cleaned dataset.
simple_data.to_parquet(file_path)