#### Import the modules that we will use in our ETL

In [1]:
import numpy as np
import pandas as pd
import datetime
import ast
import locale

#### Load the dataset

In [3]:
movies_dataset = pd.read_csv("../datasets/movies_dataset.csv")
movies_dataset.drop_duplicates()
movies_dataset.head(3)

  movies_dataset = pd.read_csv("../datasets/movies_dataset.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


#### Create functions that will be used

In [4]:
def info (dataset):       # Dataset info
    
    info = dataset.info()
    return info

def fetch_name(obj):      # Flatten nested columns 1
    
    if isinstance(obj, str) and '{' in obj:
        L = []
        for i in ast.literal_eval(obj):
            L.append(i['name'])
        return L

def fetch_name_2(obj):    # Flatten nested columns 2
    if isinstance(obj, str) and '{' in obj:
        dic = ast.literal_eval(obj)
        return dic['name']

#### View dataset information

In [275]:
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

#### Drop unnecessary columns

In [5]:
movies_dataset.drop(columns = ["video", "imdb_id", "adult", "original_title", "vote_count", "poster_path", "homepage"], inplace = True)
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4494 non-null   object 
 1   budget                 45466 non-null  object 
 2   genres                 45466 non-null  object 
 3   id                     45466 non-null  object 
 4   original_language      45455 non-null  object 
 5   overview               44512 non-null  object 
 6   popularity             45461 non-null  object 
 7   production_companies   45463 non-null  object 
 8   production_countries   45463 non-null  object 
 9   release_date           45379 non-null  object 
 10  revenue                45460 non-null  float64
 11  runtime                45203 non-null  float64
 12  spoken_languages       45460 non-null  object 
 13  status                 45379 non-null  object 
 14  tagline                20412 non-null  object 
 15  ti

#### Replace revenue and budget nulls with 0

In [277]:
movies_dataset["budget"].fillna(0, inplace = True)
movies_dataset["revenue"].fillna(0, inplace = True)
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4494 non-null   object 
 1   budget                 45466 non-null  object 
 2   genres                 45466 non-null  object 
 3   id                     45466 non-null  object 
 4   original_language      45455 non-null  object 
 5   overview               44512 non-null  object 
 6   popularity             45461 non-null  object 
 7   production_companies   45463 non-null  object 
 8   production_countries   45463 non-null  object 
 9   release_date           45379 non-null  object 
 10  revenue                45466 non-null  float64
 11  runtime                45203 non-null  float64
 12  spoken_languages       45460 non-null  object 
 13  status                 45379 non-null  object 
 14  tagline                20412 non-null  object 
 15  ti

#### Date format AAAA-mm-dd

In [6]:
movies_dataset["release_date"].sort_values()  # Notice useless values

19730             1
29503            12
34940    1874-12-09
34937    1878-06-14
41602    1883-11-19
            ...    
45148           NaN
45203           NaN
45338           NaN
45410           NaN
45461           NaN
Name: release_date, Length: 45466, dtype: object

#### Impute those values

In [7]:
movies_dataset["release_date"].replace("1", np.nan, inplace = True)
movies_dataset["release_date"].replace("12", np.nan, inplace = True)
movies_dataset["release_date"].sort_values()

34940    1874-12-09
34937    1878-06-14
41602    1883-11-19
34933    1887-08-18
34934    1888-01-01
            ...    
45148           NaN
45203           NaN
45338           NaN
45410           NaN
45461           NaN
Name: release_date, Length: 45466, dtype: object

#### Change string dates to datetime

In [8]:
fechas_datetime = pd.to_datetime(movies_dataset["release_date"], format='%Y-%m-%d', errors = 'coerce') # errors = 'coerce' -----> If 'coerce', then invalid parsing will be set as NaT.
movies_dataset["release_date"] = fechas_datetime.dropna()
print((movies_dataset["release_date"][0]))
print(type((movies_dataset["release_date"][0])))
movies_dataset.head(3)

1995-10-30 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5


#### Drop release date nulls

In [9]:
info(movies_dataset)   # Take a look

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4494 non-null   object        
 1   budget                 45466 non-null  object        
 2   genres                 45466 non-null  object        
 3   id                     45466 non-null  object        
 4   original_language      45455 non-null  object        
 5   overview               44512 non-null  object        
 6   popularity             45461 non-null  object        
 7   production_companies   45463 non-null  object        
 8   production_countries   45463 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  revenue                45460 non-null  float64       
 11  runtime                45203 non-null  float64       
 12  spoken_languages       45460 non-null  object        
 13  s

In [10]:
movies_dataset.dropna(subset = ["release_date"], inplace=True)
info(movies_dataset)   # Notice dataset change

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4488 non-null   object        
 1   budget                 45376 non-null  object        
 2   genres                 45376 non-null  object        
 3   id                     45376 non-null  object        
 4   original_language      45365 non-null  object        
 5   overview               44435 non-null  object        
 6   popularity             45376 non-null  object        
 7   production_companies   45376 non-null  object        
 8   production_countries   45376 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  revenue                45376 non-null  float64       
 11  runtime                45130 non-null  float64       
 12  spoken_languages       45376 non-null  object        
 13  s

#### Create release_year column

In [11]:
movies_dataset["release_year"] = movies_dataset["release_date"].dt.year
movies_dataset["release_year"]

0        1995
1        1995
2        1995
3        1995
4        1995
         ... 
45460    1991
45462    2011
45463    2003
45464    1917
45465    2017
Name: release_year, Length: 45376, dtype: int64

#### Some esthetic

In [12]:
movies_dataset = movies_dataset.reindex(columns = ["belongs_to_collection", "budget", "genres", "id", "original_language", "overview", "popularity", "production_companies", "production_countries", "release_date", "release_year", "revenue", "runtime", "spoken_languages", "status", "tagline", "title", "vote_average"])
movies_dataset.head(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,release_year,revenue,runtime,spoken_languages,status,tagline,title,vote_average
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5


#### Create return column

In [13]:
movies_dataset["return"] = movies_dataset["revenue"] / movies_dataset["budget"]
movies_dataset["return"]   # Can´t divide float and string


TypeError: unsupported operand type(s) for /: 'float' and 'str'

#### Look at budget and revenue types

In [14]:
info(movies_dataset)  # budget = object(string), revenue = float64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4488 non-null   object        
 1   budget                 45376 non-null  object        
 2   genres                 45376 non-null  object        
 3   id                     45376 non-null  object        
 4   original_language      45365 non-null  object        
 5   overview               44435 non-null  object        
 6   popularity             45376 non-null  object        
 7   production_companies   45376 non-null  object        
 8   production_countries   45376 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  release_year           45376 non-null  int64         
 11  revenue                45376 non-null  float64       
 12  runtime                45130 non-null  float64       
 13  s

#### Convert budget to float

In [15]:
movies_dataset["budget"] = movies_dataset["budget"].astype("float64")
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4488 non-null   object        
 1   budget                 45376 non-null  float64       
 2   genres                 45376 non-null  object        
 3   id                     45376 non-null  object        
 4   original_language      45365 non-null  object        
 5   overview               44435 non-null  object        
 6   popularity             45376 non-null  object        
 7   production_companies   45376 non-null  object        
 8   production_countries   45376 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  release_year           45376 non-null  int64         
 11  revenue                45376 non-null  float64       
 12  runtime                45130 non-null  float64       
 13  s

#### Try the division again

In [16]:
movies_dataset["return"] = movies_dataset["revenue"] / movies_dataset["budget"]
movies_dataset["return"].fillna(0, inplace = True)
movies_dataset["return"].replace(np.inf, 0, inplace = True)   # Inf = Can´t divide by 0
movies_dataset["return"].sort_values()

22713    0.000000e+00
29984    0.000000e+00
29985    0.000000e+00
29986    0.000000e+00
29987    0.000000e+00
             ...     
2308     1.018619e+06
22278    2.755584e+06
14372    4.197477e+06
3342     8.500000e+06
4000     1.239638e+07
Name: return, Length: 45376, dtype: float64

#### Take a look at the current dataset

In [17]:
movies_dataset.head(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,release_year,revenue,runtime,spoken_languages,status,tagline,title,vote_average,return
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,12.451801
1,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,4.043035
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,0.0


In [18]:
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4488 non-null   object        
 1   budget                 45376 non-null  float64       
 2   genres                 45376 non-null  object        
 3   id                     45376 non-null  object        
 4   original_language      45365 non-null  object        
 5   overview               44435 non-null  object        
 6   popularity             45376 non-null  object        
 7   production_companies   45376 non-null  object        
 8   production_countries   45376 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  release_year           45376 non-null  int64         
 11  revenue                45376 non-null  float64       
 12  runtime                45130 non-null  float64       
 13  s

#### Some esthetic

In [19]:
movies_dataset = movies_dataset.reindex(columns = ["belongs_to_collection", "genres", "id", "original_language", "overview", "popularity", "production_companies", "production_countries", "release_date", "release_year", "revenue", "budget", "return", "runtime", "spoken_languages", "status", "tagline", "title", "vote_average"])
movies_dataset.head(3)

Unnamed: 0,belongs_to_collection,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,release_year,revenue,budget,return,runtime,spoken_languages,status,tagline,title,vote_average
0,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,1995,373554033.0,30000000.0,12.451801,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7
1,,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,1995,262797249.0,65000000.0,4.043035,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,1995,0.0,0.0,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5


#### Faltten nested columns using functions from the beginning

In [20]:
movies_dataset["genres"] = movies_dataset["genres"].apply(fetch_name)
movies_dataset["belongs_to_collection"] = movies_dataset["belongs_to_collection"].apply(fetch_name_2) # Use a different function based on the data type
movies_dataset["production_companies"]  = movies_dataset["production_companies"].apply(fetch_name)
movies_dataset["production_countries"]  = movies_dataset["production_countries"].apply(fetch_name)
movies_dataset["spoken_languages"] = movies_dataset["spoken_languages"].apply(fetch_name)
movies_dataset.head(3)

Unnamed: 0,belongs_to_collection,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,release_year,revenue,budget,return,runtime,spoken_languages,status,tagline,title,vote_average
0,Toy Story Collection,"[Animation, Comedy, Family]",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,1995,373554033.0,30000000.0,12.451801,81.0,[English],Released,,Toy Story,7.7
1,,"[Adventure, Fantasy, Family]",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,1995,262797249.0,65000000.0,4.043035,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9
2,Grumpy Old Men Collection,"[Romance, Comedy]",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,1995,0.0,0.0,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5


#### Create month and day columns for later functions

In [21]:
locale.setlocale(locale.LC_ALL, 'es_ES.utf8')   # Translation into Spanish

movies_dataset["release_weekday"] = movies_dataset["release_date"].dt.strftime('%A').apply(lambda x: x.encode('iso-8859-1').decode('latin-1'))
movies_dataset["release_month"] = movies_dataset["release_date"].dt.strftime('%B').apply(lambda x: x.encode('iso-8859-1').decode('latin-1'))

#### Some esthetic

In [25]:
movies_dataset = movies_dataset.reindex(columns = ["id", "belongs_to_collection", "genres", "original_language", "overview", "popularity", "production_companies", "production_countries", "release_date", "release_year", "release_month", "release_weekday", "revenue", "budget", "return", "runtime", "spoken_languages", "status", "tagline", "title", "vote_average"])

#### Take a look at the clean dataset 

In [24]:
movies_dataset.head(3)

Unnamed: 0,id,belongs_to_collection,genres,original_language,overview,popularity,production_companies,production_countries,release_date,release_year,...,release_weekday,revenue,budget,return,runtime,spoken_languages,status,tagline,title,vote_average
0,862,Toy Story Collection,"[Animation, Comedy, Family]",en,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,1995,...,lunes,373554033.0,30000000.0,12.451801,81.0,[English],Released,,Toy Story,7.7
1,8844,,"[Adventure, Fantasy, Family]",en,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,1995,...,viernes,262797249.0,65000000.0,4.043035,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9
2,15602,Grumpy Old Men Collection,"[Romance, Comedy]",en,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,1995,...,viernes,0.0,0.0,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5


In [26]:
info(movies_dataset)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45376 entries, 0 to 45465
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     45376 non-null  object        
 1   belongs_to_collection  4488 non-null   object        
 2   genres                 42992 non-null  object        
 3   original_language      45365 non-null  object        
 4   overview               44435 non-null  object        
 5   popularity             45376 non-null  object        
 6   production_companies   33580 non-null  object        
 7   production_countries   39165 non-null  object        
 8   release_date           45376 non-null  datetime64[ns]
 9   release_year           45376 non-null  int64         
 10  release_month          45376 non-null  object        
 11  release_weekday        45376 non-null  object        
 12  revenue                45376 non-null  float64       
 13  b

#### Save clean dataset

In [302]:
movies_dataset.to_csv("clean_movies_dataset.csv", index = False)