In [1]:
# Dependencies.
# Data.
import pandas as pd
import numpy as np

# Data Cleaning

<hr>

We have two CSV files: IMDB movies and TMDB movies.  
We need to merge them together and clean up some things about the data.
Both should be cleaned independently and then merged together as one file.

### IMDB Movies

<hr>

In [2]:
# Load in the CSV and take a look.
imdb_csv_path = "raw/imdb_top_1000.csv"

imdb_raw_df = pd.read_csv(imdb_csv_path)
display(imdb_raw_df.shape)
display(imdb_raw_df.info())
display(imdb_raw_df.head())

(1000, 16)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# A note of things to change:
# - All columns will need to be renamed.
# - We will be dropping the following columns:
# - Certificate, Meta_score
#
# - Data types are wrong for a few columns:
# - Released_Year, Runtime, Gross
# 
# - And we have missing values for a good chunk of rows.

In [4]:
# First, let's fix existing columns.
# Release Year.
# Most values are numerical except for one instance of 'PG' which we will fix.
w_df = imdb_raw_df.copy()

# Fix the PG value for Released_Year.
# Find it.
w_df.loc[w_df.Released_Year == 'PG', :]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,PG,U,140 min,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [5]:
# Apollo 13. That was released in 1995.
w_df.loc[w_df.Released_Year == 'PG', :] = 1995
w_df.loc[w_df.Released_Year == 'PG', :]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [6]:
# Convert the column to an integer.
w_df['Released_Year'] = w_df['Released_Year'].astype('int64')
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 125.1+ KB


In [7]:
# Moving on to fix Runtime.
# We need to remove 'min' and strip whitespace.
# Then, convert to an integer.
w_df['Runtime'] = w_df['Runtime'].str.replace('min', '').str.strip()
w_df['Runtime'] = pd.to_numeric(w_df['Runtime'])

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        999 non-null    float64
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(3), int64(2), object(11)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142.0,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175.0,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [8]:
# Now, we can fix the gross column.
# Seems to have commas implanted in the numbers.
# Get rid of those and then convert to a number.
w_df['Gross'] = pd.to_numeric(w_df['Gross'].str.replace(',', ''), errors = 'coerce')

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        999 non-null    float64
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          830 non-null    float64
dtypes: float64(4), int64(2), object(10)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142.0,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175.0,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [9]:
# Seems that we lost a value from the Gross column in this process. But, not that worried about 1/1000 of the dataset.
# We will now be dropping the columns that we do not need.
w_df = w_df.drop(columns = ['Certificate', 'Meta_score'])
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Runtime        999 non-null    float64
 4   Genre          1000 non-null   object 
 5   IMDB_Rating    1000 non-null   float64
 6   Overview       1000 non-null   object 
 7   Director       1000 non-null   object 
 8   Star1          1000 non-null   object 
 9   Star2          1000 non-null   object 
 10  Star3          1000 non-null   object 
 11  Star4          1000 non-null   object 
 12  No_of_Votes    1000 non-null   int64  
 13  Gross          830 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


In [10]:
# Now, to rename columns and re-order them.
w_df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Runtime', 'Genre',
       'IMDB_Rating', 'Overview', 'Director', 'Star1', 'Star2', 'Star3',
       'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [11]:
# Rename.
cols = [
    'poster_link',
    'title',
    'release_year',
    'runtime',
    'genre',
    'rating',
    'overview',
    'director',
    'cast1',
    'cast2',
    'cast3',
    'cast4',
    'vote_count',
    'gross'
]

w_df.columns = cols
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   poster_link   1000 non-null   object 
 1   title         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   genre         1000 non-null   object 
 5   rating        1000 non-null   float64
 6   overview      1000 non-null   object 
 7   director      1000 non-null   object 
 8   cast1         1000 non-null   object 
 9   cast2         1000 non-null   object 
 10  cast3         1000 non-null   object 
 11  cast4         1000 non-null   object 
 12  vote_count    1000 non-null   int64  
 13  gross         830 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


In [12]:
# Re-order.
# title, genre, release_year, runtime, rating, vote_count, director, cast, overview, gross
w_df = w_df.loc[:, ['title', 
                    'genre', 
                    'release_year', 
                    'runtime', 
                    'rating', 
                    'vote_count', 
                    'director', 
                    'cast1', 
                    'cast2', 
                    'cast3', 
                    'cast4', 
                    'overview', 
                    'gross',
                    'poster_link'
                   ]]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   genre         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   rating        1000 non-null   float64
 5   vote_count    1000 non-null   int64  
 6   director      1000 non-null   object 
 7   cast1         1000 non-null   object 
 8   cast2         1000 non-null   object 
 9   cast3         1000 non-null   object 
 10  cast4         1000 non-null   object 
 11  overview      1000 non-null   object 
 12  gross         830 non-null    float64
 13  poster_link   1000 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...


In [13]:
# One last thing... add a "source" column to state that these columns are data retrieved from the IMDB dataset.
w_df['source'] = ['IMDB' for x in range(0, len(w_df.index))]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   genre         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   rating        1000 non-null   float64
 5   vote_count    1000 non-null   int64  
 6   director      1000 non-null   object 
 7   cast1         1000 non-null   object 
 8   cast2         1000 non-null   object 
 9   cast3         1000 non-null   object 
 10  cast4         1000 non-null   object 
 11  overview      1000 non-null   object 
 12  gross         830 non-null    float64
 13  poster_link   1000 non-null   object 
 14  source        1000 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 117.3+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [14]:
# This looks about right. There are still a few missing values, but we would prefer to cover missing values once the entire dataset has been assembled.
# Save it off as the official imdb_df and move onto the TMDB one.
imdb_df = w_df.copy()

# TMDB Movies

<hr>

In [15]:
# Load in the CSV and take a look.
tmdb_csv_path = "raw/tmdb_movies.csv"

tmdb_raw_df = pd.read_csv(tmdb_csv_path)
display(tmdb_raw_df.shape)
display(tmdb_raw_df.info())
display(tmdb_raw_df.head())

(2437, 13)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
dtypes: float64(1), int64(6), object(6)
memory usage: 247.6+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"['Animation', 'Family']",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,1,14,American Beauty,['Drama'],1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,2,16,Dancer in the Dark,"['Drama', 'Crime']",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,3,18,The Fifth Element,"['Adventure', 'Fantasy', 'Action', 'Thriller',...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"['Adventure', 'Fantasy', 'Action']",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [16]:
# We did clean this up a bit before but some things still need work.
# Column removal for Unnamed: 0 which seems to be the result of improper saving as a CSV.
# We do not need the movie_id column because it doesn't exist for the IMDB movie data and will be irrelevant.
# 
# We need to clean the genre column up: removal of [, ], and '.
#
# We need to attach the poster_link endpoints to the proper base URL.
#
# We need to break the cast column into cast1-cast4.

In [17]:
# Make a working copy.
w_df = tmdb_raw_df.copy()

In [18]:
# First, we want to clean up the genre column.
# We need to remove all occurences of [ ] and '
w_df['genre'] = w_df['genre'].str.replace('[', '').str.replace(']', '').str.replace('\'', '')
w_df.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [19]:
# Let's add the base url to all the poster links.
base = 'https://image.tmdb.org/t/p/w500'

w_df['poster_link'] = [base + x for x in w_df['poster_link']]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
dtypes: float64(1), int64(6), object(6)
memory usage: 247.6+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...


In [20]:
# Split the cast column into four separate columns.
# Clean up the columns afterwards.
w_df[['cast1', 'cast2', 'cast3', 'cast4']] = w_df['cast'].str.replace('[', '').str.replace(']', '').str.split(',', expand = True)
w_df[['cast1', 'cast2', 'cast3', 'cast4']] = w_df[['cast1', 'cast2', 'cast3', 'cast4']].apply(lambda x: x.str.strip('\'" '))

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
 13  cast1         2437 non-null   object 
 14  cast2         2424 non-null   object 
 15  cast3         2416 non-null   object 
 16  cast4         2415 non-null   object 
dtypes: float64(1), int64(6), object(10)
memory usage: 323.8+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link,cast1,cast2,cast3,cast4
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,Björk,Catherine Deneuve,David Morse,Peter Stormare
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush


In [21]:
# Time to drop columns and re-organize the DataFrame.
w_df = w_df.drop(columns = ['Unnamed: 0', 'movie_id', 'cast'])

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   overview      2436 non-null   object 
 8   gross         2437 non-null   int64  
 9   poster_link   2437 non-null   object 
 10  cast1         2437 non-null   object 
 11  cast2         2424 non-null   object 
 12  cast3         2416 non-null   object 
 13  cast4         2415 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 266.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,overview,gross,poster_link,cast1,cast2,cast3,cast4
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,Björk,Catherine Deneuve,David Morse,Peter Stormare
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush


In [22]:
# Re-order.
# title, genre, release_year, runtime, rating, vote_count, director, cast, overview, gross
w_df = w_df.loc[:, ['title', 
                    'genre', 
                    'release_year', 
                    'runtime', 
                    'rating', 
                    'vote_count', 
                    'director', 
                    'cast1', 
                    'cast2', 
                    'cast3', 
                    'cast4', 
                    'overview', 
                    'gross',
                    'poster_link'
                   ]]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   cast1         2437 non-null   object 
 8   cast2         2424 non-null   object 
 9   cast3         2416 non-null   object 
 10  cast4         2415 non-null   object 
 11  overview      2436 non-null   object 
 12  gross         2437 non-null   int64  
 13  poster_link   2437 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 266.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,Björk,Catherine Deneuve,David Morse,Peter Stormare,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...


In [23]:
# One last thing... add a "source" column to state that these columns are data retrieved from the TMDB dataset.
w_df['source'] = ['TMDB' for x in range(0, len(w_df.index))]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   cast1         2437 non-null   object 
 8   cast2         2424 non-null   object 
 9   cast3         2416 non-null   object 
 10  cast4         2415 non-null   object 
 11  overview      2436 non-null   object 
 12  gross         2437 non-null   int64  
 13  poster_link   2437 non-null   object 
 14  source        2437 non-null   object 
dtypes: float64(1), int64(4), object(10)
memory usage: 285.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,TMDB
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,TMDB
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,Björk,Catherine Deneuve,David Morse,Peter Stormare,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,TMDB
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,TMDB
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,TMDB


In [24]:
# Looks good, save it off as the tmdb_df.
tmdb_df = w_df.copy()

In [25]:
# This is looking good. Time to add the two DataFrames together.
concat_df = pd.concat([imdb_df, tmdb_df])

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3437 entries, 0 to 2436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3437 non-null   object 
 1   genre         3437 non-null   object 
 2   release_year  3437 non-null   int64  
 3   runtime       3436 non-null   float64
 4   rating        3437 non-null   float64
 5   vote_count    3437 non-null   int64  
 6   director      3437 non-null   object 
 7   cast1         3437 non-null   object 
 8   cast2         3424 non-null   object 
 9   cast3         3416 non-null   object 
 10  cast4         3415 non-null   object 
 11  overview      3436 non-null   object 
 12  gross         3267 non-null   float64
 13  poster_link   3437 non-null   object 
 14  source        3437 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 429.6+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [26]:
# Let's fill in some of the data that is still missing.
# We have a lot missing from gross.
# And 1 row is missing the runtime.
# First, we need to find the best way to impute the values.
concat_df[['runtime', 'gross']].describe()

Unnamed: 0,runtime,gross
count,3436.0,3267.0
mean,110.268335,86006610.0
std,28.996042,176018900.0
min,0.0,-400000000.0
25%,94.0,0.0
50%,107.0,18500000.0
75%,125.0,100519200.0
max,400.0,1860250000.0


In [27]:
# Looks like median is best for both.
concat_df.loc[pd.isnull(concat_df['runtime']), 'runtime'] = concat_df['runtime'].median()
concat_df.loc[pd.isnull(concat_df['gross']), 'gross'] = concat_df['gross'].median()

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3437 entries, 0 to 2436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3437 non-null   object 
 1   genre         3437 non-null   object 
 2   release_year  3437 non-null   int64  
 3   runtime       3437 non-null   float64
 4   rating        3437 non-null   float64
 5   vote_count    3437 non-null   int64  
 6   director      3437 non-null   object 
 7   cast1         3437 non-null   object 
 8   cast2         3424 non-null   object 
 9   cast3         3416 non-null   object 
 10  cast4         3415 non-null   object 
 11  overview      3436 non-null   object 
 12  gross         3437 non-null   float64
 13  poster_link   3437 non-null   object 
 14  source        3437 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 429.6+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [30]:
# There are still quite a few missing cast members but that is to be expected.
# Some movies do not have the vast amount of starring cast as other movies.

# There is one missing value for overview. Let's look at it.
concat_df.loc[pd.isnull(concat_df['overview']), :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
2372,Namibia: A Change of Perspective,Documentary,2024,0.0,0.0,0,Anton Stamenkov,Anton Stamenkov,Radoslav Stamenkov,Stefan Stefanov,elephant,,0.0,https://image.tmdb.org/t/p/w500/8FbnBKUQpF0Fmm...,TMDB


In [32]:
# It has no runtime, rating, votes, or gross. Let's drop that.
concat_df.drop(2372, inplace = True)

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3436 entries, 0 to 2436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3436 non-null   object 
 1   genre         3436 non-null   object 
 2   release_year  3436 non-null   int64  
 3   runtime       3436 non-null   float64
 4   rating        3436 non-null   float64
 5   vote_count    3436 non-null   int64  
 6   director      3436 non-null   object 
 7   cast1         3436 non-null   object 
 8   cast2         3423 non-null   object 
 9   cast3         3415 non-null   object 
 10  cast4         3414 non-null   object 
 11  overview      3436 non-null   object 
 12  gross         3436 non-null   float64
 13  poster_link   3436 non-null   object 
 14  source        3436 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 429.5+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [None]:
# All the datatypes are correct and we have no (important) missing values. Save it off.
filepath = "clean/movie_dataset.csv"

