In [1]:
# Dependencies.
# Data.
import pandas as pd
import numpy as np

# Data Cleaning

<hr>

We have two CSV files: IMDB movies and TMDB movies.  
We need to merge them together and clean up some things about the data.
Both should be cleaned independently and then merged together as one file.

### IMDB Movies

<hr>

In [2]:
# Load in the CSV and take a look.
imdb_csv_path = "raw/imdb_top_1000.csv"

imdb_raw_df = pd.read_csv(imdb_csv_path)
display(imdb_raw_df.shape)
display(imdb_raw_df.info())
display(imdb_raw_df.head())

(1000, 16)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# A note of things to change:
# - All columns will need to be renamed.
# - We will be dropping the following columns:
# - Certificate, Meta_score
#
# - Data types are wrong for a few columns:
# - Released_Year, Runtime, Gross
# 
# - And we have missing values for a good chunk of rows.

In [4]:
# First, let's fix existing columns.
# Release Year.
# Most values are numerical except for one instance of 'PG' which we will fix.
w_df = imdb_raw_df.copy()

# Fix the PG value for Released_Year.
# Find it.
w_df.loc[w_df.Released_Year == 'PG', :]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,PG,U,140 min,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [5]:
# Apollo 13. That was released in 1995.
w_df.loc[w_df.Released_Year == 'PG', :] = 1995
w_df.loc[w_df.Released_Year == 'PG', :]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [6]:
# Convert the column to an integer.
w_df['Released_Year'] = w_df['Released_Year'].astype('int64')
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 125.1+ KB


In [7]:
# Moving on to fix Runtime.
# We need to remove 'min' and strip whitespace.
# Then, convert to an integer.
w_df['Runtime'] = w_df['Runtime'].str.replace('min', '').str.strip()
w_df['Runtime'] = pd.to_numeric(w_df['Runtime'])

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        999 non-null    float64
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(3), int64(2), object(11)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142.0,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175.0,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [8]:
# Now, we can fix the gross column.
# Seems to have commas implanted in the numbers.
# Get rid of those and then convert to a number.
w_df['Gross'] = pd.to_numeric(w_df['Gross'].str.replace(',', ''), errors = 'coerce')

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Certificate    899 non-null    object 
 4   Runtime        999 non-null    float64
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          830 non-null    float64
dtypes: float64(4), int64(2), object(10)
memory usage: 125.1+ KB


None

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142.0,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175.0,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [9]:
# Seems that we lost a value from the Gross column in this process. But, not that worried about 1/1000 of the dataset.
# We will now be dropping the columns that we do not need.
w_df = w_df.drop(columns = ['Certificate', 'Meta_score'])
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   int64  
 3   Runtime        999 non-null    float64
 4   Genre          1000 non-null   object 
 5   IMDB_Rating    1000 non-null   float64
 6   Overview       1000 non-null   object 
 7   Director       1000 non-null   object 
 8   Star1          1000 non-null   object 
 9   Star2          1000 non-null   object 
 10  Star3          1000 non-null   object 
 11  Star4          1000 non-null   object 
 12  No_of_Votes    1000 non-null   int64  
 13  Gross          830 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


In [10]:
# Now, to rename columns and re-order them.
w_df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Runtime', 'Genre',
       'IMDB_Rating', 'Overview', 'Director', 'Star1', 'Star2', 'Star3',
       'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [11]:
# Rename.
cols = [
    'poster_link',
    'title',
    'release_year',
    'runtime',
    'genre',
    'rating',
    'overview',
    'director',
    'cast1',
    'cast2',
    'cast3',
    'cast4',
    'vote_count',
    'gross'
]

w_df.columns = cols
w_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   poster_link   1000 non-null   object 
 1   title         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   genre         1000 non-null   object 
 5   rating        1000 non-null   float64
 6   overview      1000 non-null   object 
 7   director      1000 non-null   object 
 8   cast1         1000 non-null   object 
 9   cast2         1000 non-null   object 
 10  cast3         1000 non-null   object 
 11  cast4         1000 non-null   object 
 12  vote_count    1000 non-null   int64  
 13  gross         830 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


In [12]:
# Re-order.
# title, genre, release_year, runtime, rating, vote_count, director, cast, overview, gross
w_df = w_df.loc[:, ['title', 
                    'genre', 
                    'release_year', 
                    'runtime', 
                    'rating', 
                    'vote_count', 
                    'director', 
                    'cast1', 
                    'cast2', 
                    'cast3', 
                    'cast4', 
                    'overview', 
                    'gross',
                    'poster_link'
                   ]]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   genre         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   rating        1000 non-null   float64
 5   vote_count    1000 non-null   int64  
 6   director      1000 non-null   object 
 7   cast1         1000 non-null   object 
 8   cast2         1000 non-null   object 
 9   cast3         1000 non-null   object 
 10  cast4         1000 non-null   object 
 11  overview      1000 non-null   object 
 12  gross         830 non-null    float64
 13  poster_link   1000 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 109.5+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...


In [13]:
# One last thing... add a "source" column to state that these columns are data retrieved from the IMDB dataset.
w_df['source'] = ['IMDB' for x in range(0, len(w_df.index))]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   genre         1000 non-null   object 
 2   release_year  1000 non-null   int64  
 3   runtime       999 non-null    float64
 4   rating        1000 non-null   float64
 5   vote_count    1000 non-null   int64  
 6   director      1000 non-null   object 
 7   cast1         1000 non-null   object 
 8   cast2         1000 non-null   object 
 9   cast3         1000 non-null   object 
 10  cast4         1000 non-null   object 
 11  overview      1000 non-null   object 
 12  gross         830 non-null    float64
 13  poster_link   1000 non-null   object 
 14  source        1000 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 117.3+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [14]:
# This looks about right. There are still a few missing values, but we would prefer to cover missing values once the entire dataset has been assembled.
# Save it off as the official imdb_df and move onto the TMDB one.
imdb_df = w_df.copy()

# TMDB Movies

<hr>

In [15]:
# Load in the CSV and take a look.
tmdb_csv_path = "raw/tmdb_movies.csv"

tmdb_raw_df = pd.read_csv(tmdb_csv_path)
display(tmdb_raw_df.shape)
display(tmdb_raw_df.info())
display(tmdb_raw_df.head())

(2437, 13)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
dtypes: float64(1), int64(6), object(6)
memory usage: 247.6+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"['Animation', 'Family']",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,1,14,American Beauty,['Drama'],1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,2,16,Dancer in the Dark,"['Drama', 'Crime']",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,3,18,The Fifth Element,"['Adventure', 'Fantasy', 'Action', 'Thriller',...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"['Adventure', 'Fantasy', 'Action']",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [16]:
# We did clean this up a bit before but some things still need work.
# Column removal for Unnamed: 0 which seems to be the result of improper saving as a CSV.
# We do not need the movie_id column because it doesn't exist for the IMDB movie data and will be irrelevant.
# 
# We need to clean the genre column up: removal of [, ], and '.
#
# We need to attach the poster_link endpoints to the proper base URL.
#
# We need to break the cast column into cast1-cast4.

In [17]:
# Make a working copy.
w_df = tmdb_raw_df.copy()

In [18]:
# First, we want to clean up the genre column.
# We need to remove all occurences of [ ] and '
w_df['genre'] = w_df['genre'].str.replace('[', '').str.replace(']', '').str.replace('\'', '')
w_df.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [19]:
# Let's add the base url to all the poster links.
base = 'https://image.tmdb.org/t/p/w500'

w_df['poster_link'] = [base + x for x in w_df['poster_link']]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
dtypes: float64(1), int64(6), object(6)
memory usage: 247.6+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...


In [20]:
# Split the cast column into four separate columns.
# Clean up the columns afterwards.
w_df[['cast1', 'cast2', 'cast3', 'cast4']] = w_df['cast'].str.replace('[', '').str.replace(']', '').str.split(',', expand = True)
w_df[['cast1', 'cast2', 'cast3', 'cast4']] = w_df[['cast1', 'cast2', 'cast3', 'cast4']].apply(lambda x: x.str.strip('\'" '))

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2437 non-null   int64  
 1   movie_id      2437 non-null   int64  
 2   title         2437 non-null   object 
 3   genre         2437 non-null   object 
 4   release_year  2437 non-null   int64  
 5   runtime       2437 non-null   int64  
 6   rating        2437 non-null   float64
 7   vote_count    2437 non-null   int64  
 8   director      2437 non-null   object 
 9   cast          2437 non-null   object 
 10  overview      2436 non-null   object 
 11  gross         2437 non-null   int64  
 12  poster_link   2437 non-null   object 
 13  cast1         2437 non-null   object 
 14  cast2         2424 non-null   object 
 15  cast3         2416 non-null   object 
 16  cast4         2415 non-null   object 
dtypes: float64(1), int64(6), object(10)
memory usage: 323.8+ KB


None

Unnamed: 0.1,Unnamed: 0,movie_id,title,genre,release_year,runtime,rating,vote_count,director,cast,overview,gross,poster_link,cast1,cast2,cast3,cast4
0,0,12,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"['Albert Brooks', 'Ellen DeGeneres', 'Alexande...","Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe
1,1,14,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...","Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley
2,2,16,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"['Björk', 'Catherine Deneuve', 'David Morse', ...","Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,Björk,Catherine Deneuve,David Morse,Peter Stormare
3,3,18,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"['Bruce Willis', 'Milla Jovovich', 'Gary Oldma...","In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm
4,4,22,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush


In [21]:
# Time to drop columns and re-organize the DataFrame.
w_df = w_df.drop(columns = ['Unnamed: 0', 'movie_id', 'cast'])

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   overview      2436 non-null   object 
 8   gross         2437 non-null   int64  
 9   poster_link   2437 non-null   object 
 10  cast1         2437 non-null   object 
 11  cast2         2424 non-null   object 
 12  cast3         2416 non-null   object 
 13  cast4         2415 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 266.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,overview,gross,poster_link,cast1,cast2,cast3,cast4
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,Björk,Catherine Deneuve,David Morse,Peter Stormare
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush


In [22]:
# Re-order.
# title, genre, release_year, runtime, rating, vote_count, director, cast, overview, gross
w_df = w_df.loc[:, ['title', 
                    'genre', 
                    'release_year', 
                    'runtime', 
                    'rating', 
                    'vote_count', 
                    'director', 
                    'cast1', 
                    'cast2', 
                    'cast3', 
                    'cast4', 
                    'overview', 
                    'gross',
                    'poster_link'
                   ]]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   cast1         2437 non-null   object 
 8   cast2         2424 non-null   object 
 9   cast3         2416 non-null   object 
 10  cast4         2415 non-null   object 
 11  overview      2436 non-null   object 
 12  gross         2437 non-null   int64  
 13  poster_link   2437 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 266.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,Björk,Catherine Deneuve,David Morse,Peter Stormare,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...


In [23]:
# One last thing... add a "source" column to state that these columns are data retrieved from the TMDB dataset.
w_df['source'] = ['TMDB' for x in range(0, len(w_df.index))]

display(w_df.info())
display(w_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437 entries, 0 to 2436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2437 non-null   object 
 1   genre         2437 non-null   object 
 2   release_year  2437 non-null   int64  
 3   runtime       2437 non-null   int64  
 4   rating        2437 non-null   float64
 5   vote_count    2437 non-null   int64  
 6   director      2437 non-null   object 
 7   cast1         2437 non-null   object 
 8   cast2         2424 non-null   object 
 9   cast3         2416 non-null   object 
 10  cast4         2415 non-null   object 
 11  overview      2436 non-null   object 
 12  gross         2437 non-null   int64  
 13  poster_link   2437 non-null   object 
 14  source        2437 non-null   object 
dtypes: float64(1), int64(4), object(10)
memory usage: 285.7+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,Finding Nemo,"Animation, Family",2003,100,7.82,19059,Andrew Stanton,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,"Nemo, an adventurous young clownfish, is unexp...",846335536,https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...,TMDB
1,American Beauty,Drama,1999,122,8.019,11903,Sam Mendes,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,"Lester Burnham, a depressed suburban father in...",341296601,https://image.tmdb.org/t/p/w500/wby9315QzVKdW9...,TMDB
2,Dancer in the Dark,"Drama, Crime",2000,140,7.9,1760,Lars von Trier,Björk,Catherine Deneuve,David Morse,Peter Stormare,"Selma, a Czech immigrant on the verge of blind...",27561153,https://image.tmdb.org/t/p/w500/8Wdd3fQfbbQeoS...,TMDB
3,The Fifth Element,"Adventure, Fantasy, Action, Thriller, Science ...",1997,126,7.552,10591,Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,"In 2257, a taxi driver is unintentionally give...",173920180,https://image.tmdb.org/t/p/w500/fPtlCO1yQtnoLH...,TMDB
4,Pirates of the Caribbean: The Curse of the Bla...,"Adventure, Fantasy, Action",2003,143,7.805,20390,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Geoffrey Rush,After Port Royal is attacked and pillaged by a...,515011224,https://image.tmdb.org/t/p/w500/poHwCZeWzJCShH...,TMDB


In [24]:
# Looks good, save it off as the tmdb_df.
tmdb_df = w_df.copy()

In [25]:
# This is looking good. Time to add the two DataFrames together.
concat_df = pd.concat([imdb_df, tmdb_df]).reset_index(drop = True)

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3437 entries, 0 to 3436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3437 non-null   object 
 1   genre         3437 non-null   object 
 2   release_year  3437 non-null   int64  
 3   runtime       3436 non-null   float64
 4   rating        3437 non-null   float64
 5   vote_count    3437 non-null   int64  
 6   director      3437 non-null   object 
 7   cast1         3437 non-null   object 
 8   cast2         3424 non-null   object 
 9   cast3         3416 non-null   object 
 10  cast4         3415 non-null   object 
 11  overview      3436 non-null   object 
 12  gross         3267 non-null   float64
 13  poster_link   3437 non-null   object 
 14  source        3437 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 402.9+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [26]:
# There are still quite a few missing cast members but that is to be expected.
# Some movies do not have the vast amount of starring cast as other movies.

# There is one missing value for overview. Let's look at it.
concat_df.loc[pd.isnull(concat_df['overview']), :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
3372,Namibia: A Change of Perspective,Documentary,2024,0.0,0.0,0,Anton Stamenkov,Anton Stamenkov,Radoslav Stamenkov,Stefan Stefanov,elephant,,0.0,https://image.tmdb.org/t/p/w500/8FbnBKUQpF0Fmm...,TMDB


In [27]:
# It has no runtime, rating, votes, or gross. Let's drop that.
concat_df.drop(2372, inplace = True)

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3436 entries, 0 to 3436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3436 non-null   object 
 1   genre         3436 non-null   object 
 2   release_year  3436 non-null   int64  
 3   runtime       3435 non-null   float64
 4   rating        3436 non-null   float64
 5   vote_count    3436 non-null   int64  
 6   director      3436 non-null   object 
 7   cast1         3436 non-null   object 
 8   cast2         3423 non-null   object 
 9   cast3         3415 non-null   object 
 10  cast4         3414 non-null   object 
 11  overview      3435 non-null   object 
 12  gross         3266 non-null   float64
 13  poster_link   3436 non-null   object 
 14  source        3436 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 429.5+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [28]:
# Weird missing values.
# There's movies coming out in 2025 that shouldn't be here.
# There's an entire column filled with '1995'.
# There's movies without genres.
concat_df.loc[concat_df['genre'] == '', :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
2637,Sensual Massage: The Touch Of Love,,1980,27.0,5.5,4,Bruce Seth Green,,,,,Massage can be a thoroughly sensual experience...,0.0,https://image.tmdb.org/t/p/w500/qP8mfO3iHao1Pz...,TMDB
3336,Triple Oh!,,2023,45.0,0.0,0,Poppy Stockwell,Emmanuelle Mattana,Geraldine Hickey,Ethan Carruthers,Heather Maltman,"In this dark comedy-drama, a mismatched duo of...",0.0,https://image.tmdb.org/t/p/w500/qnE7f6Bea3DW21...,TMDB
3423,A Brighter Summer Day for the Lady Avengers,,2024,12.0,0.0,0,Birdy Wei-Ting Hung,,,,,"Taiwan, 1980s. A hot summer day, watermelon ju...",0.0,https://image.tmdb.org/t/p/w500/qxuO2lJsk3DkvC...,TMDB
3425,The Visit,,2024,14.0,0.0,0,Millie Y. Xu,,,,,A young woman descends into her own terrifying...,0.0,https://image.tmdb.org/t/p/w500/oLqbaeymdnRwJf...,TMDB


In [29]:
# Drop the missing genre movies... it's not that large of an amount and there's other missing data.
concat_df.drop(concat_df.loc[concat_df['genre'] == '', :].index, inplace = True)

concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3432 entries, 0 to 3436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3432 non-null   object 
 1   genre         3432 non-null   object 
 2   release_year  3432 non-null   int64  
 3   runtime       3431 non-null   float64
 4   rating        3432 non-null   float64
 5   vote_count    3432 non-null   int64  
 6   director      3432 non-null   object 
 7   cast1         3432 non-null   object 
 8   cast2         3422 non-null   object 
 9   cast3         3414 non-null   object 
 10  cast4         3413 non-null   object 
 11  overview      3431 non-null   object 
 12  gross         3262 non-null   float64
 13  poster_link   3432 non-null   object 
 14  source        3432 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 429.0+ KB


In [30]:
# Weird row with 1995 as most values.
concat_df.loc[concat_df['title'] == 1995, :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
966,1995,1995,1995,,1995.0,1995,1995,1995,1995,1995,1995,1995,,1995,IMDB


In [31]:
# Get rid of row consisting of 1995.
concat_df.drop(concat_df.loc[concat_df['genre'] == 1995, :].index, inplace = True)

In [32]:
# Get rid of movies coming out past the year 2024 that have been included.
concat_df.drop(concat_df.loc[concat_df['release_year'] > 2024, :].index, inplace = True)

concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3422 entries, 0 to 3436
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3422 non-null   object 
 1   genre         3422 non-null   object 
 2   release_year  3422 non-null   int64  
 3   runtime       3422 non-null   float64
 4   rating        3422 non-null   float64
 5   vote_count    3422 non-null   int64  
 6   director      3422 non-null   object 
 7   cast1         3422 non-null   object 
 8   cast2         3412 non-null   object 
 9   cast3         3404 non-null   object 
 10  cast4         3403 non-null   object 
 11  overview      3421 non-null   object 
 12  gross         3253 non-null   float64
 13  poster_link   3422 non-null   object 
 14  source        3422 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 427.8+ KB


In [33]:
# Movies with a rating of 0.
concat_df.loc[concat_df['rating'] == 0, :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
2741,Kraven the Hunter,"Action, Adventure, Thriller",2024,0.0,0.0,0,J.C. Chandor,Aaron Taylor-Johnson,Russell Crowe,Ariana DeBose,Fred Hechinger,"Sergei Kravinoff is a big game hunter, who tak...",0.0,https://image.tmdb.org/t/p/w500/i47IUSsN126K11...,TMDB
2923,Mufasa: The Lion King,"Adventure, Family, Drama",2024,0.0,0.0,0,Barry Jenkins,Aaron Pierre,Kelvin Harrison Jr.,Tiffany Boone,Kagiso Lediga,Rafiki relays the legend of Mufasa to lion cub...,0.0,https://image.tmdb.org/t/p/w500/9bXHaLlsFYpJUu...,TMDB
2990,Saving Bikini Bottom: The Sandy Cheeks Movie,"Animation, Comedy, Adventure, Family",2024,87.0,0.0,0,Liza Johnson,Carolyn Lawrence,Tom Kenny,Clancy Brown,Bill Fagerbakke,"When Bikini Bottom is scooped from the ocean, ...",-100000000.0,https://image.tmdb.org/t/p/w500/30YnfZdMNIV7no...,TMDB
2994,Woman of the Hour,"Drama, Crime, Mystery",2024,95.0,0.0,0,Anna Kendrick,Anna Kendrick,Daniel Zovatto,Autumn Best,Andy Thompson,The stranger-than-fiction story of an aspiring...,0.0,https://image.tmdb.org/t/p/w500/td4fbQkQ0bb9GY...,TMDB
2999,The Lord of the Rings: The War of the Rohirrim,"Animation, Fantasy, Action",2024,130.0,0.0,0,Kenji Kamiyama,Brian Cox,Gaia Wise,Luke Pasqualino,Miranda Otto,183 years before the events chronicled in the ...,0.0,https://image.tmdb.org/t/p/w500/qXKFlGX9n66zd4...,TMDB
3013,Red One,"Action, Adventure, Fantasy",2024,124.0,0.0,0,Jake Kasdan,Dwayne Johnson,Chris Evans,J.K. Simmons,Lucy Liu,After Santa Claus (code name: Red One) is kidn...,-250000000.0,https://image.tmdb.org/t/p/w500/cdqLnri3NEGcmf...,TMDB
3068,Venom: The Last Dance,"Action, Science Fiction, Adventure",2024,0.0,0.0,0,Kelly Marcel,Tom Hardy,Chiwetel Ejiofor,Juno Temple,Rhys Ifans,Eddie and Venom are on the run. Hunted by both...,0.0,https://image.tmdb.org/t/p/w500/aosm8NMQ3UyoBV...,TMDB
3099,Turn Me On,"Science Fiction, Romance, Comedy",2024,99.0,0.0,0,Michael Tyburski,Bel Powley,Nick Robinson,Justin H. Min,D'Arcy Carden,The film is set in a world where the inconveni...,0.0,https://image.tmdb.org/t/p/w500/nKGVPdGznZWeoR...,TMDB
3104,Sonic the Hedgehog 3,"Science Fiction, Family, Comedy, Action, Adven...",2024,0.0,0.0,0,Jeff Fowler,Ben Schwartz,Jim Carrey,Idris Elba,Colleen O'Shaughnessey,"Sonic, Knuckles, and Tails reunite against a p...",-22550000.0,https://image.tmdb.org/t/p/w500/mubt4bnVfpJ5lB...,TMDB
3183,House of Spoils,"Horror, Thriller",2024,101.0,0.0,0,Danielle Krudy,Ariana DeBose,Barbie Ferreira,Arian Moayed,Marton Csokas,An ambitious chef opens a restaurant on a remo...,0.0,https://image.tmdb.org/t/p/w500/udfMAdiyKhgAS4...,TMDB


In [34]:
# Drop those 0 rating movies. Most haven't even been released yet.
concat_df.drop(concat_df.loc[concat_df['rating'] == 0, :].index, inplace = True)

concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3387 entries, 0 to 3435
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3387 non-null   object 
 1   genre         3387 non-null   object 
 2   release_year  3387 non-null   int64  
 3   runtime       3387 non-null   float64
 4   rating        3387 non-null   float64
 5   vote_count    3387 non-null   int64  
 6   director      3387 non-null   object 
 7   cast1         3387 non-null   object 
 8   cast2         3381 non-null   object 
 9   cast3         3375 non-null   object 
 10  cast4         3374 non-null   object 
 11  overview      3387 non-null   object 
 12  gross         3218 non-null   float64
 13  poster_link   3387 non-null   object 
 14  source        3387 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 423.4+ KB


In [35]:
# Movies with 0 actors...
concat_df.loc[concat_df['cast1'] == '', :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
2934,Dory's Reef Cam,"Family, Animation, Comedy, Adventure",2020,182.0,6.1,132,Michal Makarewicz,,,,,Dive into the waters below and watch the aquat...,0.0,https://image.tmdb.org/t/p/w500/mMWLGu9pFymqip...,TMDB
3403,The Cult,Horror,2024,90.0,8.0,7,Alexandre Alonso,,,,,When a young woman relocates to a small town i...,0.0,https://image.tmdb.org/t/p/w500/mmDEyYd66Et6Gs...,TMDB


In [36]:
# Drop those.
concat_df.drop(concat_df.loc[concat_df['cast1'] == '', :].index, inplace = True)

concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3385 entries, 0 to 3435
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3385 non-null   object 
 1   genre         3385 non-null   object 
 2   release_year  3385 non-null   int64  
 3   runtime       3385 non-null   float64
 4   rating        3385 non-null   float64
 5   vote_count    3385 non-null   int64  
 6   director      3385 non-null   object 
 7   cast1         3385 non-null   object 
 8   cast2         3381 non-null   object 
 9   cast3         3375 non-null   object 
 10  cast4         3374 non-null   object 
 11  overview      3385 non-null   object 
 12  gross         3216 non-null   float64
 13  poster_link   3385 non-null   object 
 14  source        3385 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 423.1+ KB


In [37]:
concat_df.loc[concat_df['gross'] == 0, :]

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
1219,The Return of the King,"Animation, Fantasy, Adventure",1980,98.0,6.400,192,Jules Bass,Orson Bean,Roddy McDowall,John Huston,Theodore Gottlieb,Two Hobbits struggle to destroy the Ring in Mo...,0.0,https://image.tmdb.org/t/p/w500/sC5j2cUgGe1IQ3...,TMDB
1296,Inherit the Wind,Drama,1960,128.0,7.700,401,Stanley Kramer,Spencer Tracy,Fredric March,Gene Kelly,Dick York,Schoolteacher Bertram Cates is arrested for te...,0.0,https://image.tmdb.org/t/p/w500/7oaHcF0gCOt2lK...,TMDB
1389,Casper's Haunted Christmas,"Animation, Family, Fantasy",2000,84.0,5.376,181,Owen Hurley,Brendon Ryan Barrett,Kathleen Barr,Ian James Corlett,Graeme Kingston,"Kibosh, supreme ruler of all ghosts, decrees t...",0.0,https://image.tmdb.org/t/p/w500/3BFR30kh0O3NKR...,TMDB
1408,Van Helsing: The London Assignment,"Action, Animation, Fantasy, Horror",2004,30.0,6.173,146,Sharon Bridgeman,Hugh Jackman,Robbie Coltrane,David Wenham,Grey DeLisle,"It tells of the events before the film, in whi...",0.0,https://image.tmdb.org/t/p/w500/2MFdzeCL2YTqbb...,TMDB
1427,American Pie Presents: The Naked Mile,Comedy,2006,97.0,5.510,2008,Joe Nussbaum,John White,Jessy Schram,Steve Talley,Christopher McDonald,When Erik Stifler realizes that he's the only ...,0.0,https://image.tmdb.org/t/p/w500/sreeL5kKj47oof...,TMDB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3408,Lolo and the Kid,"Drama, Family",2024,97.0,6.833,15,Benedict Mique,Euwenn Mikaell,Joel Torre,Juan Karlos Labajo,Iza Calzado,A hustler and the child he took in routinely c...,0.0,https://image.tmdb.org/t/p/w500/27y5QGbOligWKc...,TMDB
3410,Throuple,Drama,2024,71.0,1.000,1,Aya Topacio,Audrey Avila,Sahara Bernales,Arah Alonzo,Aerol Carmelo,"Ysay and Hannah, who were in a throuple relati...",0.0,https://image.tmdb.org/t/p/w500/pZZdSbC1lY73lj...,TMDB
3411,Child Star,Documentary,2024,97.0,8.400,5,Demi Lovato,Demi Lovato,Drew Barrymore,Christina Ricci,Kenan Thompson,Explore the highs and lows of growing up in th...,0.0,https://image.tmdb.org/t/p/w500/qonoT25IHa7v2j...,TMDB
3412,Untold: Hope Solo vs. U.S. Soccer,Documentary,2024,73.0,5.400,10,Nina Meredith,Hope Solo,Michelle Akers,Pia Sundhage,Rich Nichols,World Cup champion Hope Solo opens up about he...,0.0,https://image.tmdb.org/t/p/w500/2vdEuZwQaEXFJ3...,TMDB


In [38]:
# Let's fill in some of the data that is still missing.
# We have a lot missing from gross.
# And 1 row is missing the runtime.
# First, we need to find the best way to impute the values.
concat_df[['runtime', 'gross']].describe()

Unnamed: 0,runtime,gross
count,3385.0,3216.0
mean,111.049335,87687650.0
std,27.756406,176670600.0
min,0.0,-199546000.0
25%,94.0,0.0
50%,107.0,20115890.0
75%,125.0,102089600.0
max,400.0,1860250000.0


In [39]:
# Looks like median is best for both.
concat_df.loc[pd.isnull(concat_df['runtime']), 'runtime'] = concat_df['runtime'].median()
concat_df.loc[concat_df['runtime'] == 0, 'runtime'] = concat_df['runtime'].median()
concat_df.loc[pd.isnull(concat_df['gross']), 'gross'] = concat_df['gross'].median()

display(concat_df.info())
display(concat_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3385 entries, 0 to 3435
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3385 non-null   object 
 1   genre         3385 non-null   object 
 2   release_year  3385 non-null   int64  
 3   runtime       3385 non-null   float64
 4   rating        3385 non-null   float64
 5   vote_count    3385 non-null   int64  
 6   director      3385 non-null   object 
 7   cast1         3385 non-null   object 
 8   cast2         3381 non-null   object 
 9   cast3         3375 non-null   object 
 10  cast4         3374 non-null   object 
 11  overview      3385 non-null   object 
 12  gross         3385 non-null   float64
 13  poster_link   3385 non-null   object 
 14  source        3385 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 423.1+ KB


None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,cast4,overview,gross,poster_link,source
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Two imprisoned men bond over a number of years...,28341469.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,IMDB
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,An organized crime dynasty's aging patriarch t...,134966411.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,IMDB
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,When the menace known as the Joker wreaks havo...,534858444.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,IMDB
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,The early life and career of Vito Corleone in ...,57300000.0,https://m.media-amazon.com/images/M/MV5BMWMwMG...,IMDB
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,A jury holdout attempts to prevent a miscarria...,4360000.0,https://m.media-amazon.com/images/M/MV5BMWU4N2...,IMDB


In [40]:
# Going to one-hot encode the genres.
dummy_df = concat_df.copy()

In [41]:
# Get list of unique genre pairings.
genre = dummy_df['genre'].str.split(', ').tolist()
genre

[['Drama'],
 ['Crime', 'Drama'],
 ['Action', 'Crime', 'Drama'],
 ['Crime', 'Drama'],
 ['Crime', 'Drama'],
 ['Action', 'Adventure', 'Drama'],
 ['Crime', 'Drama'],
 ['Biography', 'Drama', 'History'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Drama'],
 ['Action', 'Adventure', 'Drama'],
 ['Drama', 'Romance'],
 ['Western'],
 ['Action', 'Adventure', 'Drama'],
 ['Action', 'Sci-Fi'],
 ['Biography', 'Crime', 'Drama'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Drama'],
 ['Biography', 'Drama', 'History'],
 ['Comedy', 'Drama', 'Thriller'],
 ['Drama'],
 ['Adventure', 'Drama', 'Sci-Fi'],
 ['Crime', 'Drama'],
 ['Animation', 'Adventure', 'Family'],
 ['Drama', 'War'],
 ['Crime', 'Drama', 'Fantasy'],
 ['Comedy', 'Drama', 'Romance'],
 ['Crime', 'Drama', 'Mystery'],
 ['Crime', 'Drama', 'Thriller'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Drama', 'Mystery'],
 ['Action', 'Adventure', 'Drama'],
 ['Drama', 'Family', 'Fantasy'],
 ['Crime', 'Drama', 'Thriller'],
 ['Drama', 'Music'],
 ['Biography', 'Comedy'

In [42]:
# Flatten the list, make each part contain just one genre.
flat_genre = [item for piece in genre for item in piece]
flat_genre

['Drama',
 'Crime',
 'Drama',
 'Action',
 'Crime',
 'Drama',
 'Crime',
 'Drama',
 'Crime',
 'Drama',
 'Action',
 'Adventure',
 'Drama',
 'Crime',
 'Drama',
 'Biography',
 'Drama',
 'History',
 'Action',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Action',
 'Adventure',
 'Drama',
 'Drama',
 'Romance',
 'Western',
 'Action',
 'Adventure',
 'Drama',
 'Action',
 'Sci-Fi',
 'Biography',
 'Crime',
 'Drama',
 'Action',
 'Adventure',
 'Fantasy',
 'Drama',
 'Biography',
 'Drama',
 'History',
 'Comedy',
 'Drama',
 'Thriller',
 'Drama',
 'Adventure',
 'Drama',
 'Sci-Fi',
 'Crime',
 'Drama',
 'Animation',
 'Adventure',
 'Family',
 'Drama',
 'War',
 'Crime',
 'Drama',
 'Fantasy',
 'Comedy',
 'Drama',
 'Romance',
 'Crime',
 'Drama',
 'Mystery',
 'Crime',
 'Drama',
 'Thriller',
 'Action',
 'Adventure',
 'Fantasy',
 'Action',
 'Drama',
 'Mystery',
 'Action',
 'Adventure',
 'Drama',
 'Drama',
 'Family',
 'Fantasy',
 'Crime',
 'Drama',
 'Thriller',
 'Drama',
 'Music',
 'Biography',
 'Comedy',
 'Drama',
 'Drama

In [43]:
# Converting to a set will combine all similar values to make a unique set.
set_genre = set(flat_genre)
set_genre

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Science Fiction',
 'Sport',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [44]:
# But we need it as a list. Convert it back now.
unique_genre = list(set_genre)
unique_genre

['Sci-Fi',
 'Film-Noir',
 'War',
 'History',
 'Musical',
 'Fantasy',
 'Drama',
 'Horror',
 'TV Movie',
 'Music',
 'Mystery',
 'Crime',
 'Comedy',
 'Western',
 'Science Fiction',
 'Sport',
 'Action',
 'Documentary',
 'Animation',
 'Biography',
 'Family',
 'Romance',
 'Thriller',
 'Adventure']

In [45]:
# Sort the genres.
unique_genre = sorted(unique_genre, key = str.lower)
unique_genre

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Science Fiction',
 'Sport',
 'Thriller',
 'TV Movie',
 'War',
 'Western']

In [46]:
dummy_df = dummy_df.reindex(
    dummy_df.columns.tolist() + unique_genre, axis = 'columns', fill_value = 0)

display(dummy_df.info())
display(dummy_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3385 entries, 0 to 3435
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3385 non-null   object 
 1   genre            3385 non-null   object 
 2   release_year     3385 non-null   int64  
 3   runtime          3385 non-null   float64
 4   rating           3385 non-null   float64
 5   vote_count       3385 non-null   int64  
 6   director         3385 non-null   object 
 7   cast1            3385 non-null   object 
 8   cast2            3381 non-null   object 
 9   cast3            3375 non-null   object 
 10  cast4            3374 non-null   object 
 11  overview         3385 non-null   object 
 12  gross            3385 non-null   float64
 13  poster_link      3385 non-null   object 
 14  source           3385 non-null   object 
 15  Action           3385 non-null   int64  
 16  Adventure        3385 non-null   int64  
 17  Animation        33

None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,...,Musical,Mystery,Romance,Sci-Fi,Science Fiction,Sport,Thriller,TV Movie,War,Western
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,...,0,0,0,0,0,0,0,0,0,0
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,...,0,0,0,0,0,0,0,0,0,0
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,...,0,0,0,0,0,0,0,0,0,0
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# For each value inside of the genre column, update the appropriate dummy.
for index, row in dummy_df.iterrows():
    for v in row.genre.split(', '):
        if pd.isnull(v) == False:
            dummy_df.loc[index, v] = 1

# Check.
display(dummy_df.info())
display(dummy_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 3385 entries, 0 to 3435
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3385 non-null   object 
 1   genre            3385 non-null   object 
 2   release_year     3385 non-null   int64  
 3   runtime          3385 non-null   float64
 4   rating           3385 non-null   float64
 5   vote_count       3385 non-null   int64  
 6   director         3385 non-null   object 
 7   cast1            3385 non-null   object 
 8   cast2            3381 non-null   object 
 9   cast3            3375 non-null   object 
 10  cast4            3374 non-null   object 
 11  overview         3385 non-null   object 
 12  gross            3385 non-null   float64
 13  poster_link      3385 non-null   object 
 14  source           3385 non-null   object 
 15  Action           3385 non-null   int64  
 16  Adventure        3385 non-null   int64  
 17  Animation        33

None

Unnamed: 0,title,genre,release_year,runtime,rating,vote_count,director,cast1,cast2,cast3,...,Musical,Mystery,Romance,Sci-Fi,Science Fiction,Sport,Thriller,TV Movie,War,Western
0,The Shawshank Redemption,Drama,1994,142.0,9.3,2343110,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,...,0,0,0,0,0,0,0,0,0,0
1,The Godfather,"Crime, Drama",1972,175.0,9.2,1620367,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight,"Action, Crime, Drama",2008,152.0,9.0,2303232,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,...,0,0,0,0,0,0,0,0,0,0
3,The Godfather: Part II,"Crime, Drama",1974,202.0,9.0,1129952,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,...,0,0,0,0,0,0,0,0,0,0
4,12 Angry Men,"Crime, Drama",1957,96.0,9.0,689845,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Check a few rows for accuracy.
dummy_df.loc[0, :]

title                                       The Shawshank Redemption
genre                                                          Drama
release_year                                                    1994
runtime                                                        142.0
rating                                                           9.3
vote_count                                                   2343110
director                                              Frank Darabont
cast1                                                    Tim Robbins
cast2                                                 Morgan Freeman
cast3                                                     Bob Gunton
cast4                                                 William Sadler
overview           Two imprisoned men bond over a number of years...
gross                                                     28341469.0
poster_link        https://m.media-amazon.com/images/M/MV5BMDFkYT...
source                            

In [49]:
dummy_df.loc[1, :]

title                                                  The Godfather
genre                                                   Crime, Drama
release_year                                                    1972
runtime                                                        175.0
rating                                                           9.2
vote_count                                                   1620367
director                                        Francis Ford Coppola
cast1                                                  Marlon Brando
cast2                                                      Al Pacino
cast3                                                     James Caan
cast4                                                   Diane Keaton
overview           An organized crime dynasty's aging patriarch t...
gross                                                    134966411.0
poster_link        https://m.media-amazon.com/images/M/MV5BM2MyNj...
source                            

In [50]:
dummy_df.loc[1000, :]

title                                                   Finding Nemo
genre                                              Animation, Family
release_year                                                    2003
runtime                                                        100.0
rating                                                          7.82
vote_count                                                     19059
director                                              Andrew Stanton
cast1                                                  Albert Brooks
cast2                                                Ellen DeGeneres
cast3                                                Alexander Gould
cast4                                                   Willem Dafoe
overview           Nemo, an adventurous young clownfish, is unexp...
gross                                                    846335536.0
poster_link        https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1md...
source                            

In [52]:
# All the datatypes are correct and we have no (important) missing values. Save it off.
filepath = "clean/movie_dataset.csv"

dummy_df.to_csv(filepath, index = False)