In [300]:
import requests
import pandas as pd
from dotenv import load_dotenv  # for loading environment variables
import os
from functions import fetch_movie_data, extract_awards_info, compute_aggregated_score, scrape_gross

load_dotenv()
api_key = os.getenv("API_KEY")

In [302]:
df = pd.read_csv("imdb_movies.csv")

In [303]:
df.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [305]:
df["genre"].isna().sum()

85

In [306]:
# Display rows where the 'Title' column contains a null value
null_rows_title = df[df['genre'].isnull()]

# Display the rows with null values in 'Title' column
print(null_rows_title)


                                       names       date_x  score genre  \
305         Housewife Sex Slaves: Hatano Yui  01/09/2015     0.0   NaN   
1174                 Beauty Rope Cosmetology  12/02/1983    10.0   NaN   
1561                                 Reclaim  07/29/2022    20.0   NaN   
1762              Ancient Chinese Whorehouse  09/15/1994    50.0   NaN   
1776       Porno document: Toruko tokkyû bin  02/26/1982   100.0   NaN   
...                                      ...          ...    ...   ...   
9626                                Euphoria  11/29/2022     0.0   NaN   
9733                                 Fanatic  04/06/2023     0.0   NaN   
10011                        Perfumed Garden  06/03/2000    53.0   NaN   
10025  The Girl and the Wooden Horse Torture  12/03/1982    50.0   NaN   
10076      The Shoga (Glass and Gas) Company  09/07/1990    37.0   NaN   

                                                overview  \
305    We don't have an overview translated in Engl

In [307]:
df_copy = df[["names", "orig_lang", "budget_x", "revenue"]]
df_copy

Unnamed: 0,names,orig_lang,budget_x,revenue
0,Creed III,English,75000000.0,2.716167e+08
1,Avatar: The Way of Water,English,460000000.0,2.316795e+09
2,The Super Mario Bros. Movie,English,100000000.0,7.244590e+08
3,Mummies,"Spanish, Castilian",12300000.0,3.420000e+07
4,Supercell,English,77000000.0,3.409420e+08
...,...,...,...,...
10173,20th Century Women,English,7000000.0,9.353729e+06
10174,Delta Force 2: The Colombian Connection,English,9145817.8,6.698361e+06
10175,The Russia House,English,21800000.0,2.299799e+07
10176,Darkman II: The Return of Durant,English,116000000.0,4.756613e+08


In [309]:
# Convert the 'date_x' column to datetime format if it's not already in datetime
df_copy['date_x'] = pd.to_datetime(df['date_x'])

# Filter rows where the year is between 1995 and 2020 (inclusive)
df_copy = df_copy[(df_copy['date_x'].dt.year >= 1995)]

# Display the filtered dataframe
print(df_copy.value_counts())


names                               orig_lang   budget_x     revenue      date_x    
#Alive                              Korean      6300000.0    13416285.0   2020-06-24    1
Shazam! Fury of the Gods            English     125000000.0  132107025.0  2023-03-16    1
Shiloh                              English     44400000.0   371501189.4  1996-11-30    1
Sherlock: The Abominable Bride      English     120800000.0  810491789.2  2016-01-05    1
Sherlock Holmes: A Game of Shadows  English     125000000.0  535663443.0  2012-01-05    1
                                                                                       ..
Haywire                             English     23000000.0   36374700.0   2012-03-14    1
Hawa                                French      51600000.0   249812559.2  2022-12-09    1
Havoc                               English     9000000.0    4176154.6    2005-10-16    1
Have a Nice Day!                    Portuguese  103040000.0  412503777.6  2023-03-10    1
솔라 플라워         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['date_x'] = pd.to_datetime(df['date_x'])


In [310]:
# Rename the 'names' column to 'Title'
df_copy.rename(columns={'names': 'Title'}, inplace=True)
df_copy.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy.rename(columns={'names': 'Title'}, inplace=True)


Unnamed: 0,Title,orig_lang,budget_x,revenue,date_x
0,Creed III,English,75000000.0,271616700.0,2023-03-02
1,Avatar: The Way of Water,English,460000000.0,2316795000.0,2022-12-15
2,The Super Mario Bros. Movie,English,100000000.0,724459000.0,2023-04-05
3,Mummies,"Spanish, Castilian",12300000.0,34200000.0,2023-01-05
4,Supercell,English,77000000.0,340942000.0,2023-03-17


In [311]:
df_copy = df_copy.drop_duplicates(subset=['Title']) #Ben add

In [313]:
api_data_list = []
for title in df_copy['Title'].unique():  # Avoid repeated API calls for the same title
    movie_data = fetch_movie_data(title)
    if movie_data and movie_data.get('Response') == 'True':  # Only append valid data
        api_data_list.append(movie_data)

In [316]:
# Create a DataFrame from the list of API data
df_api = pd.DataFrame(api_data_list, columns=["Title", "Year", "Rated", "Runtime", "Genre", "Director", "Actors", "Plot", "Country", "Awards", "Metascore", "imdbRating", "imdbVotes", "imdbID"])
df_api

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Actors,Plot,Country,Awards,Metascore,imdbRating,imdbVotes,imdbID
0,Creed III,2023,PG-13,116 min,"Action, Drama, Sport",Michael B. Jordan,"Michael B. Jordan, Tessa Thompson, Jonathan Ma...",Adonis has been thriving in both his career an...,United States,28 nominations,73,6.7,95684,tt11145118
1,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver",Jake Sully lives with his newfound family form...,United States,Won 1 Oscar. 75 wins & 150 nominations total,67,7.5,509749,tt1630029
2,The Super Mario Bros. Movie,2023,PG,92 min,"Animation, Adventure, Comedy","Aaron Horvath, Michael Jelenic, Pierre Leduc","Chris Pratt, Anya Taylor-Joy, Charlie Day",A plumber named Mario travels through an under...,"United States, Japan",2 wins & 47 nominations,46,7.0,246752,tt6718170
3,Mummies,2023,PG,88 min,"Animation, Adventure, Comedy",Juan Jesús García Galocha,"Óscar Barberán, Ana Esther Alborg, Luis Reina",It follows three mummies as they end up in pre...,"Spain, United States",3 nominations,,5.9,9366,tt23177868
4,Supercell,2023,PG-13,100 min,"Action, Adventure, Drama",Herbert James Winterstern,"Skeet Ulrich, Anne Heche, Daniel Diemer",A teenage boy runs away to follow in the foots...,United States,3 wins & 1 nomination,,4.4,3251,tt10559102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7573,The Love Guru,2008,PG-13,87 min,"Comedy, Romance, Sport",Marco Schnabel,"Mike Myers, Jessica Alba, Romany Malco","Pitka, an American raised outside of his count...","United Kingdom, Germany, United States",5 wins & 8 nominations,24,3.8,55340,tt0811138
7574,The Seven Deadly Sins: Prisoners of the Sky,2018,TV-14,99 min,"Animation, Action, Adventure","Yasuto Nishikata, Noriyuki Abe","Yûki Kaji, Sora Amamiya, Misaki Kuno",The Seven Deadly Sins travel to the Sky Temple...,Japan,1 nomination,,7.0,5632,tt9089294
7575,20th Century Women,2016,R,119 min,"Comedy, Drama",Mike Mills,"Annette Bening, Elle Fanning, Greta Gerwig","In 1979 Santa Barbara, Dorothea is a determine...",United States,Nominated for 1 Oscar. 15 wins & 82 nomination...,83,7.3,50036,tt4385888
7576,Darkman II: The Return of Durant,1995,R,93 min,"Action, Crime, Horror, Sci-Fi, Thriller",Bradford May,"Larry Drake, Arnold Vosloo, Kim Delaney, Renée...",Darkman and Durant return and they hate each o...,"USA, Canada",,,5.1,5007,tt0109552


In [320]:
# Merge the dataframes on the 'Title' column
df_merged = pd.merge(df_api, df_copy, on='Title', how='inner')

# Display the merged dataframe
print(df_merged.head())



                         Title  Year  Rated  Runtime  \
0                    Creed III  2023  PG-13  116 min   
1     Avatar: The Way of Water  2022  PG-13  192 min   
2  The Super Mario Bros. Movie  2023     PG   92 min   
3                      Mummies  2023     PG   88 min   
4                    Supercell  2023  PG-13  100 min   

                          Genre                                      Director  \
0          Action, Drama, Sport                             Michael B. Jordan   
1    Action, Adventure, Fantasy                                 James Cameron   
2  Animation, Adventure, Comedy  Aaron Horvath, Michael Jelenic, Pierre Leduc   
3  Animation, Adventure, Comedy                     Juan Jesús García Galocha   
4      Action, Adventure, Drama                     Herbert James Winterstern   

                                              Actors  \
0  Michael B. Jordan, Tessa Thompson, Jonathan Ma...   
1     Sam Worthington, Zoe Saldana, Sigourney Weaver   
2       

In [322]:
# Drop duplicates based on the 'Title' and 'imdbID' columns
df_merged.drop_duplicates(subset=['Title', 'imdbID'], inplace=True)

In [324]:
# Reset the index of the merged dataframe
df_merged.reset_index(drop=True, inplace=True)

In [326]:
# Create a copy of the merged dataframe
df_clean = df_merged.copy()
df_clean

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Actors,Plot,Country,Awards,Metascore,imdbRating,imdbVotes,imdbID,orig_lang,budget_x,revenue,date_x
0,Creed III,2023,PG-13,116 min,"Action, Drama, Sport",Michael B. Jordan,"Michael B. Jordan, Tessa Thompson, Jonathan Ma...",Adonis has been thriving in both his career an...,United States,28 nominations,73,6.7,95684,tt11145118,English,75000000.0,2.716167e+08,2023-03-02
1,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver",Jake Sully lives with his newfound family form...,United States,Won 1 Oscar. 75 wins & 150 nominations total,67,7.5,509749,tt1630029,English,460000000.0,2.316795e+09,2022-12-15
2,The Super Mario Bros. Movie,2023,PG,92 min,"Animation, Adventure, Comedy","Aaron Horvath, Michael Jelenic, Pierre Leduc","Chris Pratt, Anya Taylor-Joy, Charlie Day",A plumber named Mario travels through an under...,"United States, Japan",2 wins & 47 nominations,46,7.0,246752,tt6718170,English,100000000.0,7.244590e+08,2023-04-05
3,Mummies,2023,PG,88 min,"Animation, Adventure, Comedy",Juan Jesús García Galocha,"Óscar Barberán, Ana Esther Alborg, Luis Reina",It follows three mummies as they end up in pre...,"Spain, United States",3 nominations,,5.9,9366,tt23177868,"Spanish, Castilian",12300000.0,3.420000e+07,2023-01-05
4,Supercell,2023,PG-13,100 min,"Action, Adventure, Drama",Herbert James Winterstern,"Skeet Ulrich, Anne Heche, Daniel Diemer",A teenage boy runs away to follow in the foots...,United States,3 wins & 1 nomination,,4.4,3251,tt10559102,English,77000000.0,3.409420e+08,2023-03-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7095,The Love Guru,2008,PG-13,87 min,"Comedy, Romance, Sport",Marco Schnabel,"Mike Myers, Jessica Alba, Romany Malco","Pitka, an American raised outside of his count...","United Kingdom, Germany, United States",5 wins & 8 nominations,24,3.8,55340,tt0811138,English,62000000.0,4.015902e+07,2008-07-10
7096,The Seven Deadly Sins: Prisoners of the Sky,2018,TV-14,99 min,"Animation, Action, Adventure","Yasuto Nishikata, Noriyuki Abe","Yûki Kaji, Sora Amamiya, Misaki Kuno",The Seven Deadly Sins travel to the Sky Temple...,Japan,1 nomination,,7.0,5632,tt9089294,Japanese,62600000.0,7.060021e+08,2018-08-18
7097,20th Century Women,2016,R,119 min,"Comedy, Drama",Mike Mills,"Annette Bening, Elle Fanning, Greta Gerwig","In 1979 Santa Barbara, Dorothea is a determine...",United States,Nominated for 1 Oscar. 15 wins & 82 nomination...,83,7.3,50036,tt4385888,English,7000000.0,9.353729e+06,2016-12-28
7098,Darkman II: The Return of Durant,1995,R,93 min,"Action, Crime, Horror, Sci-Fi, Thriller",Bradford May,"Larry Drake, Arnold Vosloo, Kim Delaney, Renée...",Darkman and Durant return and they hate each o...,"USA, Canada",,,5.1,5007,tt0109552,English,116000000.0,4.756613e+08,1995-07-11


In [327]:
# DATA CLEANING : Column names
df_clean.columns = [col.lower() for col in df_clean.columns]
df_clean.rename(columns={'imdbrating': 'imdb_rating', 'imdbvotes': 'imdb_votes', 'imdbid':'imdb_id', 'orig_lang': 'languages', 'budget_x': 'budget_M', 'revenue': 'US_CA_revenue_M', 'date_x': 'release_date'}, inplace=True)

In [329]:
# Revenue in $M
df_clean["US_CA_revenue_M"] = (df_clean["US_CA_revenue_M"]/1000000).round(2)
df_clean["budget_M"] = (df_clean["budget_M"]/1000000).round(2)


In [331]:
# Remove the 'min' suffix and extra whitespace from the 'runtime' column
df_clean["runtime"] = df_clean["runtime"].apply(lambda x: x.strip().split(" min")[0] if "min" in x else x.strip())

# Check the unique values to ensure whitespaces are removed
print(df_clean["runtime"].unique())


['116' '192' '92' '88' '100' '95' '169' '102' '24' '76' '122' '90' '161'
 '103' '107' '111' '14' '117' '99' '127' '130' '91' '125' '94' '124' '2'
 '82' '83' '101' '84' '105' '93' '126' '3' '97' '134' '45' '96' '162'
 '118' '85' '138' '148' '60' '106' '147' '132' 'N/A' '112' '139' '135'
 '149' '115' '87' '133' '141' '119' '176' '143' '98' '77' '110' '89' '114'
 '75' '30' '86' '189' '136' '137' '142' '73' '104' '129' '157' '153' '109'
 '150' '180' '187' '181' '140' '146' '113' '242' '158' '156' '81' '108'
 '194' '123' '80' '40' '128' '151' '42' '79' '131' '144' '120' '46' '70'
 '20' '121' '72' '78' '178' '44' '212' '165' '65' '55' '52' '23' '201'
 '152' '179' '163' '363' '5' '51' '57' '154' '29' '159' '34' '43' '10'
 '25' '9' '155' '164' '6' '240' '262' '13' '36' '26' '22' '71' '8' '33'
 '27' '1' '56' '54' '59' '170' '69' '31S' '7' '173' '145' '183' '74' '35'
 '166' '53' '61' '167' '38' '21' '67' '68' '19' '64' '63' '209' '48' '168'
 '50' '175' '15' '32' '41' '247' '28' '4' '172' '498' '

In [332]:
# Remove rows where the 'runtime' column ends with 'S'
df_clean = df_clean[~df_clean['runtime'].str.endswith('S')]

# Display the DataFrame after filtering
print(df_clean["runtime"].unique())


['116' '192' '92' '88' '100' '95' '169' '102' '24' '76' '122' '90' '161'
 '103' '107' '111' '14' '117' '99' '127' '130' '91' '125' '94' '124' '2'
 '82' '83' '101' '84' '105' '93' '126' '3' '97' '134' '45' '96' '162'
 '118' '85' '138' '148' '60' '106' '147' '132' 'N/A' '112' '139' '135'
 '149' '115' '87' '133' '141' '119' '176' '143' '98' '77' '110' '89' '114'
 '75' '30' '86' '189' '136' '137' '142' '73' '104' '129' '157' '153' '109'
 '150' '180' '187' '181' '140' '146' '113' '242' '158' '156' '81' '108'
 '194' '123' '80' '40' '128' '151' '42' '79' '131' '144' '120' '46' '70'
 '20' '121' '72' '78' '178' '44' '212' '165' '65' '55' '52' '23' '201'
 '152' '179' '163' '363' '5' '51' '57' '154' '29' '159' '34' '43' '10'
 '25' '9' '155' '164' '6' '240' '262' '13' '36' '26' '22' '71' '8' '33'
 '27' '1' '56' '54' '59' '170' '69' '7' '173' '145' '183' '74' '35' '166'
 '53' '61' '167' '38' '21' '67' '68' '19' '64' '63' '209' '48' '168' '50'
 '175' '15' '32' '41' '247' '28' '4' '172' '498' '171' '

In [336]:
# Apply the extraction function to the 'awards' column and create two new columns: 'wins' and 'nominations'
df_clean[['wins', 'nominations']] = df_clean['awards'].apply(lambda x: pd.Series(extract_awards_info(x)))

# Display the updated DataFrame with the extracted 'wins' and 'nominations' columns
print(df_clean[['awards', 'wins', 'nominations']])


                                                 awards  wins  nominations
0                                        28 nominations     0           28
1          Won 1 Oscar. 75 wins & 150 nominations total    76          150
2                               2 wins & 47 nominations     2           47
3                                         3 nominations     0            3
4                                 3 wins & 1 nomination     3            1
...                                                 ...   ...          ...
7095                             5 wins & 8 nominations     5            8
7096                                       1 nomination     0            1
7097  Nominated for 1 Oscar. 15 wins & 82 nomination...    15           82
7098                                                N/A     0            0
7099                                                N/A     0            0

[7095 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[['wins', 'nominations']] = df_clean['awards'].apply(lambda x: pd.Series(extract_awards_info(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[['wins', 'nominations']] = df_clean['awards'].apply(lambda x: pd.Series(extract_awards_info(x)))


In [339]:
# Define a dictionary to map specific ratings to broader categories
rating_mapping = {
    'G': 'G', 'TV-G': 'G', 'TV-Y': 'G', 'TV-Y7': 'G',
    'PG': 'PG', 'PG-13': 'PG', 'TV-PG': 'PG', 'TV-Y7-FV': 'G',
    'R': 'R', 'TV-MA': 'R', 'NC-17': 'R', 'MA-17': 'R', '18+': 'R', 'X': 'R',
    'Not Rated': 'Unrated', 'Unrated': 'Unrated', 'N/A': 'Unrated',
    'Approved': 'Unrated', 'Passed': 'Unrated', 'E': 'Unrated',
    '13+': 'PG', '16+': 'R', '12': 'PG', 'TV-14': 'PG', 'TV-Y7-FV': 'G'
}

# Apply the mapping to the 'rated' column
df_clean['rated'] = df_clean['rated'].map(rating_mapping)

# Check the unique values after mapping to ensure they are clean
print(df_clean['rated'].unique())


['PG' 'R' 'Unrated' 'G']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['rated'] = df_clean['rated'].map(rating_mapping)


In [340]:
# Changing the type of metascore to integer
df_clean.metascore = df_clean.metascore.apply(lambda x: int(x)/10 if x.isdigit() else None)
df_clean.metascore.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.metascore = df_clean.metascore.apply(lambda x: int(x)/10 if x.isdigit() else None)


array([ 7.3,  6.7,  4.6,  nan,  5.4,  7.8,  8.1,  4.4,  4.5,  6.3,  3.7,
        5.3,  6.2,  4.7,  4. ,  4.1,  7.2,  4.8,  7.5,  6. ,  3.8,  5.1,
        5. ,  1.9,  5.8,  6.1,  8.3,  6.6,  5.7,  5.9,  6.8,  7.1,  7.9,
        6.5,  7.6,  5.5,  7.7,  6.9,  3. ,  5.6,  8.2,  5.2,  4.9,  8.4,
        7. ,  7.4,  3.4,  8. ,  8.8,  6.4,  4.2,  8.6,  3.5,  8.5,  2.9,
        3.1,  8.7,  9.3,  3.2,  3.6,  9.6,  4.3,  9.9,  9.4,  1.4,  3.3,
        9.2,  9. ,  3.9,  1.7,  2.8,  2.7,  2.6,  9.5,  9.1,  2.5,  2. ,
        8.9,  2.2,  1.6,  2.3,  2.4,  2.1,  9.8,  1.2,  0.9,  1.8,  1.3,
        1.1,  1. , 10. ,  1.5,  9.7,  0.7,  0.6])

In [341]:
# Changing the type of imdb_rating to integer
df_clean.imdb_rating = df_clean.imdb_rating.apply(lambda x: float(x) if x != "N/A" else None)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.imdb_rating = df_clean.imdb_rating.apply(lambda x: float(x) if x != "N/A" else None)


In [344]:
# Apply the function to each row to create the 'aggregated_score' column
df_clean['aggregated_score'] = df_clean.apply(compute_aggregated_score, axis=1)

# Display the updated DataFrame with the new column
print(df_clean[['imdb_rating', 'metascore', 'aggregated_score']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['aggregated_score'] = df_clean.apply(compute_aggregated_score, axis=1)


      imdb_rating  metascore  aggregated_score
0             6.7        7.3               7.0
1             7.5        6.7               7.1
2             7.0        4.6               5.8
3             5.9        NaN               5.9
4             4.4        NaN               4.4
...           ...        ...               ...
7095          3.8        2.4               3.1
7096          7.0        NaN               7.0
7097          7.3        8.3               7.8
7098          5.1        NaN               5.1
7099          5.6        NaN               5.6

[7095 rows x 3 columns]


In [345]:
# Set the 'imdb_votes' column to a $M numeric base
df_clean.imdb_votes_thousands = df_clean.imdb_votes.apply(lambda x: int(x.replace(',', ''))/1000 if x != "N/A" else None)
print(df_clean.imdb_votes.unique())

['95,684' '509,749' '246,752' ... '5,632' '50,036' '115']


  df_clean.imdb_votes_thousands = df_clean.imdb_votes.apply(lambda x: int(x.replace(',', ''))/1000 if x != "N/A" else None)


In [346]:
#Another copy of df_clean2
df_clean2 = df_clean.copy()

In [347]:
gross_values = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}



for imdb_id in df_clean['imdb_id']:

    gross_value = scrape_gross(imdb_id)
    gross_values.append(gross_value)


df_clean2['cumulative_worldwide_gross'] = gross_values


Success: tt11145118 returned status 200
Success: tt1630029 returned status 200
Success: tt6718170 returned status 200
Success: tt23177868 returned status 200
Success: tt10559102 returned status 200
Success: tt14209916 returned status 200
Success: tt10366206 returned status 200
Success: tt3915174 returned status 200
Success: tt2560140 returned status 200
Success: tt21281688 returned status 200
Success: tt0070047 returned status 200
Success: tt15255288 returned status 200
Success: tt9114286 returned status 200
Success: tt13375076 returned status 200
Success: tt4471908 returned status 200
Success: tt15679400 returned status 200
Success: tt9663168 returned status 200
Success: tt7584264 returned status 200
Success: tt18092672 returned status 200
Success: tt2560092 returned status 200
Success: tt5884796 returned status 200
Success: tt0335345 returned status 200
Success: tt10151854 returned status 200
Success: tt14993352 returned status 200
Success: tt21426434 returned status 200
Success: tt6

In [354]:
# Turn cumulative worldwide gross into $M
df_clean2["cumulative_worldwide_gross"] = df_clean2["cumulative_worldwide_gross"].apply(lambda x : (float((x.split("$")[1]).replace(",", ""))/1000000) if x else x).round(2)

In [1]:
# Saving the dataframe into a csv
#df_clean2.to_csv("imdb_cleaned.csv", index=False)