In [1]:
import pandas as pd
import json
from bs4 import BeautifulSoup
import requests
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# the cast column in the csv file is in json format, this function can turn the column in json to python object.
def load_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [4]:
credits = load_credits('tmdb_5000_credits.csv')
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
print(sorted(credits.cast.iloc[0][0].keys()))

['cast_id', 'character', 'credit_id', 'gender', 'id', 'name', 'order']


In [24]:
# only get the first actor from cast column
def get_lead_actor(row):
    try:
        lead_actor = row[0].get("name")
        return lead_actor
    except IndexError:
        return "has some issues"        

In [27]:
credits['lead_actor_name'] = credits['cast'].map(get_lead_actor)
credits.head()

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Sam Worthington
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",Johnny Depp
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",Daniel Craig
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",Christian Bale
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",Taylor Kitsch


In [160]:
#reduce the size of dataframe to 100 movies
credits_100 = credits.head(100)
credits_100

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Sam Worthington
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",Johnny Depp
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",Daniel Craig
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",Christian Bale
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",Taylor Kitsch
5,559,Spider-Man 3,"[{'cast_id': 30, 'character': 'Peter Parker / ...","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de...",Tobey Maguire
6,38757,Tangled,"[{'cast_id': 34, 'character': 'Flynn Rider (vo...","[{""credit_id"": ""52fe46db9251416c91062101"", ""de...",Zachary Levi
7,99861,Avengers: Age of Ultron,"[{'cast_id': 76, 'character': 'Tony Stark / Ir...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de...",Robert Downey Jr.
8,767,Harry Potter and the Half-Blood Prince,"[{'cast_id': 3, 'character': 'Harry Potter', '...","[{""credit_id"": ""52fe4273c3a36847f801fab1"", ""de...",Daniel Radcliffe
9,209112,Batman v Superman: Dawn of Justice,"[{'cast_id': 18, 'character': 'Bruce Wayne / B...","[{""credit_id"": ""553bf23692514135c8002886"", ""de...",Ben Affleck


In [161]:
#make sure our samples all have lead actor name
credits_200[credits_200['lead_actor_name'] == "has some issues"]

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name,URL,WikiData,Birthday,Full Name,Birthplace


In [162]:
def to_scrape(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.2171.95 Safari/537.36'}
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.content, 'html.parser')
        bday = soup.find('span', {'class': 'bday'}).text
        nickname = soup.find('div', {'class': 'nickname'}).text
        birthplace = soup.find('div', {'class': 'birthplace'}).text
        return {'Birthday': bday, 'Nickname': nickname, 'Birthplace': birthplace}
    except:
        return {'Birthday': 'None', 'Nickname': 'None', 'Birthplace': 'None'}

In [163]:
credits_100['URL'] = credits_100['lead_actor_name'].map(lambda x: 'https://en.wikipedia.org/wiki/' + x.replace(' ', '_'))
credits_100.head()

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name,URL
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Sam Worthington,https://en.wikipedia.org/wiki/Sam_Worthington
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",Johnny Depp,https://en.wikipedia.org/wiki/Johnny_Depp
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",Daniel Craig,https://en.wikipedia.org/wiki/Daniel_Craig
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",Christian Bale,https://en.wikipedia.org/wiki/Christian_Bale
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",Taylor Kitsch,https://en.wikipedia.org/wiki/Taylor_Kitsch


In [164]:
credits_100['WikiData'] = credits_100['URL'].map(to_scrape)

In [165]:
credits_100['Birthday'] = credits_100['WikiData'].map(lambda x : x.get('Birthday'))
credits_100['Full Name'] = credits_100['WikiData'].map(lambda x : x.get('Nickname'))
credits_100['Birthplace'] = credits_100['WikiData'].map(lambda x : x.get('Birthplace'))

In [166]:
credits_100

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name,URL,WikiData,Birthday,Full Name,Birthplace
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Sam Worthington,https://en.wikipedia.org/wiki/Sam_Worthington,"{'Birthday': '1976-08-02', 'Nickname': 'Samuel...",1976-08-02,Samuel Henry John Worthington,"Godalming, Surrey, England, UK"
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",Johnny Depp,https://en.wikipedia.org/wiki/Johnny_Depp,"{'Birthday': '1963-06-09', 'Nickname': 'John C...",1963-06-09,John Christopher Depp II,"Owensboro, Kentucky, U.S."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",Daniel Craig,https://en.wikipedia.org/wiki/Daniel_Craig,"{'Birthday': '1968-03-02', 'Nickname': 'Daniel...",1968-03-02,Daniel Wroughton Craig,"Chester, Cheshire, England"
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",Christian Bale,https://en.wikipedia.org/wiki/Christian_Bale,"{'Birthday': '1974-01-30', 'Nickname': 'Christ...",1974-01-30,Christian Charles Philip Bale,"Haverfordwest, Pembrokeshire, Wales"
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",Taylor Kitsch,https://en.wikipedia.org/wiki/Taylor_Kitsch,"{'Birthday': 'None', 'Nickname': 'None', 'Birt...",,,
5,559,Spider-Man 3,"[{'cast_id': 30, 'character': 'Peter Parker / ...","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de...",Tobey Maguire,https://en.wikipedia.org/wiki/Tobey_Maguire,"{'Birthday': '1975-06-27', 'Nickname': 'Tobias...",1975-06-27,Tobias Vincent Maguire,"Santa Monica, California, U.S."
6,38757,Tangled,"[{'cast_id': 34, 'character': 'Flynn Rider (vo...","[{""credit_id"": ""52fe46db9251416c91062101"", ""de...",Zachary Levi,https://en.wikipedia.org/wiki/Zachary_Levi,"{'Birthday': '1980-09-29', 'Nickname': 'Zachar...",1980-09-29,Zachary Levi Pugh,"Lake Charles, Louisiana, U.S."
7,99861,Avengers: Age of Ultron,"[{'cast_id': 76, 'character': 'Tony Stark / Ir...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de...",Robert Downey Jr.,https://en.wikipedia.org/wiki/Robert_Downey_Jr.,"{'Birthday': '1965-04-04', 'Nickname': 'Robert...",1965-04-04,Robert John Downey Jr.,"New York City, New York, U.S."
8,767,Harry Potter and the Half-Blood Prince,"[{'cast_id': 3, 'character': 'Harry Potter', '...","[{""credit_id"": ""52fe4273c3a36847f801fab1"", ""de...",Daniel Radcliffe,https://en.wikipedia.org/wiki/Daniel_Radcliffe,"{'Birthday': '1989-07-23', 'Nickname': 'Daniel...",1989-07-23,Daniel Jacob Radcliffe,"London, England"
9,209112,Batman v Superman: Dawn of Justice,"[{'cast_id': 18, 'character': 'Bruce Wayne / B...","[{""credit_id"": ""553bf23692514135c8002886"", ""de...",Ben Affleck,https://en.wikipedia.org/wiki/Ben_Affleck,"{'Birthday': '1972-08-15', 'Nickname': 'Benjam...",1972-08-15,Benjamin Géza Affleck-Boldt,"Berkeley, California, U.S."


In [167]:
#we have 18 rows that failed to get info from wiki 
credits_100[credits_200['Birthday'] == "None"].shape

(18, 10)

In [169]:
# import tldextract
# tldextract.extract('https://www.imdb.com/title/tt0295725/&sa=U&ved=0ahUKEwja2N_OyeHgAhWDpZ4KHXpPBr0QFggnMAM&usg=AOvVaw1QGHVRa_pgfziq_WY6DS08')

In [168]:
from splinter import Browser
import time

In [170]:
# go to google search for "[movie_name] reviews" then check whether imdb web exists, if exists, get the url.

def get_IMDB_urls(movie_name):
    try:
        executable_path = {'executable_path':'/users/Aihua Chen/Downloads/chromedriver'}
        browser = Browser('chrome', **executable_path)
        browser.visit('http://google.com')
        browser.fill('q', '{} reviews'.format(movie_name))
        time.sleep(2)
        browser.find_by_name('btnK').click()

        if browser.is_text_present('www.imdb.com'):
            html_text = browser.html
            soup = BeautifulSoup(html_text, 'html.parser')
            imdb_url = []
            for x in soup.find_all('div', {'class': 'r'}):
                url = x.find('a').attrs.get('href')
                if "imdb" in url:
                    imdb_url.append(url)
            return imdb_url     
        else:
            return ("No, it wasn't found")

        time.sleep(2)
        browser.quit()
    except:
        return "Run into erro"
        

In [171]:
#just an example
get_IMDB_urls('Avatar')

['https://www.imdb.com/title/tt0499549/reviews',
 'https://www.imdb.com/title/tt0499549/criticreviews']

In [172]:
credits_100['IMDB_reviews_url'] = credits_100['title'].map(get_IMDB_urls)

In [173]:
credits_100

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name,URL,WikiData,Birthday,Full Name,Birthplace,IMDB_reviews_url
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Sam Worthington,https://en.wikipedia.org/wiki/Sam_Worthington,"{'Birthday': '1976-08-02', 'Nickname': 'Samuel...",1976-08-02,Samuel Henry John Worthington,"Godalming, Surrey, England, UK","[https://www.imdb.com/title/tt0499549/reviews,..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",Johnny Depp,https://en.wikipedia.org/wiki/Johnny_Depp,"{'Birthday': '1963-06-09', 'Nickname': 'John C...",1963-06-09,John Christopher Depp II,"Owensboro, Kentucky, U.S.",[https://www.imdb.com/title/tt0449088/reviews]
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",Daniel Craig,https://en.wikipedia.org/wiki/Daniel_Craig,"{'Birthday': '1968-03-02', 'Nickname': 'Daniel...",1968-03-02,Daniel Wroughton Craig,"Chester, Cheshire, England","[https://www.imdb.com/title/tt2379713/reviews,..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",Christian Bale,https://en.wikipedia.org/wiki/Christian_Bale,"{'Birthday': '1974-01-30', 'Nickname': 'Christ...",1974-01-30,Christian Charles Philip Bale,"Haverfordwest, Pembrokeshire, Wales","No, it wasn't found"
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",Taylor Kitsch,https://en.wikipedia.org/wiki/Taylor_Kitsch,"{'Birthday': 'None', 'Nickname': 'None', 'Birt...",,,,[https://www.imdb.com/title/tt0401729/reviews]
5,559,Spider-Man 3,"[{'cast_id': 30, 'character': 'Peter Parker / ...","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de...",Tobey Maguire,https://en.wikipedia.org/wiki/Tobey_Maguire,"{'Birthday': '1975-06-27', 'Nickname': 'Tobias...",1975-06-27,Tobias Vincent Maguire,"Santa Monica, California, U.S.",[https://www.imdb.com/title/tt0413300/reviews]
6,38757,Tangled,"[{'cast_id': 34, 'character': 'Flynn Rider (vo...","[{""credit_id"": ""52fe46db9251416c91062101"", ""de...",Zachary Levi,https://en.wikipedia.org/wiki/Zachary_Levi,"{'Birthday': '1980-09-29', 'Nickname': 'Zachar...",1980-09-29,Zachary Levi Pugh,"Lake Charles, Louisiana, U.S.",[https://www.imdb.com/title/tt0398286/reviews]
7,99861,Avengers: Age of Ultron,"[{'cast_id': 76, 'character': 'Tony Stark / Ir...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de...",Robert Downey Jr.,https://en.wikipedia.org/wiki/Robert_Downey_Jr.,"{'Birthday': '1965-04-04', 'Nickname': 'Robert...",1965-04-04,Robert John Downey Jr.,"New York City, New York, U.S.","[https://www.imdb.com/title/tt2395427/reviews,..."
8,767,Harry Potter and the Half-Blood Prince,"[{'cast_id': 3, 'character': 'Harry Potter', '...","[{""credit_id"": ""52fe4273c3a36847f801fab1"", ""de...",Daniel Radcliffe,https://en.wikipedia.org/wiki/Daniel_Radcliffe,"{'Birthday': '1989-07-23', 'Nickname': 'Daniel...",1989-07-23,Daniel Jacob Radcliffe,"London, England","No, it wasn't found"
9,209112,Batman v Superman: Dawn of Justice,"[{'cast_id': 18, 'character': 'Bruce Wayne / B...","[{""credit_id"": ""553bf23692514135c8002886"", ""de...",Ben Affleck,https://en.wikipedia.org/wiki/Ben_Affleck,"{'Birthday': '1972-08-15', 'Nickname': 'Benjam...",1972-08-15,Benjamin Géza Affleck-Boldt,"Berkeley, California, U.S.","[https://www.imdb.com/title/tt2975590/, https:..."


In [178]:
# 14 rows did not have a successful search
credits_100['IMDB_reviews_url'].value_counts()

No, it wasn't found                                                                                                                                       12
Run into erro                                                                                                                                              2
[https://www.imdb.com/title/tt0816711/reviews, https://www.imdb.com/title/tt0816711/]                                                                      1
[https://www.imdb.com/title/tt1300854/reviews, https://m.imdb.com/title/tt0371746/fullcredits]                                                             1
[https://www.imdb.com/title/tt2379713/reviews, https://www.imdb.com/title/tt2379713/criticreviews]                                                         1
[https://www.imdb.com/title/tt0859163/reviews, https://www.imdb.com/title/tt0859163/]                                                                      1
[https://www.imdb.com/title/tt1440129/reviews, https://www

In [221]:
def get_other_reviews(movie_name):
    try:
        executable_path = {'executable_path':'/users/Aihua Chen/Downloads/chromedriver'}
        browser = Browser('chrome', **executable_path)
        browser.visit('http://google.com')
        browser.fill('q', '{} reviews'.format(movie_name))
        time.sleep(2)
        browser.find_by_name('btnK').click()

        if browser.is_text_present('Critic reviews'):
            html_text = browser.html
            soup = BeautifulSoup(html_text, 'html.parser')
            reviews = []
            for x in soup.find_all('div',{'class',"NIUoNb"}):
                review = x.get_text()
                review_source = x.find("a").attrs.get("href")
                reviews.append({"Review": review, "Source": review_source})
            return reviews
        else:
            return ("No, it does not have critic reviews")

        time.sleep(2)
        browser.quit()
        
    except:
        return "Run into error"

In [218]:
#just an example
get_other_reviews('Avatar')

[{'Review': 'What if the director of the highest-grossing movie ever made (Titanic) spent a rumored $500 million on a spectacular futuristic sci-fi epic and no one other than hardcore fanboys went to see it? Full review',
  'Source': 'https://www.commonsensemedia.org/movie-reviews/avatar'},
 {'Review': "Worth watching for fans, completists and anyone who missed it on the big screen first time around - but it won't win over any haters. Full review",
  'Source': 'https://www.empireonline.com/movies/avatar-special-edition/review/'},
 {'Review': "But Avatar is no Hollywood wankfest. It extends the possibilities of what movies can do. Cameron's talent may just be as big as his dreams. Full review",
  'Source': 'https://www.rollingstone.com/movies/movie-reviews/avatar-251996/'},
 {'Review': "Fifteen years in the making, James Cameron's latest creation is an eye-popping spectacle of conflict between idyllic aliens and greedy humans—saturated with environmental and spiritual themes. Full revie

In [222]:
credits_100['other_reviews'] = credits_100['title'].map(get_other_reviews)

In [233]:
credits_100[credits_100['other_reviews'] == 'Run into error']

Unnamed: 0,movie_id,title,cast,crew,lead_actor_name,URL,WikiData,Birthday,Full Name,Birthplace,IMDB_reviews_url,other_reviews
18,41154,Men in Black 3,"[{'cast_id': 4, 'character': 'Agent J', 'credi...","[{""credit_id"": ""52fe45b7c3a36847f80d68c7"", ""de...",Will Smith,https://en.wikipedia.org/wiki/Will_Smith,"{'Birthday': '1968-09-25', 'Nickname': 'Willar...",1968-09-25,Willard Carroll Smith II,"Philadelphia, Pennsylvania, U.S.",Run into erro,Run into error
33,36668,X-Men: The Last Stand,"[{'cast_id': 4, 'character': 'Logan / Wolverin...","[{""credit_id"": ""538d82720e0a26670e005e83"", ""de...",Hugh Jackman,https://en.wikipedia.org/wiki/Hugh_Jackman,"{'Birthday': '1968-10-12', 'Nickname': 'Hugh M...",1968-10-12,Hugh Michael Jackman,"Sydney, New South Wales, Australia",Run into erro,Run into error


In [250]:
credits_100.to_csv('tmdb_100_credits.csv')