In [2]:
import pandas as pd
import numpy as np
import re
import lxml
import seaborn as sns

from bs4 import BeautifulSoup
from requests import get
%matplotlib inline

In [3]:
def createURLs(numMovies): 
    """Create and return two lists of URLS given the number of entries desired
        Keyword Arguments:
        numMovies- the number of movies you want to scrape
    """
    
    
    
    url_range_imdb = range(1,numMovies,50) #create imdb range
    url_range_boxoffice = range(1,numMovies,200) #create box office mojo range
    url_imdb = []
    url_boxoffice = []
    for i in url_range_imdb: #create list of imdb urls
        url_imdb.append("https://www.imdb.com/search/title/?title_type=feature&start=" + str(i) + "&ref_=adv_prv")
    for i in url_range_boxoffice: #create list of box offic emojo urls
        if i == 1:
            url_boxoffice.append("https://www.boxofficemojo.com/chart/top_lifetime_gross/")
        else:
            url_boxoffice.append("https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=" + str(i-1))
    #creates a list of the urls to scrape from. Each page shows 50 results and starts from 1
    return (url_imdb, url_boxoffice) # return URLS




In [4]:
#initialize lists
title = []
date = []
genre = []
rating = []
score = []
director = []
stars = []
#loop through each url
url_list = createURLs(10000)
for url in url_list[0]:
    #grab the data from the url, read it into Beauitufl soup, find each movie from the page
    page = get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    content = soup.find(id="main") #grab the main content
    frames = content.find_all("div", class_="lister-item mode-advanced") #grab each movie intsance
    for frame in frames:    
        #grab data from each movie instance
        firstLine = frame.find("h3", class_="lister-item-header")
        title.append(firstLine.find("a").text)
        date.append(re.sub(r"[()]","", firstLine.find_all("span")[-1].text))
        try:
            genre.append(frame.find("span", class_="genre").text.rstrip().replace("\n","").split(","))
        except:
            genre.append('None')
        try:
            rating.append(frame.find("strong").text)
        except:
            rating.append(0.0) #set rating to 0.0 if its not there
        try:
            score.append(frame.find("span", class_="metascore favorable").text.rstrip())
        except:
            try:
                score.append(frame.find("span", class_="metascore unfavorable").text.rstrip())
            except:
                try:
                    score.append(frame.find("span", class_="metascore mixed").text.rstrip())
                except:
                    score.append(0.0) #set score to 0.0 if it doesnt exist
        cast = (frame.find("p", class_=""))
        try:    #split apart the director and the casts
            casts = cast.text.replace("\n","").split('|')
            casts = [x.strip() for x in casts]
            casts = [casts[i].replace(j, "") for i,j in enumerate(["Director:", "Stars:"])]
            director.append(casts[0])
            stars.append([x.strip() for x in casts[1].split(",")])
        except:    #set director to 'No Director' if it isnt there
            casts = cast.text.replace("\n","").strip()
            director.append("No Director")
            stars.append([x.strip() for x in casts.split(",")])

    




In [13]:
#Creating the dataframe
column_names = ['Title','Year','Genre','Rating','Score','Director','Stars'] 
df = pd.DataFrame(list(zip(title,date,genre,rating,score,director,stars)),columns = column_names)





In [14]:
df.loc[df['Title']=='Parasite']

Unnamed: 0,Title,Year,Genre,Rating,Score,Director,Stars
0,Parasite,2019,"[Comedy, Drama, Thriller]",8.6,96,Bong Joon Ho,"[Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo..."
4793,Parasite,1982,"[Horror, Sci-Fi]",3.8,0,Charles Band,"[Robert Glaudini, Demi Moore, Luca Bercovici, ..."


In [15]:
df.shape

(10000, 7)

In [16]:

#Scrape box office mojo
titles = []
gross = []
years = []
for url in url_list[1]:
    page = get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    rows = soup.findAll('tr')
    i=0
    for row in rows:
        if i == 0:
            i=1
        else:
            titles.append(row.find(class_='a-link-normal').text)
            gross.append(row.find(class_='a-text-right mojo-field-type-money').text)
            years.append(row.find(class_='a-text-left mojo-field-type-year').text)



In [9]:
#Turn it into a dataframe
df2 = pd.DataFrame(list(zip(titles,gross,years)),columns = ['Title','Gross','Year'])
df2.head()

Unnamed: 0,Title,Gross,Year
0,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
1,Avengers: Endgame,"$858,373,000",2019
2,Avatar,"$760,507,625",2009
3,Black Panther,"$700,059,566",2018
4,Avengers: Infinity War,"$678,815,482",2018


In [17]:
merged_df= pd.merge(left = df, right = df2, on = ['Title','Year'])

In [18]:
merged_df.shape
merged_df.head()


Unnamed: 0,Title,Year,Genre,Rating,Score,Director,Stars,Gross
0,Parasite,2019,"[Comedy, Drama, Thriller]",8.6,96,Bong Joon Ho,"[Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo...","$51,479,136"
1,The Invisible Man,2020,"[Horror, Mystery, Sci-Fi]",7.6,71,Leigh Whannell,"[Elisabeth Moss, Oliver Jackson-Cohen, Harriet...","$28,205,665"
2,Knives Out,2019,"[Comedy, Crime, Drama]",8.0,82,Rian Johnson,"[Daniel Craig, Chris Evans, Ana de Armas, Jami...","$164,655,243"
3,Sonic the Hedgehog,2020,"[Action, Adventure, Comedy]",6.8,47,Jeff Fowler,"[Ben Schwartz, James Marsden, Jim Carrey, Tika...","$128,555,045"
4,Jojo Rabbit,2019,"[Comedy, Drama, War]",8.0,58,Taika Waititi,"[Roman Griffin Davis, Thomasin McKenzie, Scarl...","$33,122,789"


In [19]:
merged_df.to_csv('movieData.csv', index = False)

Unnamed: 0,Title,Year,Genre,Rating,Score,Director,Stars,Gross
0,Parasite,2019,"['Comedy', ' Drama', ' Thriller']",8.6,96.0,Bong Joon Ho,"['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...","$51,479,136"
1,The Invisible Man,2020,"['Horror', ' Mystery', ' Sci-Fi']",7.6,71.0,Leigh Whannell,"['Elisabeth Moss', 'Oliver Jackson-Cohen', 'Ha...","$28,205,665"
2,Knives Out,2019,"['Comedy', ' Crime', ' Drama']",8.0,82.0,Rian Johnson,"['Daniel Craig', 'Chris Evans', 'Ana de Armas'...","$164,655,243"
3,Sonic the Hedgehog,2020,"['Action', ' Adventure', ' Comedy']",6.8,47.0,Jeff Fowler,"['Ben Schwartz', 'James Marsden', 'Jim Carrey'...","$128,555,045"
4,Jojo Rabbit,2019,"['Comedy', ' Drama', ' War']",8.0,58.0,Taika Waititi,"['Roman Griffin Davis', 'Thomasin McKenzie', '...","$33,122,789"
