# Scraping from IMDB and Box Office Mojo

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

## Gather individual movie URLs from the Top 1000 list 
Note that IMDB and Box Office Mojo share the same URL path for each individual movie (`/title/tt…/`), so the scraping was relatively straightforward on that end.

In [None]:
# Collect the relative '/title/tt.../' link of every movie on IMDB's Top 1000
# list. The list is requested 100 entries at a time, so `pos` (the `start`
# query parameter) takes the values 1, 101, ..., 901.
df = pd.DataFrame()
pos = 1
urls = []
max_failures = 5  # consecutive failed requests tolerated for one page
failures = 0
while pos < 1000:
    urlBase = 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=' + str(pos) + '&ref_=adv_nxt'
    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(urlBase, timeout=30)
    except requests.RequestException:
        response = None
    if response is None or response.status_code != 200:
        # Retry the same page, but pause first and give up eventually instead
        # of spinning in a tight, endless loop against the server.
        failures += 1
        if failures >= max_failures:
            raise RuntimeError(
                'Giving up on list page starting at ' + str(pos) + ' after '
                + str(max_failures) + ' failed requests'
            )
        time.sleep(np.random.rand() * 1.5)
        continue
    failures = 0
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    links = soup.find_all('a', href=True)
    for link in links:
        # Keep only anchors whose href is exactly a title path and that have
        # visible text (anchors with empty text are skipped).
        if re.fullmatch(r'/title/tt[0-9]+/', link['href']) and link.text.strip() != '':
            urls.append(link['href'])
    pos += 100
    time.sleep(np.random.rand() * 1.5)  # kinda arbitrary amount of time on (0,1.5) in seconds

In [None]:
# Scrape every collected movie from IMDB and Box Office Mojo and add one row
# per movie to `df`.
# NOTE: findBudgetBW, findMonth and findMPAAGenre are defined in a later cell;
# that cell has to be run before this one.
def _fetch_soup(url, max_attempts=5):
    """Download *url* and return the page parsed by BeautifulSoup.

    A response with a status other than 200 is retried after a short random
    pause, for at most *max_attempts* requests in total.

    Raises:
        RuntimeError: if no request returned status 200.
    """
    for _ in range(max_attempts):
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "lxml")
        time.sleep(np.random.rand() * 1.5)
    raise RuntimeError('no 200 response from ' + url)


for path in urls:
    # Start every movie from empty strings so the diagnostic print-out in the
    # except branch never shows values left over from the previous movie.
    title = rating = runtime = color = month = mpaa = genres = year = ''
    budget_amount = currency_symbol = ''
    imdbURL = 'https://www.imdb.com' + path
    mojoURL = 'https://www.boxofficemojo.com' + path
    try:
        soup = _fetch_soup(imdbURL)
        # The <h1> text ends in "(YYYY)": split it into title and year.
        title = soup.find('h1').text.replace(u'\xa0', ' ').strip()
        year = title[-5:-1]
        title = title[0:-7]
        print(title)  # progress output
        rating = soup.find("span", itemprop='ratingValue').text
        for time_tag in soup.find_all('time'):
            if re.match(r'^[0-9]+ min', time_tag.text.strip()):
                runtime = time_tag.text.strip().replace(" min", "")
        # findBudgetBW returns (currency marker, numeric amount, color).
        currency_symbol, budget_amount, color = findBudgetBW(soup)
        month = findMonth(soup)
        # Mojo below
        soup = _fetch_soup(mojoURL)
        mpaa, genres = findMPAAGenre(soup)
        # input into df below
        row = {'Title': title, 'Rating': rating, 'Runtime': runtime,
               'Budget': budget_amount, 'Currency': currency_symbol,
               'BW or Color': color, 'Month of Release': month,
               'MPAA': mpaa, 'Genre': genres, 'Year': year}
        # DataFrame.append no longer exists in pandas 2.x; concat is the
        # replacement and also works on older versions.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    except Exception as err:
        # The movie is skipped; report why and spit out the current state of
        # the variables to make the failure easy to diagnose.
        print('Failed on ' + imdbURL + ': ' + repr(err))
        quickTest = [title, rating, runtime, budget_amount, color, month, mpaa, genres]
        for value in quickTest:
            if value is not None:
                print(value)
    time.sleep(np.random.rand() * 1.5)

In [None]:
#helper functions
def findBudgetBW(soup):
    """Extract the budget and the color format from a parsed IMDB title page.

    Every ``<div class="txt-block">`` of *soup* is inspected; when several
    blocks qualify, the last one wins.

    Returns:
        A 3-tuple, in this order:
        1. the currency marker: the budget text with the "Budget:" label,
           "(estimated)", commas and digits removed (e.g. "$"), or '' when
           no budget amount was found;
        2. the budget amount as an int, or '' when no amount was found;
        3. "Color" or "Black and White" (whichever is listed first in the
           "Color:" block), or '' when neither is listed.
    """
    currency_symbol, amount, color = '', '', ''
    for block in soup.find_all("div", class_="txt-block"):
        text = block.text
        if "Budget" in text:
            cleaned = (text.replace(u'\n', '').replace('Budget:', '')
                       .replace('(estimated)', '').replace(',', '').strip())
            digits = ''.join(filter(str.isdigit, cleaned))
            # A block that merely mentions "Budget" without any number is
            # ignored instead of crashing on int('').
            if digits:
                amount = int(digits)
                currency_symbol = ''.join(
                    ch for ch in cleaned if not ch.isdigit()
                ).strip()
        if "Color" in text:
            formats = text.replace(u'\n', '').replace('Color:', '').strip()
            # Locate both labels and keep the one that appears first.
            hits = [(formats.find(label), label)
                    for label in ("Color", "Black and White")]
            hits = [hit for hit in hits if hit[0] != -1]
            color = min(hits)[1] if hits else ""
    return currency_symbol, amount, color
 

def findMonth(soup):
    """Return the month name of the first date found on a parsed IMDB page.

    Dates are expected in the form "14 October 1994". The
    ``<span class="attribute">`` elements are checked first (the date must
    be at the very start of the text); if none of them holds a date, the
    ``<div class="txt-block">`` elements are searched for a date anywhere in
    their text.

    Returns:
        The month word taken from the matched date (e.g. "October"), or ''
        when no date is found.
    """
    date_pattern = r'[0-9]{1,2} ([a-zA-Z]+) [0-9]{4}'
    for span in soup.find_all('span', class_='attribute'):
        dated = re.match(date_pattern, span.text)
        if dated:
            return dated.group(1)
    for block in soup.find_all('div', class_='txt-block'):
        # Take the month from the matched date itself rather than the first
        # word of the block, so the block's label can never be returned.
        dated = re.search(date_pattern, block.text)
        if dated:
            return dated.group(1)
    return ''

            
def findMPAAGenre(soup):
    """Pull the MPAA rating and the genres out of a parsed Box Office Mojo page.

    Looks at every ``<div class="a-section a-spacing-none">`` in *soup*. A
    div whose text contains "MPAA" supplies the rating (label removed,
    surrounding whitespace stripped); a div whose text contains "Genres"
    supplies the genres (label, newlines and spaces all removed). When
    several divs qualify, the last one wins.

    Returns:
        (mpaa, genre) as strings; each is '' when no matching div exists.
    """
    rating_text, genre_text = '', ''
    sections = soup.find_all("div", class_="a-section a-spacing-none")
    for section in sections:
        content = section.text
        if 'MPAA' in content:
            rating_text = content.replace('MPAA', '').strip()
        if 'Genres' in content:
            unlabeled = content.replace('Genres', '')
            # Every newline and space goes, so the genre names end up
            # joined together without a separator.
            compact = unlabeled.replace(u'\n', "").replace(' ', '')
            genre_text = compact.strip()
    return rating_text, genre_text

In [None]:
#saving to file
#in retrospect, could pickle these but whatever
#to_csv is a method of the DataFrame itself (pandas has no top-level pd.to_csv)
#and it returns None when given a path, so its result is not assigned to df
df.to_csv('dataFinal.csv')
#to reload a previously saved scrape instead of re-running it:
#df = pd.read_csv('dataFinal.csv')