# Scraping Movie Information
Extract movie names and IMDb IDs.

In [3]:
import pandas as pd
import json
import omdb
import re
from datetime import datetime

from selenium import webdriver
from bs4 import BeautifulSoup

**Set path for PhantomJS lightweight browser, as well as window size to be used**

In [4]:
# Location of the PhantomJS binary and the viewport size used when rendering pages.
PHANTOMJS_EXECUTABLE = 'C:/phantomjs-2.1.1-windows/bin/phantomjs.exe'
WINDOW_WIDTH, WINDOW_HEIGHT = 1024, 768

# Start the browser once; the same `driver` object is reused for every page load.
driver = webdriver.PhantomJS(executable_path=PHANTOMJS_EXECUTABLE)
driver.set_window_size(WINDOW_WIDTH, WINDOW_HEIGHT)

**Create a function to load the URL of the page to be scraped and parse its HTML**

In [5]:
def set_url(URL):
    """Load ``URL`` in the shared browser and return the page as parsed HTML.

    Navigates the module-level ``driver`` to ``URL``, then feeds the resulting
    ``driver.page_source`` to BeautifulSoup using the "lxml" parser.

    Parameters
    ----------
    URL : str
        Address of the page to open.

    Returns
    -------
    BeautifulSoup
        Parsed document for the page currently loaded in ``driver``.
    """
    driver.get(URL)
    rendered_html = driver.page_source
    return BeautifulSoup(rendered_html, "lxml")

**This will be the main scraping function**
1. Find the div with the class 'list compact' on the IMDb page
2. Look for the two pieces of information needed (names & IDs)
3. Combine the two generated lists into a DataFrame

In [6]:
def scrape_panel(soup):
    """Collect the title text and first link of every movie row on a list page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search (anything exposing ``find`` works).

    Returns
    -------
    pandas.DataFrame
        One row per ``<td class="title">`` cell inside the
        ``<div class="list compact">`` container, with two columns:

        * ``name`` -- the cell text encoded to ASCII (non-ASCII characters
          are dropped).
        * ``id`` -- the first ``<a>`` element inside the cell, or ``None``
          when the cell contains no link.

        The frame is empty (but still has both columns) when the container
        div is not present on the page.
    """
    col = soup.find('div', class_='list compact')

    # A page without the list container (e.g. a start offset beyond the last
    # entry) produces no rows instead of an AttributeError on None.
    cells = col.find_all("td", class_="title") if col is not None else []

    names = []
    imdbID = []
    # Single pass: every title cell supplies both the name and the link, so
    # the two lists always stay aligned.
    for cell in cells:
        # NOTE(review): 'ignore' silently strips accented characters from
        # titles; kept as-is because later cells may rely on ASCII names.
        names.append(cell.text.encode('ascii', 'ignore'))
        # find() yields None when there is no <a>, which is exactly what the
        # old ``find_all('a')[0]`` + bare ``except`` produced.
        imdbID.append(cell.find('a'))

    return pd.DataFrame({'name': names, 'id': imdbID})

**Generate links to scrape through**

The website lists the movies in pages of 250, so one link is generated per page. The generated links cover up to 10,000 movies (40 pages), which is roughly how many movies the linked list contains.

In [7]:
# Page URLs to scrape; starts empty and is populated by the link-generation cell.
links = []

In [8]:
# Build the list in one assignment so re-running this cell always yields the
# same 40 URLs instead of appending duplicates to the existing list.
# Offsets run 1, 251, 501, ... because each page shows 250 entries.
links = [
    'http://www.imdb.com/list/ls057823854/?start=' + str(start) + '&view=compact&sort=listorian:asc'
    for start in range(1, 10000, 250)
]

**Scrape through all the generated links and save the file**

In [21]:
startTime = datetime.now()

# Gather each page's frame and concatenate once at the end; concatenating
# inside the loop re-copies every previously scraped row on each iteration.
pages = []
for counter, link in enumerate(links, 1):
    pages.append(scrape_panel(set_url(link)))
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("Finished page %d" % counter)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

print("SCRIPT RUNTIME : %s" % (datetime.now() - startTime))

# Persist the raw scrape so later steps do not need to hit the site again.
file_name = '../project-6-apis-randomforests/data.csv'
data.to_csv(file_name, index = False, encoding='utf-8')

Finished page 1
Finished page 2
Finished page 3
Finished page 4
Finished page 5
Finished page 6
Finished page 7
Finished page 8
Finished page 9
Finished page 10
Finished page 11
Finished page 12
Finished page 13
Finished page 14
Finished page 15
Finished page 16
Finished page 17
Finished page 18
Finished page 19
Finished page 20
Finished page 21
Finished page 22
Finished page 23
Finished page 24
Finished page 25
Finished page 26
Finished page 27
Finished page 28
Finished page 29
Finished page 30
Finished page 31
Finished page 32
Finished page 33
Finished page 34
Finished page 35
Finished page 36
Finished page 37
Finished page 38
Finished page 39
Finished page 40
SCRIPT RUNTIME : 0:07:20.359000


In [35]:
# Sanity check on a single row: list every run of "t" characters followed by
# digits found in the string form of the first scraped id.
re.findall(r'[t]*[t]\d+',str(data['id'][0]))

['tt0110912']

In [37]:
# Cast the id column to str so a regex can be run over each value
# (overwrites the column in place).
data['id'] = data['id'].astype(str)

**Create a function to parse out all the extra characters pulled from the id**

In [59]:
def get_id(string):
    """Return the first id-like token (e.g. ``'tt0110912'``) found in ``string``.

    The token is one or more ``t`` characters immediately followed by one or
    more digits.

    Parameters
    ----------
    string : str
        Text to search, such as the string form of a scraped link element.

    Returns
    -------
    str or None
        The first matching token, or ``None`` when ``string`` contains no
        match (previously this case raised ``IndexError``).
    """
    # search() stops at the first hit, which is the element findall()[0] returned.
    match = re.search(r'[t]*[t]\d+', string)
    return match.group(0) if match else None

In [60]:
# Replace each value in the id column with what get_id extracts from it
# (overwrites the column in place).
data['id'] = data['id'].apply(get_id)

In [61]:
# Preview the first rows to check the contents of the id and name columns.
data.head()

Unnamed: 0,id,name
0,tt0110912,Pulp Fiction
1,tt1872181,The Amazing Spider-Man 2
2,tt0111161,The Shawshank Redemption
3,tt0076759,Star Wars: Episode IV - A New Hope
4,tt0088763,Back to the Future
