# IMDB Scraping

In [None]:
from requests import get

# Search-results page requested with release_date=2017, sorted by number of
# votes in descending order, first page.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

# Without a timeout, requests waits forever on a stalled connection and the
# cell never finishes; fail after 10 seconds instead.
response = get(url, timeout=10)

# Preview only the first 500 characters of the returned markup.
print(response.text[:500])

In [None]:
from IPython.display import Image

# Show the image file inline, scaled to 500x300. The call stays the last
# expression of the cell so the notebook renders its value.
Image(
    filename='img/img5.png',
    width=500,
    height=300,
)

## Prepare URL And Libraries

In [None]:
from requests import get
from bs4 import BeautifulSoup

# Search-results page requested with release_date=2017, sorted by number of
# votes in descending order, first page.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = get(url, timeout=10)

# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page and
# hitting a confusing IndexError in a later cell.
response.raise_for_status()

# Parse the markup with Python's built-in HTML parser.
html_soup = BeautifulSoup(response.text, 'html.parser')

## Get First Movie

In [None]:
# Every <div> whose class attribute is 'lister-item mode-advanced'.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)

# Dump the first matching container as indented HTML for inspection.
first_movie = movie_containers[0]
print(first_movie.prettify())

## Get Title

In [None]:
# Every <div> whose class attribute is 'lister-item mode-advanced'.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)
first_movie = movie_containers[0]

# The title is the text of the first <a> inside the container's first <h3>.
movie_title = first_movie.find('h3').find('a').get_text()
movie_title

## Year of Movie Release

In [None]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

first_movie = movie_containers[0]

# Match on the single 'lister-item-year' class, as the full extraction loop
# does. A multi-class string is compared against the exact attribute value,
# so it stops matching as soon as the classes are reordered or changed.
movie_year = first_movie.h3.find('span', class_ = 'lister-item-year').text
movie_year

## IMDB Rating

In [None]:
# Every <div> whose class attribute is 'lister-item mode-advanced'.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)
first_movie = movie_containers[0]

# The rating is read from the first <strong> tag in the container and
# converted to a float.
movie_imdb = float(first_movie.find('strong').get_text())
movie_imdb

## Metascore

In [None]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

first_movie = movie_containers[0]

# Match on the 'metascore' class alone, as the full extraction loop does.
# Requiring the exact string 'metascore favorable' finds nothing when the
# span carries a different second class.
movie_mscore = first_movie.find('span', class_ = 'metascore')

# A container may have no Metascore span at all; keep None in that case
# instead of raising AttributeError on `.text`.
if movie_mscore is not None:
    # int() ignores surrounding whitespace in the span text.
    movie_mscore = int(movie_mscore.text)
print(movie_mscore)

## Number Of Votes

In [None]:
# Every <div> whose class attribute is 'lister-item mode-advanced'.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)
first_movie = movie_containers[0]

# The vote count sits in the first <span> whose HTML attribute name="nv".
# The filter goes through `attrs` because `name` is already find()'s
# tag-name parameter.
movie_votes = first_movie.find(name='span', attrs={'name': 'nv'})
print(movie_votes.get_text())

## Put It All Together

In [None]:
# Every <div> whose class attribute is 'lister-item mode-advanced'.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)

# One list per output column; entries at the same index belong to the
# same movie.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

for item in movie_containers:

    # Movies without a Metascore block are left out entirely.
    if item.find('div', class_='ratings-metascore') is None:
        continue

    header = item.h3

    # Title: text of the link inside the heading.
    names.append(header.a.text)

    # Release year: text of the year span inside the heading.
    years.append(header.find('span', class_='lister-item-year').text)

    # IMDB rating: first <strong> tag, as a float.
    imdb_ratings.append(float(item.strong.text))

    # Metascore: text of the metascore span, as an int.
    metascores.append(int(item.find('span', class_='metascore').text))

    # Vote count: taken from the 'data-value' attribute of the name="nv"
    # span rather than from its displayed text.
    votes_tag = item.find('span', attrs={'name': 'nv'})
    votes.append(int(votes_tag['data-value']))

# Print each column list on its own line.
for column in (names, years, imdb_ratings, metascores, votes):
    print(column)

## Put Into DataFrame

In [None]:
import pandas as pd

# Map each column label to the list collected for it; the insertion order
# here is the column order of the resulting frame.
movie_columns = {
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
}
movie = pd.DataFrame(movie_columns)

# info() prints its summary itself and returns None, so this also prints
# a trailing "None".
print(movie.info())

# Last expression of the cell: the notebook renders the first rows.
movie.head()

## Save Into CSV

In [None]:
import csv

# Append the scraped rows to imdb_movies.csv (created if it does not exist).
# No header row is written, and re-running the cell appends the same rows
# again.
#
# - The `with` block closes the file even if a write fails, so buffered rows
#   are flushed to disk and the handle is not leaked.
# - newline='' is required by the csv module; without it extra blank lines
#   appear between rows on Windows.
# - An explicit UTF-8 encoding keeps titles with non-ASCII characters from
#   depending on the platform's default encoding.
with open('imdb_movies.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    # Each row is written as one CSV record in column order; the DataFrame
    # index is not written.
    for _index, row in movie.iterrows():
        csvWriter.writerow(row)


print(movie.info())
movie.head()