# Scrape movie information from IMDb using Python and Beautiful Soup

This project demonstrates how to scrape imdb.com to retrieve information about movies of various genres using Python, Beautiful Soup, and Requests.
Web scraping is a method that automates the process of gathering data from webpages and storing it in appropriate forms. Web scraping allows you to quickly retrieve a significant quantity of information from many websites.
#### Why scrape IMDb?
If you want to design a movie recommendation engine that recommends movies based on your preferences, you'll need data sets of various movies from various genres, including their name, rating, release year, Metascore, genre, brief description, movie certificate, votes, and so on.
#### Procedures:
- Import the necessary modules.
- Obtain the URLs for the various genres.
- Using Beautiful Soup and requests, parse the website with the URL containing movies of various genres.
- Extract information such as the title, genre, year of release, rating, certificate, Metascore, votes, and so on.
- Convert all data into a pandas data frame and save as a CSV file.

#### Import the necessary modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like HTTP headers that are passed to every requests.get() call.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

#### Obtaining URLs from various genres.

In [2]:
# Genre names that get substituted into the IMDb search URL.
genres = [
    "Adventure", "Animation", "Biography", "Comedy",
    "Crime", "Drama", "Family", "Fantasy",
    "Film-Noir", "History", "Horror", "Music",
    "Musical", "Mystery", "Romance", "Sci-Fi",
    "Sport", "Thriller", "War", "Western",
]

# Search-URL template; the "{}" placeholder receives the genre name.
# Adjacent string literals are joined by Python into one single string.
url = (
    "https://www.imdb.com/search/title/"
    "?genres={}"
    "&sort=user_rating,desc"
    "&title_type=feature"
    "&num_votes=25000,"
    "&pf_rd_m=A2FGELUUNOQJNL"
    "&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3"
    "&pf_rd_r=N97GEQS6R7J9EV7V770D"
    "&pf_rd_s=right-6"
    "&pf_rd_t=15506"
    "&pf_rd_i=top"
    "&ref_=chttp_gnr_16"
)

# Map every genre name to its fully formatted search URL.
url_dict = {genre: url.format(genre) for genre in genres}

print(url_dict)

{'Adventure': 'https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Animation': 'https://www.imdb.com/search/title/?genres=Animation&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Biography': 'https://www.imdb.com/search/title/?genres=Biography&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16', 'Comedy': 'https://www.imdb.com/search/title/?genres=Comedy&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5

#### Using Beautiful Soup and requests, parse the website with the URL containing movies of various genres.

In [3]:
url = "https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"

# Sending a request to the specified URL. The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() turns an
# HTTP error response into an exception instead of parsing an error page.
resp = requests.get(url, headers=HEADERS, timeout=30)
resp.raise_for_status()

# Converting the response to a Beautiful Soup object
content = BeautifulSoup(resp.content, 'lxml')

# (dictionary key, CSS selector, index of the matching element to read)
fields = [
    ("title", '.lister-item-header', 0),
    ("year", '.lister-item-year', 0),
    ("certificate", '.certificate', 0),
    ("time", '.runtime', 0),
    ("genre", '.genre', 0),
    ("rating", '.ratings-imdb-rating', 0),
    ("metascore", '.ratings-metascore', 0),
    ("simple_desc", '.text-muted', 2),
    ("votes", '.sort-num_votes-visible', 0),
]

# Iterating through the list of movies
for movie in content.select('.lister-item-content'):

    try:
        # Creating a Python dictionary holding the stripped text of each field
        data = {
            key: movie.select(selector)[index].get_text().strip()
            for key, selector, index in fields
        }
    except IndexError:
        # A selector matched too few elements, i.e. this entry lacks one of
        # the fields; such entries are skipped entirely.
        continue

    print(data)

{'title': '1.\nThe Lord of the Rings: The Return of the King\n(2003)', 'year': '(2003)', 'certificate': 'PG', 'time': '201 min', 'genre': 'Action, Adventure, Drama', 'rating': '9.0', 'metascore': '94        \n        Metascore', 'simple_desc': "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring.", 'votes': 'Votes:\n1,783,014\n| Gross:\n$377.85M'}
{'title': '2.\nInception\n(2010)', 'year': '(2010)', 'certificate': 'PG-13', 'time': '148 min', 'genre': 'Action, Adventure, Sci-Fi', 'rating': '8.8', 'metascore': '74        \n        Metascore', 'simple_desc': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.', 'votes': 'Votes:\n2,277,469\n| Gross:\n$292.58M'}
{'title': '3.\nThe Lord of the Rings: The Two Towers\n(2002)', 'year':

{'title': '33.\nHauru no ugoku shiro\n(2004)', 'year': '(2004)', 'certificate': 'PG', 'time': '119 min', 'genre': 'Animation, Adventure, Family', 'rating': '8.2', 'metascore': '80        \n        Metascore', 'simple_desc': 'When an unconfident young woman is cursed with an old body by a spiteful witch, her only chance of breaking the spell lies with a self-indulgent yet insecure young wizard and his companions in his legged, walking castle.', 'votes': 'Votes:\n381,149\n| Gross:\n$4.71M'}
{'title': '35.\nJurassic Park\n(1993)', 'year': '(1993)', 'certificate': 'PG', 'time': '127 min', 'genre': 'Action, Adventure, Sci-Fi', 'rating': '8.2', 'metascore': '68        \n        Metascore', 'simple_desc': "A pragmatic paleontologist touring an almost complete theme park on an island in Central America is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose.", 'votes': 'Votes:\n950,511\n| Gross:\n$402.45M'}
{'title': '36.\nIndiana Jones 

#### Extract information such as the title, genre, year of release, rating, certificate, Metascore, votes, and so on.

In [4]:
import time

# (column name, CSS selector, index of the matching element to read)
_MOVIE_FIELDS = (
    ("title", '.lister-item-header', 0),
    ("year", '.lister-item-year', 0),
    ("certificate", '.certificate', 0),
    ("time", '.runtime', 0),
    ("genre", '.genre', 0),
    ("rating", '.ratings-imdb-rating', 0),
    ("metascore", '.ratings-metascore', 0),
    ("simple_desc", '.text-muted', 2),
    ("votes", '.sort-num_votes-visible', 0),
)


def _parse_movie(movie):
    """Return the field dictionary of one result element, or None if a field is missing."""
    try:
        return {
            key: movie.select(selector)[index].get_text().strip()
            for key, selector, index in _MOVIE_FIELDS
        }
    except IndexError:
        # A selector matched too few elements for this entry.
        return None


def get_movies(url, interval, file_name):
    """Download one search-result page and save its movies to a CSV file.

    Every '.lister-item-content' element of the page becomes one row whose
    columns are the keys listed in ``_MOVIE_FIELDS``. Entries for which any
    of the fields cannot be found are left out of the file.

    Args:
        url: Address of the page to download.
        interval: Number of seconds to wait before sending the request, so
            that repeated calls are spaced apart.
        file_name: Path of the CSV file to write.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status code.
        requests.exceptions.Timeout: If the server does not respond within
            the timeout.
    """
    # Wait once per request. Sleeping while iterating over the elements of
    # an already downloaded page would only slow down the parsing without
    # spacing out the requests themselves.
    time.sleep(interval)

    # A timeout prevents an indefinite hang on a stalled connection, and
    # raise_for_status() avoids writing a CSV parsed from an error page.
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    content = BeautifulSoup(resp.content, 'lxml')

    parsed = (_parse_movie(movie) for movie in content.select('.lister-item-content'))
    movie_list = [data for data in parsed if data is not None]

    dataframe = pd.DataFrame(movie_list)
    dataframe.to_csv(file_name)

# IMDb search page for the Adventure genre. Adjacent string literals are
# joined by Python into one single URL string.
url = (
    "https://www.imdb.com/search/title/"
    "?genres=Adventure"
    "&sort=user_rating,desc"
    "&title_type=feature"
    "&num_votes=25000,"
    "&pf_rd_m=A2FGELUUNOQJNL"
    "&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3"
    "&pf_rd_r=N97GEQS6R7J9EV7V770D"
    "&pf_rd_s=right-6"
    "&pf_rd_t=15506"
    "&pf_rd_i=top"
    "&ref_=chttp_gnr_16"
)

# Calling the function with no waiting interval
get_movies(url=url, interval=0, file_name='Adventure_movies.csv')

#### Convert all data into a pandas data frame and save as a CSV file.

In [None]:
# Process every genre in turn, passing 1 as the interval, and report each
# CSV file once get_movies() has returned.
for genre, url in url_dict.items():
    csv_name = genre + '.csv'
    get_movies(url, 1, csv_name)
    print("Saved:", csv_name)

Saved: Adventure.csv
Saved: Animation.csv
Saved: Biography.csv
Saved: Comedy.csv
Saved: Crime.csv
Saved: Drama.csv
Saved: Family.csv
Saved: Fantasy.csv
Saved: Film-Noir.csv
Saved: History.csv
Saved: Horror.csv
