# IMDb
    IMDb (Internet Movie Database) is an online database of information related to films,
    television programs, home videos, video games, and streaming content online – including cast,
    production crew and personal biographies, plot summaries, trivia, ratings, and fan and 
    critical reviews.

## Importing Libraries

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Sending request to the IMDb website

In [17]:
# setting the url
url = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc"

# sending request to the url
res_obj = requests.get(url)

# checking the request status
print(f"Request Status: {res_obj.status_code}")

# checking the response html content 
data = res_obj.text
data

Request Status: 200


'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>IMDb &quot;Top 250&quot;\n(Sorted by IMDb Rating Descending) - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>

## Parsing the Data using BeautifulSoup

In [18]:
soup = BeautifulSoup(data, 'html.parser')
print(soup)


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>IMDb "Top 250"
(Sorted by IMDb Rating Descending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="

In [22]:
# finding an HTML element from the data

header = soup.find('h1', {'class' : 'header'})

# fetching the entire element
print(header)

# fetching the text inside the element
header.text

<h1 class="header">IMDb "Top 250"
(Sorted by IMDb Rating Descending) </h1>


'IMDb "Top 250"\n(Sorted by IMDb Rating Descending) '

In [23]:
## Creating a List of all the movie data\
data_divs = soup.find_all('div', {'class' : 'lister-item mode-advanced'})

data_list = list()
for data_div in data_divs:
    data_dict = dict()
    data_dict['Rank'] =  data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[0]
    data_dict['Movie'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[1]
    data_dict['Year'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[2].replace('(', '').replace(')', '')
    try:
        data_dict['Certificate'] = data_div.find('span', {'class' : 'certificate'}).text.strip()
    except:
        data_dict['Certificate'] = None
    data_dict['Runtime'] = data_div.find('span', {'class' : 'runtime'}).text.strip().replace('\n', '')
    data_dict['Genre'] = data_div.find('span', {'class' : 'genre'}).text.strip()
    data_dict['Rating'] = data_div.find('strong').text.strip()
    data_list.append(data_dict)
    
data_list

[{'Rank': '1.',
  'Movie': 'The Shawshank Redemption',
  'Year': '1994',
  'Certificate': 'A',
  'Runtime': '142 min',
  'Genre': 'Drama',
  'Rating': '9.3'},
 {'Rank': '2.',
  'Movie': 'The Godfather',
  'Year': '1972',
  'Certificate': 'A',
  'Runtime': '175 min',
  'Genre': 'Crime, Drama',
  'Rating': '9.2'},
 {'Rank': '3.',
  'Movie': 'The Dark Knight',
  'Year': '2008',
  'Certificate': 'UA',
  'Runtime': '152 min',
  'Genre': 'Action, Crime, Drama',
  'Rating': '9.0'},
 {'Rank': '4.',
  'Movie': 'The Godfather: Part II',
  'Year': '1974',
  'Certificate': 'A',
  'Runtime': '202 min',
  'Genre': 'Crime, Drama',
  'Rating': '9.0'},
 {'Rank': '5.',
  'Movie': 'The Lord of the Rings: The Return of the King',
  'Year': '2003',
  'Certificate': '16',
  'Runtime': '201 min',
  'Genre': 'Adventure, Drama, Fantasy',
  'Rating': '8.9'},
 {'Rank': '6.',
  'Movie': 'Pulp Fiction',
  'Year': '1994',
  'Certificate': 'A',
  'Runtime': '154 min',
  'Genre': 'Crime, Drama',
  'Rating': '8.9'},
 

## Creating a Pandas DataFrame

In [24]:
df = pd.DataFrame(data_list)
df

Unnamed: 0,Rank,Movie,Year,Certificate,Runtime,Genre,Rating
0,1.0,The Shawshank Redemption,1994,A,142 min,Drama,9.3
1,2.0,The Godfather,1972,A,175 min,"Crime, Drama",9.2
2,3.0,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0
3,4.0,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0
4,5.0,The Lord of the Rings: The Return of the King,2003,16,201 min,"Adventure, Drama, Fantasy",8.9
5,6.0,Pulp Fiction,1994,A,154 min,"Crime, Drama",8.9
6,7.0,Schindler's List,1993,A,195 min,"Biography, Drama, History",8.9
7,8.0,12 Angry Men,1957,,96 min,"Crime, Drama",8.9
8,9.0,Inception,2010,UA,148 min,"Action, Adventure, Sci-Fi",8.8
9,10.0,Fight Club,1999,A,139 min,Drama,8.8


In [None]:
# saving the df

df.to_excel('Top 50 MOvies with IMDb Rating.xlsx')

## Scraping all 250 movies

In [25]:
params_list = ['1', '51', '101', '151', '201']

data_list = list()
for start in params_list:
    url = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc"
    params = {
        "start" : start
    }
    print(start)
    res_obj = requests.get(url, params=params)
    data = res_obj.text
    soup = BeautifulSoup(data, 'html.parser')
    
    data_divs = soup.find_all('div', {'class' : 'lister-item mode-advanced'})


    for data_div in data_divs:
        data_dict = dict()
        data_dict['Rank'] =  data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[0]
        data_dict['Movie'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[1]
        data_dict['Year'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[2].replace('(', '').replace(')', '')
        try:
            data_dict['Certificate'] = data_div.find('span', {'class' : 'certificate'}).text.strip()
        except:
            data_dict['Certificate'] = None
        data_dict['Runtime'] = data_div.find('span', {'class' : 'runtime'}).text.strip().replace('\n', '')
        data_dict['Genre'] = data_div.find('span', {'class' : 'genre'}).text.strip()
        data_dict['Rating'] = data_div.find('strong').text.strip()
        data_list.append(data_dict)


df = pd.DataFrame(data_list)
df

1
51
101
151
201


Unnamed: 0,Rank,Movie,Year,Certificate,Runtime,Genre,Rating
0,1.,The Shawshank Redemption,1994,A,142 min,Drama,9.3
1,2.,The Godfather,1972,A,175 min,"Crime, Drama",9.2
2,3.,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0
3,4.,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0
4,5.,The Lord of the Rings: The Return of the King,2003,16,201 min,"Adventure, Drama, Fantasy",8.9
...,...,...,...,...,...,...,...
245,246.,Before Sunset,2004,R,80 min,"Drama, Romance",8.0
246,247.,"Monsters, Inc.",2001,G,92 min,"Animation, Adventure, Comedy",8.0
247,248.,Aladdin,1992,U,90 min,"Animation, Adventure, Comedy",8.0
248,249.,The Terminator,1984,A,107 min,"Action, Sci-Fi",8.0


In [None]:
df.to_excel('Top 250 MOvies with IMDb Rating.xlsx')

# Selenium

## What is Selenium?
    Selenium is a Web Browser Automation Tool.
    Primarily, it is for automating web applications for testing purposes,
    but is certainly not limited to just that.
    It allows you to open a browser of your choice & perform tasks as a human being would, such as:
        - Clicking buttons
        - Entering information in forms
        - Searching for specific information on the web pages
     
### Installing selenium:
    pip install selenium
        
## What is Geckodriver?
    GeckoDriver is a web browser engine which is used in many applications
    developed by Mozilla Foundation and the Mozilla Corporation.
    GeckoDriver is the link between your tests in Selenium and the Firefox browser.
    
    The main purpose of the GeckoDriver/ChromeDriver is to launch FireFox/Google Chrome.

### Installing Geckodriver / Chromedriver
    Link: https://www.seleniumeasy.com/python/getting-started-selenium-webdriver-using-python

In [26]:
from selenium import webdriver

browser = webdriver.Firefox()
browser.get("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc")
data = browser.page_source
browser.close()
data

'<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" class=" scriptsOn"><head><script async="" src="https://images-na.ssl-images-amazon.com/images/I/31BVuidgT8L.js" crossorigin="anonymous"></script><script async="" src="https://m.media-amazon.com/images/G/01/imdbads/custom/test/index/js/ad-plugins/showadv2.js" crossorigin="anonymous"></script>\n        \n<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>\n<script type="text/javascript">\nwindow.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;\nif (window.ue_ihb === 1) {\n\nvar ue_csm = window,\n    ue_hob = +new Date();\n(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.stub=function(b,a){if(!b[a]){var c=[];b[a]=function(){c.push([c.slice.call(arguments),e.d(),d.ue_id])};b[a].replay=function(b){for(var a;a=c.shift();)b(a[0],a[1],a[2])};b[a].isStub=1}};e.exec=function(b,a){return function(){try{return b.apply(this,argumen

In [8]:
soup = BeautifulSoup(data, 'html.parser')
data_divs = soup.find_all('div', {'class' : 'lister-item mode-advanced'})


data_list = list()
for data_div in data_divs:
    data_dict = dict()
    data_dict['Rank'] =  data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[0]
    data_dict['Movie'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[1]
    data_dict['Year'] = data_div.find('h3', {'class' : 'lister-item-header'}).text.strip().split('\n')[2].replace('(', '').replace(')', '')
    try:
        data_dict['Certificate'] = data_div.find('span', {'class' : 'certificate'}).text.strip()
    except:
        data_dict['Certificate'] = None
    data_dict['Runtime'] = data_div.find('span', {'class' : 'runtime'}).text.strip().replace('\n', '')
    data_dict['Genre'] = data_div.find('span', {'class' : 'genre'}).text.strip()
    data_dict['Rating'] = data_div.find('strong').text.strip()
    data_list.append(data_dict)
    
df = pd.DataFrame(data_list)
df

Unnamed: 0,Rank,Movie,Year,Certificate,Runtime,Genre,Rating
0,1.0,The Shawshank Redemption,1994,R,142 min,Drama,9.3
1,2.0,The Godfather,1972,R,175 min,"Crime, Drama",9.2
2,3.0,The Dark Knight,2008,PG-13,152 min,"Action, Crime, Drama",9.0
3,4.0,The Godfather: Part II,1974,R,202 min,"Crime, Drama",9.0
4,5.0,The Lord of the Rings: The Return of the King,2003,PG-13,201 min,"Adventure, Drama, Fantasy",8.9
5,6.0,Pulp Fiction,1994,R,154 min,"Crime, Drama",8.9
6,7.0,Schindler's List,1993,R,195 min,"Biography, Drama, History",8.9
7,8.0,12 Angry Men,1957,Approved,96 min,"Crime, Drama",8.9
8,9.0,Inception,2010,PG-13,148 min,"Action, Adventure, Sci-Fi",8.8
9,10.0,Fight Club,1999,R,139 min,Drama,8.8


## Writing Selenium Scripts

In [27]:
import time

browser = webdriver.Firefox()
browser.get("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc")
time.sleep(5)

element = browser.find_element_by_xpath("//div[@class='ipc-button__text']")
element.click()