In [None]:
!pip install selenium

In [None]:
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import requests
import numpy as np
import urllib.parse

## Selenium Driver

### IMDB Movies

Selenium used to click through the pages and choose the elements that are wanted for scraping

In [9]:
######## SELENIUM

# Create driver and then use to open the webpage
driver = webdriver.Chrome("C:/Chrome/chromedriver.exe")
driver.get("https://www.imdb.com")

driver.maximize_window()

dropdown = driver.find_element_by_class_name('ipc-icon--arrow-drop-down')
dropdown.click()

# Click on advanced search
element = driver.find_element_by_link_text("Advanced Search")
element.click()

# Click on advanced title search
adv_title = driver.find_element_by_link_text("Advanced Title Search")
adv_title.click()

# Select feature film
feature_film = driver.find_element_by_id("title_type-1")
feature_film.click()

# Select tv film
tv_film = driver.find_element_by_id("title_type-2")
tv_film.click()

# Min release date
min_date = driver.find_element_by_name('release_date-min')
min_date.click()
min_date.send_keys('1990')

# Max release date
max_date = driver.find_element_by_name('release_date-max')
max_date.click()
max_date.send_keys('2020')

# User rating (import the select class from selenium)
rating_min = driver.find_element_by_name('user_rating-min')
rating_min.click()
dropdown2 = Select(rating_min)
dropdown2.select_by_visible_text('1.0')

rating_max = driver.find_element_by_name('user_rating-max')
rating_max.click()
dropdown3 = Select(rating_max)
dropdown3.select_by_visible_text('10')

# Oscar nominated
oscar_nominated = driver.find_element_by_id('groups-7')
oscar_nominated.click()

# Colour information
colours = driver.find_element_by_id('colors-1')
colours.click()

# Language english (no click needed)
language = driver.find_element_by_name('languages')
dropdown4 = Select(language)
dropdown4.select_by_visible_text('English')

# Display options (250 per page)
results_count = driver.find_element_by_id('search-count')
dropdown5 = Select(results_count)
dropdown5.select_by_index(2)

# Click the search button
search = driver.find_element_by_xpath('(//button[@type = "submit"])[2]')
search.click()

current_url = driver.current_url

############ BEAUTIFUL SOUP

# Get webpage as a response object
response = requests.get(current_url)

# Create soup object with html parser
soup = BeautifulSoup(response.content, "html.parser")

# Get all the movie details
results = soup.find_all('div',{'class':'lister-item'})

title = []
year = []
duration = []
genre = []
rating = []

for result in results:
    title.append(result.find('h3').find('a').get_text())
    year.append(result.find('h3').find('span',{'class':'lister-item-year'}).get_text().replace("(","").replace(")",""))
    duration.append(result.find('span',{'class':'runtime'}).get_text())
    genre.append(result.find('span',{'class':'genre'}).get_text().strip())
    rating.append(result.find('div',{'class':'inline-block ratings-imdb-rating'}).get_text().strip())
    
movies = pd.DataFrame({'title':title,'year':year,'duration':duration,'genre':genre,'rating':rating})
movies              

Unnamed: 0,title,year,duration,genre,rating
0,The Matrix,1999,136 min,"Action, Sci-Fi",8.7
1,News of the World,2020,118 min,"Action, Adventure, Drama",6.8
2,Once Upon a Time... In Hollywood,2019,161 min,"Comedy, Drama",7.6
3,Harry Potter and the Philosopher's Stone,2001,152 min,"Adventure, Family, Fantasy",7.6
4,Knives Out,2019,130 min,"Comedy, Crime, Drama",7.9
...,...,...,...,...,...
245,The Trial of the Chicago 7,2020,129 min,"Drama, History, Thriller",7.8
246,A Beautiful Mind,2001,135 min,"Biography, Drama",8.2
247,Munich,2005,164 min,"Action, Drama, History",7.5
248,BlacKkKlansman,2018,135 min,"Biography, Comedy, Crime",7.5


### Wikipedia

### Use X-path to get the table

In [7]:
website = 'https://en.wikipedia.org/wiki/List_of_future_tallest_buildings'

# Set driver
driver = webdriver.Chrome("C:/Chrome/chromedriver.exe")
driver.get(website)
driver.maximize_window()

# Make empty dataframe to fill with the locator lists
tallest_buildings = pd.DataFrame(columns = ["Building","City","Country","Floors","Year of Completion"])

# Find locators
building = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[2]')
city = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[3]')
country = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[4]')
height = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[5]')
floors = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[6]')
completion_year = driver.find_elements_by_xpath('//table[@class ="wikitable sortable jquery-tablesorter"][1]/tbody/tr/td[7]')

# append to a dataframe
for i in range(len(building)):
    tallest_buildings = tallest_buildings.append({'Building':building[i].text, 'City': city[i].text, 'Country':country[i].text, 'Height(m)':height[i].text, 
                              'Floors':floors[i].text, 'Year of Completion':completion_year[i].text},ignore_index = True)

In [8]:
tallest_buildings

Unnamed: 0,Building,City,Country,Floors,Year of Completion,Height(m)
0,Jeddah Tower,Jeddah,Saudi Arabia,"3,307 ft",167,1008 m
1,Burj Mubarak al-Kabir,"Subiya, Kuwait",Kuwait,"3,284 ft",234,1001 m
2,Dubai Creek Tower,Dubai,UAE,"2,717 ft",27,828 m
3,Tradewinds Square,Kuala Lumpur,Malaysia,"2,542 ft",150,775 m
4,Dubai One Tower,Dubai,UAE,"2,333 ft",161,711 m
...,...,...,...,...,...,...
78,Nanjing Olympic Suning Tower,Nanjing,China,"1,379 ft",99,420 m
79,Ningbo Central Plaza Tower 1,Ningbo,China,"1,342 ft",80,409 m
80,Eye of Spring Trade Center 1,Kunming,China,"1,336 ft",100,407 m
81,Bein Arim Tower,Tel Aviv,Israel,"1,312 ft",100,400 m
