# Web Scrapping dengan Request dan Beautiful Soap

## Scraping data film

In [15]:
import requests
from bs4 import BeautifulSoup
# Mendapatkan page HTML melalui request
halaman = requests.get('https://www.imdb.com/chart/top/',
headers={'User-Agent': 'Mozilla/5.0'})
# Parsing konten halaman menggunakan beautifulsoup
halaman_parsed = BeautifulSoup(halaman.content, 'html.parser')
juduls = halaman_parsed.select("ul li a.ipc-title-link-wrapper h3")

movies = halaman_parsed.select("ul li.ipc-metadata-list-summary-item")
hasil_data = []
for movie in movies:
    judul = movie.select('a.ipc-title-link-wrapper h3')[0].get_text()
    tahun = movie.select('div.cli-title-metadata span')[0].get_text()
    durasi = movie.select('div.cli-title-metadata span')[1].get_text()
    rating = movie.select('span.ipc-rating-star')[0].get("aria-label").strip('IMDb rating: ') #mengambil atribut tag aria-label
    hasil_data.append({"judul": judul, "tahun": tahun, "durasi": durasi,"rating": rating})

[{'judul': '1. The Shawshank Redemption', 'tahun': '1994', 'durasi': '2h 22m', 'rating': '9.3'}, {'judul': '2. The Godfather', 'tahun': '1972', 'durasi': '2h 55m', 'rating': '9.2'}, {'judul': '3. The Dark Knight', 'tahun': '2008', 'durasi': '2h 32m', 'rating': '9.0'}, {'judul': '4. The Godfather: Part II', 'tahun': '1974', 'durasi': '3h 22m', 'rating': '9.0'}, {'judul': '5. 12 Angry Men', 'tahun': '1957', 'durasi': '1h 36m', 'rating': '9.0'}, {'judul': "6. Schindler's List", 'tahun': '1993', 'durasi': '3h 15m', 'rating': '9.0'}, {'judul': '7. The Lord of the Rings: The Return of the King', 'tahun': '2003', 'durasi': '3h 21m', 'rating': '9.0'}, {'judul': '8. Pulp Fiction', 'tahun': '1994', 'durasi': '2h 34m', 'rating': '8.9'}, {'judul': '9. The Lord of the Rings: The Fellowship of the Ring', 'tahun': '2001', 'durasi': '2h 58m', 'rating': '8.9'}, {'judul': '10. The Good, the Bad and the Ugly', 'tahun': '1966', 'durasi': '2h 41m', 'rating': '8.8'}, {'judul': '11. Forrest Gump', 'tahun': '

## Import Poster ke dalam folder imdb-images

In [17]:
from os.path import basename
movies = halaman_parsed.select("ul li.ipc-metadata-list-summary-item")
hasil_data = []
for movie in movies:
  judul = movie.select('a.ipc-title-link-wrapper h3')[0].get_text()
  tahun = movie.select('div.cli-title-metadata span')[0].get_text()
  durasi = movie.select('div.cli-title-metadata span')[1].get_text()
  rating = movie.select('span.ipc-rating-star')[0].get("aria-label").strip('IMDb rating: ')
  poster = movie.select('img.ipc-image')[0].get("src")
  hasil_data.append({"judul": judul, "tahun": tahun, "durasi": durasi, "rating": rating, "poster": poster})
  
  with open('./content/imdb-images/' +basename(poster), 'wb') as f:
    f.write(requests.get(poster).content)

## Import data scraping dalam bentuk CSV

In [19]:
import pandas as pd

hasil_data_df = pd.DataFrame(hasil_data)
hasil_data_df.to_csv('./content/top_movies.csv', index=False)

# Web Scraping dengan Scrappy

## Memindahkan active directory

In [25]:
import scrapy
import os
os.chdir('./scrapy-imdb')

FileNotFoundError: [WinError 2] The system cannot find the file specified: './scrapy-imdb'

## Mengecek Active directory

In [33]:
pwd

'd:\\Latihan\\Latihan\\.vscode\\Python\\project-based-learning-python\\Scraping Movie\\scrapy-imdb'

## Membuat Projek Scrappy

In [28]:
!scrapy startproject scraping_movies

New Scrapy project 'scraping_movies', using template directory 'C:\Users\ASUS\AppData\Roaming\Python\Python39\site-packages\scrapy\templates\project', created in:
    D:\Latihan\Latihan\.vscode\Python\project-based-learning-python\Scraping Movie\scrapy-imdb\scraping_movies

You can start your first spider with:
    cd scraping_movies
    scrapy genspider example example.com


## Membuat file spiders baru

In [34]:
os.chdir('./scraping_movies/scraping_movies/spiders/')
!scrapy genspider movies https://www.imdb.com/chart/top/

Created spider 'movies' using template 'basic' in module:
  scraping_movies.spiders.movies


## Menjalankan Scraping

In [35]:
!scrapy crawl movies

2024-02-06 20:34:19 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scraping_movies)
2024-02-06 20:34:20 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1o  3 May 2022), cryptography 36.0.0, Platform Windows-10-10.0.19042-SP0
2024-02-06 20:34:20 [scrapy.addons] INFO: Enabled addons:
[]
2024-02-06 20:34:20 [asyncio] DEBUG: Using selector: SelectSelector
2024-02-06 20:34:20 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-02-06 20:34:20 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2024-02-06 20:34:20 [scrapy.extensions.telnet] INFO: Telnet Password: a805f05049349e20
2024-02-06 20:34:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.

## Import hasil scraping dalam bentuk csv

In [36]:
!scrapy crawl movies -o top_movies.csv

2024-02-06 20:35:29 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scraping_movies)
2024-02-06 20:35:29 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1o  3 May 2022), cryptography 36.0.0, Platform Windows-10-10.0.19042-SP0
2024-02-06 20:35:29 [scrapy.addons] INFO: Enabled addons:
[]
2024-02-06 20:35:29 [asyncio] DEBUG: Using selector: SelectSelector
2024-02-06 20:35:29 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-02-06 20:35:29 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2024-02-06 20:35:29 [scrapy.extensions.telnet] INFO: Telnet Password: 78f8384f21ee40ae
2024-02-06 20:35:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.

# Web Scraping dengan Selenium

In [37]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
service = Service(executable_path=r'./chromedriver_win32/chromedriver.exe')
options = webdriver.ChromeOptions()
options.add_argument('--headless') # menggunakan chrome tanpa
GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(service=service, options=options)

ModuleNotFoundError: No module named 'selenium'