### Scrapy

In [1]:
import pandas as pd
import scrapy

In [2]:
from scrapy.crawler import CrawlerProcess


class StandingsSpider(scrapy.Spider):
    name = "standings"
    start_urls = ["https://www.formula1.com/en/results.html/2022/drivers.html" ]

    def parse(self, response):
          table = response.xpath('//*[contains(@class,"resultsarchive-table")]//tbody//tr')
      
          for standing in table:


            yield {
              'position': standing.xpath('td//text()')[0].extract(),
              'name': standing.xpath('td//text()')[3].extract(),
              'lastname': standing.xpath('td//text()')[5].extract(),
              'alias': standing.xpath('td//text()')[7].extract(),
              'country': standing.xpath('td//text()')[10].extract(),
              'car': standing.xpath('td//text()')[12].extract(),
              'points': standing.xpath('td//text()')[14].extract(),
            }
            

FILE_NAME = 'f1_standings_.csv'
SETTINGS = {
            'FEED_FORMAT': 'csv',
            'FEED_URI': FILE_NAME,
            'DOWNLOAD_DELAY': 1,
            } 
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'
})

2022-09-25 19:10:41 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2022-09-25 19:10:41 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 3.4.8, Platform Windows-10-10.0.19043-SP0


In [3]:
process = CrawlerProcess(SETTINGS)
process.crawl(StandingsSpider)
process.start()

2022-09-25 19:10:43 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2022-09-25 19:10:43 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 3.4.8, Platform Windows-10-10.0.19043-SP0
2022-09-25 19:10:43 [scrapy.crawler] INFO: Overridden settings:
{'DOWNLOAD_DELAY': 1}
2022-09-25 19:10:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-09-25 19:10:43 [scrapy.extensions.telnet] INFO: Telnet Password: e24525ac940f27a9
  exporter = cls(crawler)

2022-09-25 19:10:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-09-25 19:10:44 [scrapy.middleware] INFO: Enabl

2022-09-25 19:10:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.formula1.com/en/results.html/2022/drivers.html>
{'position': '22', 'name': 'Nico', 'lastname': 'Hulkenberg', 'alias': 'HUL', 'country': 'GER', 'car': 'Aston Martin Aramco Mercedes', 'points': '0'}
2022-09-25 19:10:45 [scrapy.core.engine] INFO: Closing spider (finished)
2022-09-25 19:10:45 [scrapy.extensions.feedexport] INFO: Stored csv feed (22 items) in: f1_standings_.csv
2022-09-25 19:10:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 253,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 35438,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 0.433751,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 9, 25, 17, 10, 45, 420299),
 'httpcompression/response_bytes': 222750,
 'httpcompression/respons

In [4]:
f1_standings_ = pd.read_csv('f1_standings_.csv')
f1_standings_

Unnamed: 0,position,name,lastname,alias,country,car,points
0,1,Max,Verstappen,VER,NED,Red Bull Racing RBPT,335
1,2,Charles,Leclerc,LEC,MON,Ferrari,219
2,3,Sergio,Perez,PER,MEX,Red Bull Racing RBPT,210
3,4,George,Russell,RUS,GBR,Mercedes,203
4,5,Carlos,Sainz,SAI,ESP,Ferrari,187
5,6,Lewis,Hamilton,HAM,GBR,Mercedes,168
6,7,Lando,Norris,NOR,GBR,McLaren Mercedes,88
7,8,Esteban,Ocon,OCO,FRA,Alpine Renault,66
8,9,Fernando,Alonso,ALO,ESP,Alpine Renault,59
9,10,Valtteri,Bottas,BOT,FIN,Alfa Romeo Ferrari,46


## Conclusions

- We used three of the most common libraries to scrape websites, **BeautifulSoup, Selenium and Scrapy**. We scraped different websites with different features to have a wider vision of what we can achieve using these libraries.


- The use of the libraries requires a minimum knowledge of HTML. 


- We have achieved the same result using the three libraries, however, Beautifulsoup has been more efficient since its handling (and learning curve) make it a more user-friendly tool. On another hand, Scrapy is much more advanced and it is mostly used in more complex projects.
