# Exercici 1

Realitza web scraping d'una pàgina de la borsa de Madrid (https://www.bolsamadrid.es) utilitzant BeautifulSoup i Selenium.

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

### Web Scraping usando BeautifulSoup

In [2]:
URL = "https://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
table = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})

In [4]:
matrix_row = []
matrix = []
columns_name = []
header_row = True
for row in table.find_all("tr"):
    if header_row:
        header_row = False
        for column in row.find_all('th'):
            columns_name.append(column.text)
    else:
        matrix_row = []
        for column in row.find_all('td'):
                if column.text.strip() == '-':
                    matrix_row.append('0')
                else:
                    matrix_row.append(column.text)
        matrix.append(matrix_row)

In [5]:
df = pd.DataFrame(matrix, columns =columns_name)
df

Unnamed: 0,Nombre,Anterior,Último,% Dif.,Máximo,Mínimo,Fecha,Hora,% Dif.Año 2021
0,IBEX 35®,"8.840,90","8.402,70",-496,"8.592,30","8.386,90",26/11/2021,17:38:00,407
1,IBEX 35® con Dividendos,"27.187,90","25.846,10",-494,"26.410,30","25.797,60",26/11/2021,17:38:00,653
2,IBEX MEDIUM CAP®,"13.411,60","12.992,60",-312,"13.193,00","12.992,60",26/11/2021,17:38:00,218
3,IBEX SMALL CAP®,"8.256,10","7.862,50",-477,"8.150,50","7.839,00",26/11/2021,17:38:00,-291
4,IBEX 35® Bancos,46980,43520,-736,45250,43520,26/11/2021,17:38:00,1571
...,...,...,...,...,...,...,...,...,...
73,Índice ITX Inverso X3,17370,19940,1480,20340,18140,26/11/2021,17:38:00,-6103
74,Índice TEF Inverso X5,"10.528,00","12.036,30",1433,"12.801,40","11.480,50",26/11/2021,17:38:00,525
75,Índice SAN Inverso X5,"4.040,00","5.838,10",4451,"5.838,10","4.932,40",26/11/2021,17:38:00,97799
76,Índice BBVA Inverso X5,"8.734,00","11.933,10",3663,"11.933,10","10.196,30",26/11/2021,17:38:00,83103


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Nombre          78 non-null     object
 1   Anterior        78 non-null     object
 2   Último          78 non-null     object
 3   % Dif.          78 non-null     object
 4   Máximo          78 non-null     object
 5   Mínimo          78 non-null     object
 6   Fecha           78 non-null     object
 7   Hora            78 non-null     object
 8   % Dif.Año 2021  78 non-null     object
dtypes: object(9)
memory usage: 5.6+ KB


In [7]:
df.to_csv('bolsa_beautiful_soup.csv')

### Web Scriaping usando Selenium

In [8]:
browser = Firefox()
browser.get('https://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx')
search_form = browser.find_element_by_id('ctl00_Contenido_tblÍndices')

In [9]:
rows = search_form.find_elements_by_tag_name("tr")
selenium_matrix_row = []
selenium_matrix = []
selenium_columns_name = []
header_row = True
for row in rows:
    if header_row:
        header_row = False
        for column in row.find_elements_by_tag_name('th'):
            selenium_columns_name.append(column.text)
    else:
        selenium_matrix_row = []
        for column in row.find_elements_by_tag_name('td'):
            if column.text.strip() == '-':
                selenium_matrix_row.append('0')
            else:
                selenium_matrix_row.append(column.text)
        selenium_matrix.append(selenium_matrix_row)

In [10]:
selenium_df = pd.DataFrame(selenium_matrix, columns =selenium_columns_name)
selenium_df

Unnamed: 0,Nombre,Anterior,Último,% Dif.,Máximo,Mínimo,Fecha,Hora,% Dif.\nAño 2021
0,IBEX 35®,"8.840,90","8.402,70",-496,"8.592,30","8.386,90",26/11/2021,17:38:00,407
1,IBEX 35® con Dividendos,"27.187,90","25.846,10",-494,"26.410,30","25.797,60",26/11/2021,17:38:00,653
2,IBEX MEDIUM CAP®,"13.411,60","12.992,60",-312,"13.193,00","12.992,60",26/11/2021,17:38:00,218
3,IBEX SMALL CAP®,"8.256,10","7.862,50",-477,"8.150,50","7.839,00",26/11/2021,17:38:00,-291
4,IBEX 35® Bancos,46980,43520,-736,45250,43520,26/11/2021,17:38:00,1571
...,...,...,...,...,...,...,...,...,...
73,Índice ITX Inverso X3,17370,19940,1480,20340,18140,26/11/2021,17:38:00,-6103
74,Índice TEF Inverso X5,"10.528,00","12.036,30",1433,"12.801,40","11.480,50",26/11/2021,17:38:00,525
75,Índice SAN Inverso X5,"4.040,00","5.838,10",4451,"5.838,10","4.932,40",26/11/2021,17:38:00,97799
76,Índice BBVA Inverso X5,"8.734,00","11.933,10",3663,"11.933,10","10.196,30",26/11/2021,17:38:00,83103


In [11]:
selenium_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Nombre           78 non-null     object
 1   Anterior         78 non-null     object
 2   Último           78 non-null     object
 3   % Dif.           78 non-null     object
 4   Máximo           78 non-null     object
 5   Mínimo           78 non-null     object
 6   Fecha            78 non-null     object
 7   Hora             78 non-null     object
 8   % Dif.
Año 2021  78 non-null     object
dtypes: object(9)
memory usage: 5.6+ KB


In [12]:
selenium_df.to_csv('bolsa_selenium.csv')

In [13]:
browser.close()

# Exercici 3

Tria una página web que tu vulguis i realitza web scraping mitjançant la llibreria Scrapy. 

El siguiente Web Scraping se encarga de leer la tabla de posiciones de La Liga y exportar los resultados en un fichero CSV.

In [14]:
import scrapy
import csv
import logging
from scrapy.item import Item, Field
from scrapy.crawler import CrawlerProcess

In [15]:
# Clase que define lo datos de la tabla a exportor
class Tabla(Item):
    equipo = Field()
    puntos = Field()
    partidos_jugados = Field()
    partidos_ganados = Field()
    partidos_empatados = Field()
    partidos_perdidos = Field()
    goles_favor = Field()
    goles_contra = Field()
    diferencia_gol = Field()

# Creamos el pipeline que se en encargará de exportar los datos al fichero CSV
class CSVWriterPipeline(object):
    def open_spider(self, spider):
        self.writeHeader = True
        self.file = open('LaLiga.csv', 'w')
        self.csvwriter = csv.writer(self.file)

    def process_item(self, item, spider):
        return item
    
    def close_spider(self, spider):
        self.file.close()

In [16]:
# Creamos nuestro Spider, donde definimos la url, y como será la lectura del codigo html
class PosicionesSpider(scrapy.Spider):
    name = "posiciones"
    start_urls = [
        'https://www.promiedos.com.ar/espana'
    ]
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CSVWriterPipeline': 1},
        'FEED_FORMAT':'csv',                             
        'FEED_URI': 'LaLiga.csv'                       
    }
    
    def parse(self, response):
        items = []
        for url in response.css('[id="posiciones"]').css('tbody').css('tr'):
            item = Tabla()
            item['equipo'] = url.css('td::text')[1].extract()
            item['puntos'] = url.css('td::text')[2].extract()
            item['partidos_jugados'] = url.css('td::text')[3].extract()
            item['partidos_ganados'] = url.css('td::text')[4].extract()
            item['partidos_empatados'] = url.css('td::text')[5].extract()
            item['partidos_perdidos'] = url.css('td::text')[6].extract()
            item['goles_favor'] = url.css('td::text')[7].extract()
            item['goles_contra'] = url.css('td::text')[8].extract()
            item['diferencia_gol'] = url.css('td::text')[8].extract()
            items.append(item)
        return items

In [17]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(PosicionesSpider)
process.start()

2021-11-27 15:54:30 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-11-27 15:54:30 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 21.7.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.18362-SP0
2021-11-27 15:54:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-11-27 15:54:30 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)



In [18]:
df = pd.read_csv('LaLiga.csv')
df.head()

Unnamed: 0,diferencia_gol,equipo,goles_contra,goles_favor,partidos_empatados,partidos_ganados,partidos_jugados,partidos_perdidos,puntos
0,14,Real Madrid,14,32,3,9,13,1,30
1,10,Real Sociedad,10,19,5,8,14,1,29
2,9,Sevilla,9,23,4,8,13,1,28
3,13,Atletico Madrid,13,22,5,7,13,1,26
4,17,Betis,17,22,3,7,14,4,24
