## Ejercicio Web Scraping bs4/Selenium

_**url** = https://www.20minutos.es/_

Vamos a hacer Web Scraping de las primeras 3 páginas de una de las siguientes categorías de noticias: **Ciencia**, **Deporte**, **Gente**, **Economía**, **Gastronomía** y **Opinión**.

Y de cada noticia/artículo vamos a obtener:

- **Título**
- **Hora**
- **Fecha**
- **Autor**
- **Texto completo**
- **Categoría de la noticia**

Genera un DataFrame con esta información y guárdalo en el archivo **`20minutos_1.csv`**.

In [88]:
import requests
import pandas as pd

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

from time import sleep

In [2]:
import selenium

selenium.__version__

'4.18.1'

In [71]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [72]:
browser.get("https://www.20minutos.es/")

browser.maximize_window()

In [73]:
# Click the element carrying both the "mrf-button" and "accept" classes
# (presumably the cookie-consent button -- confirm against the live page).
# Written as an explicit CSS selector because the value is a compound class.
browser.find_element(By.CSS_SELECTOR, ".mrf-button.accept").click()

In [74]:
browser.find_element(by = By.ID, value = "ui-toggle-menu").click()

In [75]:
browser.find_element(by = By.PARTIAL_LINK_TEXT, value = "Deportes").click()

In [76]:
# Sacar las urls...

In [77]:
soup = BeautifulSoup(browser.page_source, "html.parser")

In [78]:
articulos = soup.find_all("article")

In [79]:
# Collect one URL per <article>: the first anchor that actually has an href.
# Articles without such a link are skipped instead of raising on a missing
# tag or attribute.
urls = [
    enlace["href"]
    for enlace in (x.find("a", href=True) for x in articulos)
    if enlace is not None
]

In [80]:
urls[0]

'https://www.20minutos.es/deportes/noticia/5222417/0/pedro-sanchez-ilia-topuria-dni-espanol-deseo/'

In [84]:
def obtener_info_noticia(noticia):
    """Extract the main fields of an article page.

    :param noticia: Parsed page (e.g. a ``BeautifulSoup`` object) exposing
        ``find(name, **filters)``.
    :return: Dict with the keys ``titulo``, ``fecha``, ``autor``,
        ``categoria`` and ``texto``. Each value is the raw ``.text`` of the
        matching element, or ``None`` when the element is not on the page, so
        one incomplete article does not abort a whole scraping run.
    """
    def texto_de(*pasos):
        # Follow the chain of find() calls, stopping at the first miss.
        nodo = noticia
        for etiqueta, filtros in pasos:
            nodo = nodo.find(etiqueta, **filtros)
            if nodo is None:
                return None
        return nodo.text

    return {
        "titulo": texto_de(("h1", {"class_": "article-title"})),
        "fecha": texto_de(("span", {"class_": "article-date"})),
        "autor": texto_de(("span", {"class_": "article-author"}), ("strong", {})),
        "categoria": texto_de(("div", {"class_": "category-title"}), ("a", {})),
        "texto": texto_de(("div", {"class_": "article-text"})),
    }

In [85]:
# Visit the first five collected article URLs and gather one info dict each.
data = []
for url in urls[:5]:
    browser.get(url)
    # Fixed 2-second pause before reading the page source (presumably to let
    # the page finish rendering -- an explicit wait would be more reliable).
    sleep(2)
    # Parse the HTML currently loaded in the browser.
    soup = BeautifulSoup(browser.page_source, "html.parser")
    info_noticia = obtener_info_noticia(soup)
    data.append(info_noticia)

In [89]:
pd.DataFrame(data)

Unnamed: 0,titulo,fecha,autor,categoria,texto
0,Pedro Sánchez le concede a Ilia Topuria su gra...,27.02.2024 - 13:31h,R. R. Z.,Deportes,\nPedro Sánchez recibió este martes a Ilia Top...
1,Qué necesita Ilia Topuria para obtener el DNI ...,27.02.2024 - 13:59h,Desirée Redondo,Deportes,"\n""Mi sueño sería conseguir mi DNI español, ma..."
2,Pedro Sánchez mantiene un cordial encuentro co...,27.02.2024 - 13:42h,20minutos,Deportes,\nPedro Sánchez ha recibido este martes al luc...
3,El otro objetivo del Emir de Qatar en la cena ...,27.02.2024 - 09:48h,20minutos,Fútbol,\nEl futuro de Kylian Mbappé parece ir definié...
4,"Zidane... ¿a la espera de una buena oferta?: ""...",27.02.2024 - 10:27h,20minutos,Fútbol,\nHan pasado ya casi tres años desde que Zined...


- **Escribe el código para sacar la información de las primeras 5 páginas de las 6 categorías anteriores. Guarda esta información en el archivo `20minutos_2.csv`**

In [93]:
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [94]:
url = "https://www.kayak.es/"

In [95]:
browser.get(url)

In [97]:
browser.find_element(by = By.CLASS_NAME, value = "RxNS").click()

In [107]:
browser.find_element(by = By.CLASS_NAME, value = "yWJT-new-nav-ux").click()

In [108]:
browser.find_element(by = By.PARTIAL_LINK_TEXT, value = "Coches").click()

In [109]:
browser.find_element(by = By.CLASS_NAME, value = "NhpT-mod-theme-search").click()

In [116]:
buscador = browser.find_element(by = By.CLASS_NAME, value = "NhpT-mod-theme-search")

In [117]:
buscador.send_keys("Madrid")

In [118]:
browser.find_element(by = By.CLASS_NAME, value = "A_8a-icon").click()

In [135]:
# Move the driver's focus to the second entry of window_handles.
# NOTE(review): index 1 is hard-coded; presumably that is the tab opened by
# the search click -- confirm, since other tabs may also be open.
browser.switch_to.window(browser.window_handles[1])

In [137]:
soup = BeautifulSoup(browser.page_source, "html.parser")

In [138]:
coches = soup.find_all("div", class_ = "jo6g")

In [142]:
coches[0].find("div", class_ = "MseY-title js-title").text

'Ford Fiesta'

In [145]:
browser.window_handles

['4911EE495E310883E2A90FAA77033F3A',
 'A9C2224D94690E91E50EE838C1CD8FD9',
 'CF18E4F7CB1944623EE3A621D253F4E6',
 'E7A8C016B842E3A846FD66166B9C48C4',
 '139791889F9E6FF851C9B13AD587F409']

### Youtube

In [154]:
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [155]:
browser.get("https://www.youtube.com/")

In [157]:
# Click a button inside the ytd-consent-bump-v2-lightbox dialog.
# NOTE(review): this is an absolute XPath, so any change in the dialog's
# layout breaks it; presumably it targets the consent "accept" button -- confirm.
browser.find_element(by = By.XPATH, value = "/html/body/ytd-app/ytd-consent-bump-v2-lightbox/tp-yt-paper-dialog/div[4]/div[2]/div[6]/div[1]/ytd-button-renderer[2]/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]").click()

In [158]:
buscador = browser.find_element(by = By.NAME, value = "search_query")

In [159]:
buscador.send_keys("tennis")

In [160]:
buscador.submit()

In [161]:
soup = BeautifulSoup(browser.page_source, "html.parser")

In [163]:
attrs = {"id": "video-title", "class" : "ytd-video-renderer"}

In [165]:
videos = soup.find_all("a", attrs = attrs)

In [167]:
videos[0]['href']

'/watch?v=ZfHd7DL2vxA&pp=ygUGdGVubmlz'

In [172]:
video_urls = ["https://www.youtube.com" + video['href'] for video in videos]

In [175]:
# Drop Shorts links. Match the "/shorts/" path segment rather than the bare
# substring "short", which could also appear inside a video id or query string.
video_urls = [video_url for video_url in video_urls if "/shorts/" not in video_url]

In [198]:
# Visit the first five video URLs and collect one row per video:
# [title, views, date, channel name, subscriber count].
data = []
for video_url in video_urls[:5]:
    print(f"Analizando página {video_url}")
    browser.get(url=video_url)
    sleep(2)
    # Click the element with id "expand" (presumably the description expander,
    # so that the view count and date are rendered -- confirm).
    browser.find_element(by = By.ID, value = "expand").click()
    sleep(2)
    soup = BeautifulSoup(browser.page_source, "html.parser")

    titulo = soup.find("h1", class_ = "style-scope ytd-watch-metadata").find("yt-formatted-string").text

    # Text of every <span> inside #info-container, keeping only strings longer
    # than one character.
    info = [x.text for x in soup.find("div", id = "info-container").find_all("span")]
    info = [x for x in info if len(x) > 1]
    # Requires exactly two remaining strings; any other count raises ValueError.
    views, fecha = info

    nombre_canal = soup.find("ytd-channel-name").find("a").text
    # The subscriber count is optional: None when the element is not found.
    subscribers = soup.find("yt-formatted-string", id = "owner-sub-count")
    subscribers = subscribers.text if subscribers else None

    data.append([titulo, views, fecha, nombre_canal, subscribers])

Analizando página https://www.youtube.com/watch?v=ZfHd7DL2vxA&pp=ygUGdGVubmlz
Analizando página https://www.youtube.com/watch?v=mkOwGS3zjnE&pp=ygUGdGVubmlz
Analizando página https://www.youtube.com/watch?v=-CgtDxipLkY&pp=ygUGdGVubmlz
Analizando página https://www.youtube.com/watch?v=rF83nYVOFKo&pp=ygUGdGVubmlz
Analizando página https://www.youtube.com/watch?v=1cbGeafJIXY&pp=ygUGdGVubmlz


In [201]:
pd.DataFrame(data, columns = ["Titulo", "Views", "Fecha", "Nombre_canal", "Subscribers"])

Unnamed: 0,Titulo,Views,Fecha,Nombre_canal,Subscribers
0,"Shelton Takes On Evans; Fritz, De Minaur & Mor...","134,363 views","Feb 27, 2024",Tennis TV,1.43M subscribers
1,"Medvedev Returns To Tour; Khachanov, Mensik & ...","14,648 views","Feb 27, 2024",Tennis TV,1.43M subscribers
2,Daniil Medvedev First Match Since Australian O...,"21,761 views","Feb 27, 2024",Tennis TV,1.43M subscribers
3,"Diaz Acosta faces Cachin; Moutet, Coria & More...","35,935 views","Feb 27, 2024",Tennis TV,1.43M subscribers
4,Elena Rybakina Vs Elina Svitolina - EPIC Unsto...,"7,548 views",7 hours ago,Tennis Fedose,1.54K subscribers


In [208]:
from typing import Dict, List, Any

In [222]:
class Mensajes:
    """Dispatch message dicts to a handler chosen by their ``"action"`` value."""

    def process_message(self, data: Dict[str, Any]) -> None:
        """
        Route a message to the handler registered for its action.

        Prints ``"fail"`` when the action is missing, is not a string, or has
        no registered handler.

        :param data: The data received in the message; its ``"action"`` value
            selects the handler.
        """
        action_handlers = {
            "add-provider": self.handle_add_provider,
            "clear-providers": self.handle_clear_providers,
            "clear-prices": self.handle_clear_prices,
        }

        action = data.get("action")

        # Resolve the handler first: a missing or unknown action must reach
        # the "fail" branch instead of raising during the name-based lookup.
        handler = action_handlers.get(action) if isinstance(action, str) else None

        if handler is None:
            print("fail")
            return

        # Name-based lookup of the same method, printed for comparison with
        # the table lookup. Safe here because the action is a known string.
        print(getattr(self, f"handle_{action.replace('-', '_')}"))

        handler(data)

    def handle_add_provider(self, data: Dict[str, Any]) -> None:
        """
        Handle the "add-provider" action; currently only prints a notice.

        :param data: The data specifying the provider to add (unused for now).
        """
        print("new provider")

    def handle_clear_providers(self, data: Dict[str, Any]) -> None:
        """
        Handle the "clear-providers" action; currently only prints a notice.

        :param data: The message data (unused for now).
        """
        print("clear provider")

    def handle_clear_prices(self, data: Dict[str, Any]) -> None:
        """
        Handle the "clear-prices" action; currently only prints a notice.

        :param data: The message data (unused for now).
        """
        print("clear prices")

In [223]:
m = Mensajes()

In [224]:
m.process_message(data = {"action": "add-provider"})

<bound method Mensajes.handle_add_provider of <__main__.Mensajes object at 0x7efbb8798ee0>>
new provider


In [225]:
a = [1,2,7]

In [226]:
a.remove(7)

In [227]:
a

[1, 2]