In [1]:
import csv
import datetime
import threading
from pathlib import Path
from time import sleep, time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def get_load_time(article_url, user_agent):
    """Measure how long an article URL takes to respond.

    Args:
        article_url: URL of the article to request.
        user_agent: value sent in the ``User-Agent`` request header.

    Returns:
        ``response.elapsed`` in seconds (a float) when the request succeeds,
        or the string ``">3"`` when it fails. The sentinel is returned for
        *any* exception (timeout, connection error, invalid URL, ...), not
        only when the 3-second timeout is exceeded.
    """
    # Override the default User-Agent, which would otherwise announce
    # that the request comes from Python code.
    headers = {"User-Agent": user_agent}
    try:
        # stream=True defers downloading the body. A streamed response keeps
        # its connection checked out until it is closed, so use the response
        # as a context manager to release it deterministically.
        with requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        ) as response:
            load_time = response.elapsed.total_seconds()
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the
        # constant marker instead of aborting the whole scrape.
        print(e)
        load_time = ">3"
    return load_time

In [3]:
# Serialises appends: write_to_file is called from several worker threads
# that all target the same CSV file.
_CSV_WRITE_LOCK = threading.Lock()


def write_to_file(output_list, filename, base_dir):
    """Append article records to a header-less CSV file.

    Args:
        output_list: iterable of dicts keyed by ``id``, ``load_time``,
            ``comments``, ``rank``, ``points``, ``title`` and ``url``.
        filename: name of the CSV file inside ``base_dir``.
        base_dir: directory that holds the CSV file.

    The file is opened in append mode and no header row is written. An empty
    ``output_list`` leaves the file system untouched.
    """
    if not output_list:
        return

    fieldnames = ["id", "load_time", "comments", "rank", "points", "title", "url"]
    target = Path(base_dir).joinpath(filename)

    with _CSV_WRITE_LOCK:
        # Open once for the whole batch instead of once per row.
        # newline="" is required by the csv module; without it the writer's
        # "\r\n" terminator is translated to "\r\r\n" on Windows.
        with open(target, "a", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerows(output_list)

In [4]:
def connect_to_base(browser, page_number):
    """Open a Hacker News listing page, retrying on failure.

    Args:
        browser: driver object exposing ``get(url)``.
        page_number: value substituted into the ``p`` query parameter.

    Returns:
        True as soon as the element with id ``hnmain`` is present on the
        page, False when all three attempts have failed.
    """
    max_attempts = 3
    target = "https://news.ycombinator.com/news?p={}".format(page_number)

    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        try:
            browser.get(target)
            # Wait up to 5 seconds for the element with id "hnmain"
            # to appear in the DOM.
            main_table_present = EC.presence_of_element_located((By.ID, "hnmain"))
            WebDriverWait(browser, 5).until(main_table_present)
        except Exception as e:
            print(e)
            print("Error connecting to {}.".format(target))
            print("Attempt #{}.".format(attempt))
        else:
            return True
    return False

In [5]:
def _comments_label(soup, article_id):
    """Return the comment-count label for a story, e.g. ``"12 comments"``.

    Looks at every link pointing to ``item?id=<article_id>`` except the first
    one (the lookup this replaces also started at the second match) and
    returns the first label containing "comment". A "discuss" label, or no
    usable label at all, yields ``"0 comments"``.
    """
    links = soup.find_all(href=f"item?id={article_id}")
    for link in links[1:]:
        label = link.string
        if label == "discuss":
            return "0 comments"
        if label and "comment" in label:
            return label
    return "0 comments"


def parse_html(html, user_agent):
    """Extract one record per story from a Hacker News listing page.

    Args:
        html: page source of the listing page.
        user_agent: forwarded to ``get_load_time`` for the per-article request.

    Returns:
        A list of dicts with the keys ``id``, ``load_time``, ``comments``,
        ``rank``, ``points``, ``title`` and ``url``. Rows that lack the
        expected title link are reported and skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    output_list = []

    # Every story is a <tr class="athing"> row carrying the article id.
    for tr in soup.find_all("tr", class_="athing"):
        article_id = tr.get("id")

        # NOTE(review): the title link is taken positionally (second <a> in
        # the row). The "from?site=" handling below suggests this sometimes
        # lands on a different link - confirm against the current markup.
        anchors = tr.find_all("a")
        if len(anchors) < 2 or not anchors[1].get("href"):
            # Skip the malformed row instead of losing the whole page to an
            # IndexError/KeyError.
            print(f"Skipping row {article_id}: title link not found")
            continue
        title_link = anchors[1]
        article_url = title_link["href"]

        # Some stories live on Hacker News itself; their href is relative
        # (e.g. "item?id=200933") and must be expanded to a full URL.
        # Test the prefix only, so an absolute URL that merely contains
        # "item?id=" is left unchanged.
        if article_url.startswith(("item?id=", "from?site=")):
            article_url = f"https://news.ycombinator.com/{article_url}"
        load_time = get_load_time(article_url, user_agent)

        # A story may have no score element; fall back to "0 points".
        score_tag = soup.find(id=f"score_{article_id}")
        score = score_tag.string if score_tag is not None else "0 points"

        rank_tag = tr.span
        rank = rank_tag.string if rank_tag is not None else None

        output_list.append(
            {
                "id": article_id,
                "load_time": load_time,
                "comments": _comments_label(soup, article_id),
                "rank": rank,
                "points": score,
                # Read the title from the anchor already located rather than
                # searching the whole document for the same href again.
                "title": title_link.get_text(),
                "url": article_url,
            }
        )
    return output_list

In [6]:
from concurrent.futures import ThreadPoolExecutor, wait

# Wrap the scraping of a single listing page in a function
def run_process(page_number, filename):
    """Scrape one listing page and append its articles to the CSV file.

    Starts a dedicated Chrome instance, loads the page, parses it and writes
    the result. Reads the module-level ``driver_path``, ``user_agent`` and
    ``base_dir`` settings.

    Args:
        page_number: listing page to load.
        filename: name of the CSV file to append to.
    """
    browser = webdriver.Chrome(
        service=ChromeService(executable_path=driver_path)
    )
    try:
        if connect_to_base(browser, page_number):
            # NOTE(review): fixed 5 s pause after the page is reported ready;
            # presumably lets the page finish rendering - confirm it is needed.
            sleep(5)
            output_list = parse_html(browser.page_source, user_agent)
            write_to_file(output_list, filename, base_dir)
        else:
            print("Error connecting to hacker news")
    finally:
        # Always shut the browser down, even when parsing or writing raises;
        # otherwise the Chrome process is left running.
        browser.quit()
       
filename = "articles_info.csv" # name of the file the results are saved to
driver_path = 'C:/Users/miron/chromedriver-win64/chromedriver.exe'
base_dir= "C:/Users/miron//parse/" # directory the file is saved to
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" # your user-agent; look it up at https://юзерагент.рф using the Chrome browser
start_time = time() # program start time

futures = []

# Run the scraping on several threads at once.
# NOTE(review): pages are requested as p=0..9 - confirm whether the listing
# is 1-based, in which case p=0 may duplicate or miss a page.
with ThreadPoolExecutor() as executor:
    for number in range(10):
        futures.append(
            executor.submit(run_process, number, filename)
        )

# Leaving the "with" block has already waited for every task to finish.
# An exception raised inside a worker is stored on its future and would
# otherwise never be shown, so report failures explicitly.
for number, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print("Page {} failed: {!r}".format(number, error))

end_time = time()
elapsed_time = end_time - start_time
print("Elapsed run time: {} seconds".format(elapsed_time))


Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF73BDD52A2+57122]
	(No symbol) [0x00007FF73BD4EA92]
	(No symbol) [0x00007FF73BC1E3AB]
	(No symbol) [0x00007FF73BC57D3E]
	(No symbol) [0x00007FF73BC57E2C]
	(No symbol) [0x00007FF73BC90B67]
	(No symbol) [0x00007FF73BC7701F]
	(No symbol) [0x00007FF73BC8EB82]
	(No symbol) [0x00007FF73BC76DB3]
	(No symbol) [0x00007FF73BC4D2B1]
	(No symbol) [0x00007FF73BC4E494]
	GetHandleVerifier [0x00007FF73C07EF82+2849794]
	GetHandleVerifier [0x00007FF73C0D1D24+3189156]
	GetHandleVerifier [0x00007FF73C0CACAF+3160367]
	GetHandleVerifier [0x00007FF73BE66D06+653702]
	(No symbol) [0x00007FF73BD5A208]
	(No symbol) [0x00007FF73BD562C4]
	(No symbol) [0x00007FF73BD563F6]
	(No symbol) [0x00007FF73BD467A3]
	BaseThreadInitThunk [0x00007FFE29F326AD+29]
	RtlUserThreadStart [0x00007FFE2B04AA68+40]

Error connecting to https://news.ycombinator.com/news?p=0.
Attempt #1.
Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF73BDD52A2+57122]
	(No symbol) [0x00007FF73BD4EA92]
	(

In [7]:
import pandas as pd

# The CSV has no header row, so the column names are supplied here, in the
# same order the rows were written.
articles_data = pd.read_csv(
    filepath_or_buffer='./parse/articles_info.csv',
    encoding='cp1252',
    names=[
        "id",
        "load_time",
        "comments",
        "rank",
        "points",
        "title",
        "url",
    ],
)


In [8]:
# Show 50 randomly chosen rows of the scraped data.
articles_data.sample(n=50)

Unnamed: 0,id,load_time,comments,rank,points,title,url
238,37332303,1.104736,2 comments,33.0,13 points,The Story of Carrene (1935),https://pdfhost.io/v/oxZ.mlWg9_The_Story_of_Ca...
202,37340010,0.294847,296 comments,25.0,482 points,"When your coworker does great work, tell their...",https://jvns.ca/blog/2020/07/14/when-your-cowo...
243,37319461,0.315268,3 comments,38.0,36 points,Mimicking natural selection in chemical systems,https://www.nature.com/articles/s41570-019-0155-6
177,37306619,1.399054,14 comments,28.0,58 points,Lego Ideas: Insect Collection – Botanicals mee...,https://www.brothers-brick.com/2023/08/28/lego...
51,37314073,0.496739,83 comments,112.0,341 points,"Meta AI releases CoTracker, a model for tracki...",https://co-tracker.github.io/
208,37301991,0.402683,31 comments,243.0,190 points,WebLLM: Llama2 in the Browser,https://webllm.mlc.ai/
164,37342548,0.947291,7 comments,15.0,26 points,"The low, low cost of committing cybercrime",https://isc.sans.edu/diary/0
249,37333404,0.594811,146 comments,44.0,306 points,UTM – Virtual Machines for iOS and macOS,https://github.com/utmapp/UTM
275,37328750,0.30256,3 comments,160.0,20 points,Solid-body trajectoids shaped to roll along de...,https://www.nature.com/articles/s41586-023-063...
219,37327173,0.313573,21 comments,254.0,55 points,You Are Not the Man in the Arena,https://wheresyoured.at/p/you-are-not-the-man-...
