In [1]:
import requests
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [2]:
# Root of the monthly weather pages; a month slug is appended to it to
# address a specific month.
BASE_URL = "https://yandex.ru/pogoda/month/"

# Lower-case English month slugs in calendar order, one quarter per row.
MONTHS = [
    "january", "february", "march",
    "april", "may", "june",
    "july", "august", "september",
    "october", "november", "december",
]

In [3]:
def fetch_single_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    This is a best-effort helper: it never raises for ordinary failures and
    signals them with an empty string instead.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the notebook forever.

    Returns:
        The response text for a 2xx status code, or ``""`` when the request
        fails or the server answers with any other status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately broad: callers rely on "" rather than an exception
        # to detect a page that could not be fetched.
        return ""
    # Only the 2xx family counts as success.
    if response.status_code // 100 != 2:
        return ""
    return response.text


def parse_weather_page(html, month="month"):
    """Extract per-day temperature records from a monthly weather page.

    Calendar cells are selected with ``filter_tags``. Cells containing a
    ``climate-calendar-day_colorless_yes`` element are skipped (presumably
    days that do not belong to the displayed month - confirm against the
    page markup). Cells missing any of the expected day/temperature nodes
    are skipped as well instead of raising ``AttributeError``.

    Args:
        html: Page markup as a string. An empty string yields no records.
        month: Label stored in the ``"month"`` field of every record.

    Returns:
        A list of dicts with the keys ``"month"``, ``"day"``, ``"temp_day"``
        and ``"temp-night"``. Day and temperature values are the element
        text exactly as it appears in the page (strings, not numbers).
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []
    for cell in soup.find_all(filter_tags):
        if cell.find("div", {"class": "climate-calendar-day_colorless_yes"}):
            continue

        day_div = cell.find("div", {"class": "climate-calendar-day__day"})
        temp_day_div = cell.find("div", {"class": "climate-calendar-day__temp-day"})
        temp_night_div = cell.find("div", {"class": "climate-calendar-day__temp-night"})
        if day_div is None or temp_day_div is None or temp_night_div is None:
            # Incomplete cell: nothing reliable to record.
            continue

        temp_day_span = temp_day_div.find("span", {"class": "temp__value"})
        temp_night_span = temp_night_div.find("span", {"class": "temp__value"})
        if temp_day_span is None or temp_night_span is None:
            continue

        records.append({
            "month": month,
            "day": day_div.text,
            "temp_day": temp_day_span.text,
            # NOTE: hyphenated key (unlike "temp_day") is kept as-is so
            # existing consumers of these records keep working.
            "temp-night": temp_night_span.text,
        })
    return records


def filter_tags(tag):
    """Tell whether ``tag`` is a calendar cell that is not a header cell.

    Intended as a matcher for ``soup.find_all``: the result is truthy only
    when the tag's ``class`` attribute contains ``climate-calendar__cell``
    and does not contain ``climate-calendar__cell_header``.

    Args:
        tag: Object exposing ``get("class")``.

    Returns:
        The falsy ``class`` value itself when the attribute is missing or
        empty, otherwise a bool.
    """
    classes = tag.get("class")
    if not classes:
        # Hand back the falsy value (None or empty) unchanged.
        return classes
    if "climate-calendar__cell" not in classes:
        return False
    return "climate-calendar__cell_header" not in classes


def process_months(base_url, months):
    """Fetch and parse the weather page of every month in ``months``.

    Args:
        base_url: URL under which the month pages live. A trailing slash is
            added when missing, because ``urljoin`` would otherwise replace
            the last path segment instead of appending the month to it.
        months: Iterable of month slugs appended to ``base_url``.

    Returns:
        A flat list with the records of all months, in the order given.
        Each record's ``"month"`` field holds the capitalized slug. A month
        whose page cannot be fetched contributes no records.
    """
    base = base_url if base_url.endswith("/") else base_url + "/"
    records = []
    for slug in months:
        html = fetch_single_page(urljoin(base, slug))
        records.extend(parse_weather_page(html, slug.capitalize()))
    return records
        
    

In [4]:
# Smoke test on a single page: fetch the base URL (no month slug), parse it
# with the default "month" label, and preview the first three records.
html = fetch_single_page(BASE_URL)
month_records = parse_weather_page(html)
month_records[:3]

[{'month': 'month', 'day': '2', 'temp_day': '+26', 'temp-night': '+19'},
 {'month': 'month', 'day': '3', 'temp_day': '+27', 'temp-night': '+20'},
 {'month': 'month', 'day': '4', 'temp_day': '+24', 'temp-night': '+19'}]

In [5]:
# Scrape all twelve month pages; this issues one HTTP request per month.
year_records = process_months(BASE_URL, MONTHS)

In [6]:
# Tabulate the scraped records; as the last expression it is rendered as the
# cell output. Day and temperature columns hold the page text unchanged
# (strings, not numbers).
pd.DataFrame.from_records(year_records)

Unnamed: 0,month,day,temp_day,temp-night
0,January,1,−2,−6
1,January,2,−5,−6
2,January,3,−7,−9
3,January,4,−8,−9
4,January,5,−8,−11
...,...,...,...,...
330,December,27,−3,−4
331,December,28,−4,−4
332,December,29,−3,−6
333,December,30,−4,−5
