# Сбор исторических погодных данных  
Скрипт состоит из нескольких частей:  
* Парсинг названий городов России с Википедии  
* Добавление координат  
* Парсинг исторических погодных данных с open-meteo по городам  
* Объединение в один датафрейм для дальнейшего анализа

In [None]:
!pip install geopy
from geopy.geocoders import Nominatim

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas
from retry_requests import retry
import openmeteo_requests
import requests_cache

from google.colab import files
import glob

## Парсинг названий городов России с Википедии и добавление координат

In [None]:
# URL of the Russian Wikipedia page with the list of cities of Russia
URL = "https://ru.wikipedia.org/wiki/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%B3%D0%BE%D1%80%D0%BE%D0%B4%D0%BE%D0%B2_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B8"

# Download the page. The timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from parsing an HTTP error page as if it were the list.
req = requests.get(URL, timeout=30)
req.raise_for_status()

soup = BeautifulSoup(req.text, 'html.parser')

# Take the first table carrying any of the listed CSS classes
table = soup.find('table', class_=['standard', 'sortable', 'jquery-tablesorter'])
if table is None:
    raise RuntimeError('City table was not found on the Wikipedia page')

cities = []

# Walk the table rows; rows without at least three <td> cells are skipped
for row in table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) > 2:
        # The third cell is taken as the city name; only the text before the
        # first '[' is kept (presumably to drop footnote markers such as "[1]")
        city = cols[2].get_text(strip=True).split('[')[0]
        cities.append(city)

# Collect the names into a single-column DataFrame (nothing is written to disk here)
df = pd.DataFrame(cities, columns=['city'])

print(f'Сохранено {len(cities)} городов')


Сохранено 1125 городов


In [None]:
print(df)

           city
0         Абаза
1        Абакан
2      Абдулино
3        Абинск
4       Агидель
...         ...
1120  Ярославль
1121     Ярцево
1122  Ясногорск
1123      Ясный
1124     Яхрома

[1125 rows x 1 columns]


Так как встречаются города с одинаковыми названиями, находящиеся в разных регионах, сейчас удалим повторы, а информацию по таким городам скачаем отдельно.

In [None]:
# Remove repeated city names; drop_duplicates() keeps the first row of each duplicate group
df = df.drop_duplicates()
# Number of rows left after de-duplication (displayed as the cell output)
df.shape[0]

1106

Добавляем координаты (широту и долготу)

In [None]:
# Nominatim geocoding client shared by every lookup below
geolocator = Nominatim(user_agent="geo_cities_app")


def get_coordinates(city):
    """Look up a city and return its ``(latitude, longitude)``.

    The query sent to the geocoder is ``"<city>, Россия"`` with a 10-second
    timeout. When the geocoder returns nothing, ``(None, None)`` is returned
    so the caller can still unpack two values.
    """
    location = geolocator.geocode(f"{city}, Россия", timeout=10)
    if not location:
        return None, None
    return location.latitude, location.longitude


# Geocode every city, one request at a time
coordinates = []
for city in df['city']:
    coordinates.append(get_coordinates(city))
    time.sleep(1)  # pause between requests so the geocoding API does not block us

# Split the (latitude, longitude) pairs into two parallel lists
latitudes = [pair[0] for pair in coordinates]
longitudes = [pair[1] for pair in coordinates]

# Attach both lists to the dataframe as new columns
df['latitude'] = latitudes
df['longitude'] = longitudes

In [None]:
# Write the city/coordinate table to CSV without the index column.
# 'utf-8-sig' prepends a UTF-8 byte-order mark (presumably so spreadsheet
# tools detect the encoding of the Cyrillic names -- confirm).
df.to_csv('coordinates_cities.csv', index=False, encoding='utf-8-sig')
# Preview the first rows as the cell output
df.head()

Unnamed: 0,city,latitude,longitude
0,Абаза,52.651055,90.101159
1,Абакан,53.72068,91.440602
2,Абдулино,53.6828,53.655701
3,Абинск,44.864953,38.157819
4,Агидель,55.898963,53.934191


## Парсинг исторических погодных данных с open-meteo по городам

Над проектом работаем в группах, поэтому необходимо сделать список городов для скачивания.

In [None]:
# Load the file that distributes the cities among the participants
# (the following cells read its `user` and `name` columns)
data_1 = pd.read_csv('/content/cityes_2.csv')

In [None]:
# Keep only the city names assigned to this user
data_1 = data_1.loc[data_1['user'] == "@Ekaterina_Smurova", 'name']

In [None]:
# Rows of the coordinate table whose city is in the user's list
data_2 = df[df['city'].isin(data_1)]

Выгружаем погодные данные

In [None]:
# Endpoint of the Open-Meteo archive API
url = "https://archive-api.open-meteo.com/v1/archive"

# Open-Meteo client built on a cached HTTP session that retries failed requests.
# expire_after=-1 is passed so cached responses are kept without expiry.
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Fetch the required weather variables for one city and one year
def load(cityname: str, year: int, lat: float, lon: float):
    """Download 6-hourly archive weather data for one location and one year.

    Uses the module-level ``openmeteo`` client and ``url`` endpoint.

    Args:
        cityname: Name written into the ``city`` column of the result.
        year: Calendar year to request. The range is 1 January to
            31 December, except for 2025, which ends on 2025-05-31.
        lat: Latitude passed to the API.
        lon: Longitude passed to the API.

    Returns:
        A ``pandas.DataFrame`` with a UTC ``date`` column, the ``city``
        column and one column per requested weather variable.
    """
    # Single source of truth for the requested variables. Values are read
    # back by position, so index i below must correspond to requested[i].
    requested = [
        "temperature_2m", "snow_depth", "snowfall", "rain", "precipitation",
        "relative_humidity_2m", "wind_speed_100m", "wind_direction_100m", "is_day",
    ]
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": f'{year}-01-01',
        "end_date": '2025-05-31' if year == 2025 else f'{year}-12-31',
        "hourly": requested,
        "temporal_resolution": "hourly_6",
    }
    responses = openmeteo.weather_api(url, params=params)

    # A single coordinate pair is sent, so only the first response is used
    response = responses[0]

    # Pull each variable out of the hourly section by its position
    hourly = response.Hourly()
    series = {
        name: hourly.Variables(position).ValuesAsNumpy()
        for position, name in enumerate(requested)
    }

    # Rebuild the timestamps from the start, end and step reported by the API
    hourly_data = {"date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )}

    # Use the function argument here. The previous code read a module-level
    # global named `city`, which only worked when the caller's loop variable
    # happened to have that name.
    hourly_data['city'] = cityname

    # Column order of the resulting frame (and of the CSV files written from it)
    column_order = (
        "temperature_2m", "relative_humidity_2m", "rain", "snowfall",
        "snow_depth", "is_day", "precipitation",
        "wind_direction_100m", "wind_speed_100m",
    )
    for column in column_order:
        hourly_data[column] = series[column]

    return pd.DataFrame(data=hourly_data)

iter_cnt = 0
iter_lim = 10  # maximum number of API downloads per run

# One CSV file per (city, year). Files that already exist are not downloaded again.
for index, row in data_2.iterrows():
    city, lat, lon = row['city'], row['latitude'], row['longitude']
    for year in range(1975, 2026):
        filename = f'{city}_{year}.csv'

        # Try to reuse a previously saved file. Only file-reading failures are
        # treated as "not available": OSError covers a missing or unreadable
        # file, ValueError covers pandas' empty-file and parser errors as well
        # as decoding errors. A bare `except:` would also swallow
        # KeyboardInterrupt and turn a manual interrupt into a new download.
        try:
            frame = pd.read_csv(filename)
        except (OSError, ValueError):
            frame = None

        if frame is None or frame.shape[1] == 0:
            iter_cnt += 1
            if iter_cnt > iter_lim:
                # Download budget is used up. This leaves only the year loop:
                # the next city is still visited, so its existing files keep
                # being checked for a missing `city` column.
                break
            frame = load(city, year, lat, lon)
            frame['city'] = city
            frame.to_csv(filename, index=False)
        elif 'city' not in frame.columns:
            # File saved without the city name: add the column and rewrite it
            frame['city'] = city
            frame.to_csv(filename, index=False)

Coordinates 59.9296989440918°N 60.0°E
Elevation 228.0 m asl
Timezone NoneNone
Timezone difference to GMT+0 0 s


In [None]:
# Spot-check one downloaded file: read it back and preview the first 10 rows
data = pd.read_csv('/content/Адыгейск_1975.csv')
data.head(10)

Unnamed: 0,date,city,temperature_2m,relative_humidity_2m,rain,snowfall,snow_depth,is_day,precipitation,wind_direction_100m,wind_speed_100m
0,1975-01-01 00:00:00+00:00,Адыгейск,1.683,98.93221,1.4,0.07,0.0,0.0,1.5,17.102825,14.689589
1,1975-01-01 06:00:00+00:00,Адыгейск,0.883,97.50814,1.4,1.12,0.01,1.0,3.0,358.99493,20.523155
2,1975-01-01 12:00:00+00:00,Адыгейск,-1.067,92.2338,0.0,6.37,0.06,1.0,9.1,39.55964,10.739832
3,1975-01-01 18:00:00+00:00,Адыгейск,-2.117,93.55056,0.0,1.19,0.07,0.0,1.7,34.69522,11.384198
4,1975-01-02 00:00:00+00:00,Адыгейск,-2.867,92.81415,0.0,0.07,0.07,0.0,0.1,53.972538,9.793059
5,1975-01-02 06:00:00+00:00,Адыгейск,-3.667,91.72396,0.0,0.0,0.08,1.0,0.0,86.906006,13.339445
6,1975-01-02 12:00:00+00:00,Адыгейск,-2.167,81.43409,0.0,0.0,0.07,1.0,0.0,61.049107,19.33639
7,1975-01-02 18:00:00+00:00,Адыгейск,-4.567,84.9236,0.0,0.0,0.07,0.0,0.0,47.12111,27.511158
8,1975-01-03 00:00:00+00:00,Адыгейск,-6.217,83.745445,0.0,0.0,0.07,0.0,0.0,52.193413,26.42753
9,1975-01-03 06:00:00+00:00,Адыгейск,-7.917,84.85854,0.0,0.0,0.07,1.0,0.0,47.93576,19.881649


## Загружаем все файлы и объединяем в один

In [None]:
# Ask the user to upload the per-city CSV files through the Colab dialog
uploaded_f = files.upload()

# Read every uploaded file (the keys of the returned mapping are used as paths)
dfs = [pd.read_csv(name) for name in uploaded_f]

# Stack all frames into one table with a fresh 0..n-1 index
big_frame = pd.concat(dfs, ignore_index=True)

# Save the combined dataset and send it to the browser
big_frame.to_csv('smurova_open_meteo.csv', index=False)
files.download('smurova_open_meteo.csv')