In [8]:
from tqdm import tqdm
import requests
import zipfile
import os
from io import BytesIO
import pandas as pd
from tabulate import tabulate
import chardet
import itertools

In [9]:
EXTRACT_DIR = './data/raw'
YEARS = range(2010, 2026)
MONTHS = [f"{i:02d}" for i in range(1, 13)]
PROCEEDS_DIR = './data/processed'

In [10]:
def download_file(url, out_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

    
        with open(out_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
            
        return True
    except Exception as e:
        return False
        
def handle_zip(zip_path, rm_zip=True):
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            extract_path = os.path.splitext(zip_path)[0]
            zip_ref.extractall(extract_path)
        if rm_zip:
            os.remove(zip_path)
        return True
    except Exception as e:
        return False
        

In [11]:
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    total_iterations = len(YEARS) * len(MONTHS)
    
    for y, m in tqdm(itertools.product(YEARS, MONTHS), total=total_iterations, desc="Download progress"):
            uri = f"https://danepubliczne.imgw.pl/pl/datastore/getfiledown/Arch/Telemetria/Meteo/{y}/Meteo_{y}-{m}.zip"
            filename = uri.split('/')[-1]
            out_path = os.path.join(EXTRACT_DIR, filename)
            if download_file(uri, out_path):

                if handle_zip(out_path, rm_zip=True) == False:
                    uri = f"https://danepubliczne.imgw.pl/pl/datastore/getfiledown/Arch/Telemetria/Meteo/{y}/Meteo_{y}-{m}.ZIP"
                    out_path = os.path.join(EXTRACT_DIR, filename, "_2")
                    if download_file(uri, out_path):
                        handle_zip(out_path, rm_zip=True) == False

Download progress: 100%|██████████| 192/192 [13:54<00:00,  4.35s/it]


In [12]:
paths = []
for root, dirs, files in os.walk(EXTRACT_DIR):
    for file in files:
        if file.endswith('.zip'):
            os.remove(os.path.join(root, file))

In [13]:
os.makedirs(EXTRACT_DIR, exist_ok=True)

paths = []
for root, dirs, files in os.walk(EXTRACT_DIR):
    for file in files:
        if file.endswith('.csv'):
            full_path = os.path.join(root, file)
            paths.append(full_path)
            
print(f"\nZnaleziono {len(paths)} plików CSV.")


Znaleziono 1771 plików CSV.


In [14]:
stations_df = pd.read_csv('./data/meta/kody_stacji.csv', sep=';')
stations_df = stations_df.sort_values(by='Nazwa')
print(tabulate(stations_df, headers='keys', tablefmt='psql'))

+-----+-------+-----------+-------------------------+--------------------------------+--------------------------+------------------------+-------------------+
|     |   LP. |        ID | Nazwa                   | Rzeka                          | Szerokość geograficzna   | Długość geograficzna   |   Wysokość n.p.m. |
|-----+-------+-----------+-------------------------+--------------------------------+--------------------------+------------------------+-------------------|
| 548 |   549 | 252220230 | Andrzejewo              | Brok (2671476)                 | 52 49 24                 | 22 11 07               |               117 |
| 363 |   364 | 250210030 | Annopol                 | Wisła (2)                      | 50 53 22                 | 21 50 03               |               165 |
| 501 |   502 | 252150270 | BABIMOST                | Gniła Obra (1568)              | 52 08 37                 | 15 48 12               |                57 |
| 179 |   180 | 249220170 | BALIGRÓD-MCHAWA   

In [15]:
headers = ["Station ID", "Param", "Datetime", "Value"]

CITIES_CODES = [351160424]

raw_dfs=[]
for file in tqdm(paths, desc="Wczytywanie plików CSV"):
    df = pd.read_csv(file, sep=';', header=None, names=headers, encoding='UTF-8', index_col=False, low_memory=False)
    df = df[df["Station ID"].isin(CITIES_CODES)]
    raw_dfs.append(df)

  df = pd.read_csv(file, sep=';', header=None, names=headers, encoding='UTF-8', index_col=False, low_memory=False)
Wczytywanie plików CSV: 100%|██████████| 1771/1771 [22:59<00:00,  1.28it/s]


In [16]:
all_data = pd.concat(raw_dfs, ignore_index=True)
all_data.reset_index(drop=True, inplace=True)

del raw_dfs

print(f"Rozmiar połączonego DataFrame: {all_data.shape}")
print(tabulate(all_data.head(), headers='keys', tablefmt='psql'))

Rozmiar połączonego DataFrame: (4508271, 4)
+----+--------------+---------+------------------+---------+
|    |   Station ID | Param   | Datetime         | Value   |
|----+--------------+---------+------------------+---------|
|  0 |    351160424 | B00300S | 2013-04-01 00:00 | -1,5    |
|  1 |    351160424 | B00300S | 2013-04-01 00:10 | -1,5    |
|  2 |    351160424 | B00300S | 2013-04-01 00:20 | -1,4    |
|  3 |    351160424 | B00300S | 2013-04-01 00:30 | -1,4    |
|  4 |    351160424 | B00300S | 2013-04-01 00:40 | -1,4    |
+----+--------------+---------+------------------+---------+


In [17]:
all_data['Value'] = pd.to_numeric(all_data['Value'], errors='coerce')
all_data_pivot = all_data.pivot_table(index=['Station ID', 'Datetime'], columns='Param', values='Value').reset_index()
print(f"Rozmiar DataFrame po pivotowaniu: {all_data_pivot.shape}")
print(tabulate(all_data_pivot.describe(), headers='keys', tablefmt='psql'))

Rozmiar DataFrame po pivotowaniu: (708934, 13)
+-------+-----------------+-------------+--------------+--------------+-------------+---------------+-----------------+--------------+--------------+------------+-------------+-----------+
|       |      Station ID |     B00202A |      B00300S |      B00305A |     B00604S |       B00606S |         B00608S |      B00702A |      B00703A |    B00714A |     B00802A |   B00910A |
|-------+-----------------+-------------+--------------+--------------+-------------+---------------+-----------------+--------------+--------------+------------+-------------+-----------|
| count | 708934          | 700510      | 130442       | 128591       | 1890        | 62413         | 217270          | 139437       | 231700       | 5470       | 708608      | 149       |
| mean  |      3.5116e+08 |    201.887  |     11.2761  |     11.0578  |    0.566085 |     0.0158925 |      0.00358034 |      2.71116 |      4.42785 |   12.2274  |     74.6113 |  11.9866  |
| std   

In [18]:
params_dic = {
    "B00300S": "Temperatura powietrza (oficjalna)",
    "B00305A": "Temperatura gruntu (czujnik)",
    "B00202A": "Kierunek wiatru (czujnik)",
    "B00702A": "Średnia prędkość wiatru czujnik 10 minut",
    "B00703A": "Prędkość maksymalna (czujnik)",
    "B00608S": "Suma opadu 10 minutowego",
    "B00604S": "Suma opadu dobowego",
    "B00606S": "Suma opadu godzinowego",
    "B00802A": "Wilgotność względna powietrza (czujnik)",
    "B00714A": "Największy poryw w okresie 10min ze stacji Synoptycznej",
    "B00910A": "Zapas wody w śniegu (obserwator)"
}

all_data_pivot.rename(columns=params_dic, inplace=True)
all_data_pivot['Datetime'] = pd.to_datetime(all_data_pivot['Datetime'], format='%Y-%m-%d %H:%M')
all_data_pivot.sort_values(by=['Station ID', 'Datetime'], inplace=True)
all_data_pivot.reset_index(drop=True, inplace=True)

print("OPIS")
print(tabulate(all_data_pivot.describe(), headers='keys', tablefmt='psql'))

print("GŁOWA")
print(tabulate(all_data_pivot.head(), headers='keys', tablefmt='psql'))

OPIS
+-------+-----------------+-------------------------------+-----------------------------+-------------------------------------+--------------------------------+-----------------------+--------------------------+----------------------------+--------------------------------------------+---------------------------------+-----------------------------------------------------------+-------------------------------------------+------------------------------------+
|       |      Station ID | Datetime                      |   Kierunek wiatru (czujnik) |   Temperatura powietrza (oficjalna) |   Temperatura gruntu (czujnik) |   Suma opadu dobowego |   Suma opadu godzinowego |   Suma opadu 10 minutowego |   Średnia prędkość wiatru czujnik 10 minut |   Prędkość maksymalna (czujnik) |   Największy poryw w okresie 10min ze stacji Synoptycznej |   Wilgotność względna powietrza (czujnik) |   Zapas wody w śniegu (obserwator) |
|-------+-----------------+-------------------------------+--------------

In [19]:
os.makedirs(PROCEEDS_DIR, exist_ok=True)
YEARS = all_data_pivot['Datetime'].dt.year.unique()
print(f"YEARS: {YEARS}")

for year in tqdm(YEARS):
    fin_path = os.path.join(PROCEEDS_DIR, f"meteo_wroclaw_{year}.csv")
    df_year = all_data_pivot[all_data_pivot['Datetime'].dt.year == year]
    df_year.to_csv(fin_path, index=False, encoding='UTF-8')

YEARS: [2010 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
 2025]


100%|██████████| 15/15 [00:09<00:00,  1.56it/s]
