In [187]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# Fix the RNG seeds so weight initialisation and batch shuffling give the
# same results on every "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [188]:
# Load the raw weather table from a CSV file in the working directory and
# print its column / dtype summary.
df = pd.read_csv("weather_2022.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152256 entries, 0 to 152255
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  152256 non-null  int64  
 1   time        152256 non-null  object 
 2   weather_id  152256 non-null  int64  
 3   city_id     152256 non-null  int64  
 4   tavg        152256 non-null  float64
 5   tmin        152256 non-null  float64
 6   tmax        152256 non-null  float64
 7   prcp        152256 non-null  float64
 8   wspd        152256 non-null  float64
dtypes: float64(5), int64(3), object(1)
memory usage: 10.5+ MB


In [189]:
def clean_data(df):
    """Return a cleaned copy of the raw weather frame.

    Steps:
      * drop the ``Unnamed: 0`` and ``weather_id`` columns when present;
      * parse ``time`` into datetimes;
      * append ``month`` (int8) and ``day_of_year`` (int16) taken from ``time``;
      * downcast the five weather measurements to float32 and ``city_id``
        to int32.

    The input frame is not modified. A two-line summary is printed and the
    result is returned with a fresh 0..n-1 index.
    """
    timestamps = pd.to_datetime(df['time'])

    # Target dtype for every column that gets downcast.
    dtype_map = {name: 'float32' for name in ('tavg', 'tmin', 'tmax', 'prcp', 'wspd')}
    dtype_map['city_id'] = 'int32'

    result = (
        df.drop(columns=['Unnamed: 0', 'weather_id'], errors='ignore')
        .assign(
            time=timestamps,
            month=timestamps.dt.month.astype('int8'),
            day_of_year=timestamps.dt.dayofyear.astype('int16'),
        )
        .astype(dtype_map)
    )

    print("Обработка завершена. Статистика:")
    print(f"Всего строк: {len(result)}")

    return result.reset_index(drop=True)
# Clean a copy of the raw frame and print the schema of the result.
cleaned_data = clean_data(df.copy())
cleaned_data.info()

Обработка завершена. Статистика:
Всего строк: 152256
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152256 entries, 0 to 152255
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   time         152256 non-null  datetime64[ns]
 1   city_id      152256 non-null  int32         
 2   tavg         152256 non-null  float32       
 3   tmin         152256 non-null  float32       
 4   tmax         152256 non-null  float32       
 5   prcp         152256 non-null  float32       
 6   wspd         152256 non-null  float32       
 7   month        152256 non-null  int8          
 8   day_of_year  152256 non-null  int16         
dtypes: datetime64[ns](1), float32(5), int16(1), int32(1), int8(1)
memory usage: 5.1 MB


In [190]:
from collections import defaultdict

def _make_windows(data, seq_len, tgt_len, step, target_indices):
    """Cut a ``(time, features)`` array into sliding (input, target) windows.

    Window ``k`` starts at row ``k * step``. Its input is the next ``seq_len``
    rows with every feature; its target is the ``tgt_len`` rows that follow,
    restricted to the ``target_indices`` columns.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. Both are empty with shape ``(0,)``
        when ``data`` has fewer than ``seq_len + tgt_len`` rows.
    """
    X, y = [], []
    for start in range(0, len(data) - seq_len - tgt_len + 1, step):
        split = start + seq_len
        X.append(data[start:split])
        y.append(data[split:split + tgt_len, target_indices])
    return np.array(X), np.array(y)


def prepare_data(df, sequence_length=360, target_length=90, step=7):
    """Build per-city, per-city-normalised training and test windows.

    The input frame is NOT modified; all derived columns live on an internal
    copy.

    Args:
        df: Frame with ``time``, ``city_id``, ``month``, ``day_of_year`` and
            the five weather columns (``tavg``, ``tmin``, ``tmax``, ``prcp``,
            ``wspd``).
        sequence_length: Number of consecutive rows in each input window.
        target_length: Number of rows to predict after each input window.
        step: Offset in rows between the starts of consecutive windows.

    Returns:
        ``(X_train, y_train, X_test, y_test, scalers, city_weather)`` where

        * ``X_*`` has shape ``[n_windows, sequence_length, 9]`` and ``y_*``
          has shape ``[n_windows, target_length, 5]`` (an empty ``(0,)`` array
          when no city produced a window for that split);
        * ``scalers`` maps ``city_id`` to the ``StandardScaler`` fitted on
          that city's training rows;
        * ``city_weather`` maps ``city_id`` to that city's time-sorted frame,
          including the seasonal sin/cos columns and without ``month`` /
          ``day_of_year``.
    """
    # Model inputs and the subset of them that is predicted.
    input_features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'month_sin', 'month_cos',  'doy_sin', 'doy_cos']
    target_features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd']
    # Column positions of the targets inside a scaled input row.
    target_indices = [input_features.index(f) for f in target_features]

    # Cyclical encoding of the calendar position. `assign`/`drop` return new
    # frames, so the caller's DataFrame is left untouched.
    # NOTE: day 366 of a leap year maps slightly past one full period.
    month_angle = 2 * np.pi * df['month'] / 12
    doy_angle = 2 * np.pi * df['day_of_year'] / 365
    featured = df.assign(
        month_sin=np.sin(month_angle),
        month_cos=np.cos(month_angle),
        doy_sin=np.sin(doy_angle),
        doy_cos=np.cos(doy_angle),
    ).drop(columns=['day_of_year', 'month'])

    # One chronologically ordered frame per city.
    city_weather = {
        city_id: group.sort_values('time').reset_index(drop=True)
        for city_id, group in featured.groupby('city_id')
    }

    all_X_train, all_y_train = [], []
    all_X_test, all_y_test = [], []
    scalers = {}

    for city_id, city_df in city_weather.items():
        # Chronological 80/20 split: the newest rows are held out.
        train_size = int(0.8 * len(city_df))
        train_data = city_df.iloc[:train_size]
        test_data = city_df.iloc[train_size:]

        # Fit on the training rows only, then reuse the same statistics for
        # the test rows so no information leaks from the held-out period.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_data[input_features])
        test_scaled = scaler.transform(test_data[input_features])
        scalers[city_id] = scaler

        X_train_city, y_train_city = _make_windows(
            train_scaled, sequence_length, target_length, step, target_indices)
        X_test_city, y_test_city = _make_windows(
            test_scaled, sequence_length, target_length, step, target_indices)

        # Cities too short to yield a window contribute nothing to that split.
        if len(X_train_city) > 0:
            all_X_train.append(X_train_city)
            all_y_train.append(y_train_city)
        if len(X_test_city) > 0:
            all_X_test.append(X_test_city)
            all_y_test.append(y_test_city)

    # Stack the windows of all cities along the sample axis.
    X_train = np.concatenate(all_X_train, axis=0) if all_X_train else np.array([])
    y_train = np.concatenate(all_y_train, axis=0) if all_y_train else np.array([])
    X_test = np.concatenate(all_X_test, axis=0) if all_X_test else np.array([])
    y_test = np.concatenate(all_y_test, axis=0) if all_y_test else np.array([])

    return X_train, y_train, X_test, y_test, scalers, city_weather

# Build 90-step input / 30-step target windows from a copy of the cleaned frame.
X_train, y_train, X_test, y_test, scalers, city_weather = prepare_data(cleaned_data.copy(), sequence_length=90, target_length=30, step=7)

if len(X_train) > 0:
    # Move the complete arrays to the selected device once, up front.
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test = torch.tensor(y_test, dtype=torch.float32).to(device)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
    # Evaluation data is not shuffled: the validation loss is an average of
    # per-batch means, so a changing composition of the final (shorter) batch
    # would make the reported number vary between otherwise identical passes.
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32, shuffle=False)
else:
    print("Недостаточно данных для создания последовательностей.")


In [191]:
# Display the frame stored under key 1 of `city_weather`.
city_weather[1]

Unnamed: 0,time,city_id,tavg,tmin,tmax,prcp,wspd,month_sin,month_cos,doy_sin,doy_cos
0,2022-05-01,1,6.8,-0.7,13.0,0.9,2.5,0.5,-8.660254e-01,0.871706,-0.490029
1,2022-05-02,1,10.3,5.0,14.1,0.0,2.4,0.5,-8.660254e-01,0.863142,-0.504961
2,2022-05-03,1,12.1,7.2,17.6,0.1,4.5,0.5,-8.660254e-01,0.854322,-0.519744
3,2022-05-04,1,6.8,3.3,9.3,1.4,6.7,0.5,-8.660254e-01,0.845249,-0.534373
4,2022-05-05,1,5.7,0.8,10.0,0.0,4.8,0.5,-8.660254e-01,0.835925,-0.548843
...,...,...,...,...,...,...,...,...,...,...,...
1059,2025-03-25,1,4.9,3.0,6.7,3.9,6.9,1.0,6.123234e-17,0.992222,0.124479
1060,2025-03-26,1,3.4,1.8,5.9,0.0,1.6,1.0,6.123234e-17,0.994218,0.107381
1061,2025-03-27,1,5.5,2.6,7.0,1.9,1.8,1.0,6.123234e-17,0.995919,0.090252
1062,2025-03-28,1,6.6,4.1,9.6,0.0,1.8,1.0,6.123234e-17,0.997325,0.073095


In [192]:
# Sanity check: number of batches in each loader, then the tensor shapes of
# the first training batch only.
n_train_batches, n_test_batches = len(train_loader), len(test_loader)
print(n_train_batches, n_test_batches)
for features, targets in train_loader:
    print(features.shape, targets.shape)
    break

470 63
torch.Size([32, 90, 9]) torch.Size([32, 30, 5])


In [193]:
class WeatherLSTM(nn.Module):
    """LSTM encoder with a linear head that emits the whole forecast at once.

    The LSTM output at the final time step is projected to
    ``target_length * output_size`` values and reshaped to
    ``[batch, target_length, output_size]``.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, target_length):
        super().__init__()
        self.target_length = target_length
        self.output_size = output_size
        # The attribute names `lstm` and `fc` are the prefixes of the keys in
        # the model's state_dict.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, target_length * output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, input_size] to a forecast block."""
        sequence_out, _state = self.lstm(x)
        # Only the output at the last time step feeds the head: [batch, hidden_size].
        flat_forecast = self.fc(sequence_out[:, -1, :])
        # [batch, target_length * output_size] -> [batch, target_length, output_size]
        return flat_forecast.view(-1, self.target_length, self.output_size)


In [194]:
from tqdm import tqdm
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=20):
    """Train ``model`` and report train / validation loss after every epoch.

    Relies on the notebook-level ``device`` (batches are moved there) and on
    ``tqdm`` for the progress bar.

    Args:
        model: Network to optimise; updated in place.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimiser built over ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        The same ``model`` object, left in eval mode after the last
        validation pass.

    The printed losses are the mean of the per-batch losses. A loader that
    yields no batches is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    def _mean(total, count):
        # Average of per-batch losses; nan marks "no batches were seen".
        return total / count if count else float('nan')

    for epoch in range(epochs):
        model.train()
        total_loss, train_batches = 0.0, 0
        for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()

            preds = model(batch_x)  # [batch, target_length, output_size]
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            train_batches += 1

        print(f"Epoch {epoch+1}, Train Loss: {_mean(total_loss, train_batches):.6f}")

        # Validation pass: no gradient tracking, no parameter updates.
        model.eval()
        val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                preds = model(batch_x)
                val_loss += criterion(preds, batch_y).item()
                val_batches += 1
        print(f"Validation Loss: {_mean(val_loss, val_batches):.6f}")

    return model


In [195]:
# Take the network dimensions from the prepared tensors instead of repeating
# them as literals, so the model stays consistent with the windowing settings:
# X_train is [n, seq_len, n_inputs], y_train is [n, target_length, n_targets].
model = WeatherLSTM(
    input_size=X_train.shape[-1],
    hidden_size=128,
    num_layers=2,
    output_size=y_train.shape[-1],
    target_length=y_train.shape[1]
).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

trained_model = train_model(model, train_loader, test_loader, criterion, optimizer, epochs=3)
# Persist the weights as they are after the final epoch.
torch.save(trained_model.state_dict(), "trained_weather_model.pth")


Epoch 1/3: 100%|██████████| 470/470 [00:02<00:00, 188.35it/s]


Epoch 1, Train Loss: 0.498867
Validation Loss: 0.466009


Epoch 2/3: 100%|██████████| 470/470 [00:02<00:00, 205.60it/s]


Epoch 2, Train Loss: 0.455546
Validation Loss: 0.466194


Epoch 3/3: 100%|██████████| 470/470 [00:02<00:00, 205.32it/s]


Epoch 3, Train Loss: 0.429867
Validation Loss: 0.457496


In [196]:
def predict_future_weather(model, scalers_dict, city_weather, forecast_days=30, sequence_length=90):
    """Forecast the weather ``forecast_days`` days ahead for every city.

    The model predicts its whole horizon in one forward pass; the first
    ``forecast_days`` steps are kept and mapped back to original units with
    the city's scaler.

    Args:
        model: Trained network mapping ``[1, sequence_length, 9]`` to
            ``[1, horizon, 5]``.
        scalers_dict: ``city_id`` -> scaler offering ``transform`` and
            ``inverse_transform`` over the nine input features.
        city_weather: ``city_id`` -> DataFrame with ``time`` and the nine
            input features.
        forecast_days: Number of days to return; at most the model horizon.
        sequence_length: Number of most recent rows fed to the model.

    Returns:
        Dict ``city_id`` -> DataFrame with columns ``time``, ``city_id``,
        ``tavg``, ``tmin``, ``tmax``, ``prcp``, ``wspd``. Cities with too
        little history or missing feature columns are skipped with a message.

    Raises:
        ValueError: if ``forecast_days`` is below 1 or exceeds the number of
            steps the model predicts.
    """
    if forecast_days < 1:
        raise ValueError(f"forecast_days must be >= 1, got {forecast_days}")

    model.eval()

    target_columns = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd']
    input_features = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd',
                      'month_sin', 'month_cos', 'doy_sin', 'doy_cos']
    # Positions of the predicted columns inside a scaler row.
    target_indices = [input_features.index(col) for col in target_columns]

    # Feed inputs on the device the model actually lives on, rather than
    # depending on a notebook-level global being in sync with it.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = {}

    with torch.no_grad():
        for city_id, city_df in city_weather.items():
            scaler = scalers_dict[city_id]

            city_df = city_df.sort_values('time').reset_index(drop=True)
            last_sequence = city_df.iloc[-sequence_length:].copy()

            if len(last_sequence) < sequence_length:
                print(f"Недостаточно данных для города {city_id}, пропускаем.")
                continue

            # Every input feature must be present in the history window.
            if not all(feature in last_sequence.columns for feature in input_features):
                print(f"Для города {city_id} отсутствуют нужные признаки, пропускаем.")
                continue

            # Scale the most recent window with the city's own scaler.
            scaled = scaler.transform(last_sequence[input_features].copy())
            input_seq = torch.tensor(scaled, dtype=torch.float32).unsqueeze(0).to(model_device)  # [1, seq_len, features]

            # One forward pass yields the full horizon: [1, horizon, n_targets].
            pred_np = model(input_seq).squeeze(0).cpu().numpy()

            horizon = pred_np.shape[0]
            if forecast_days > horizon:
                raise ValueError(
                    f"forecast_days={forecast_days} exceeds the model horizon of {horizon} steps"
                )
            # A shorter request keeps the leading steps of the prediction.
            pred_np = pred_np[:forecast_days]

            # Calendar dates that follow the last observed day.
            last_date = pd.to_datetime(last_sequence['time'].iloc[-1])
            forecast_dates = [last_date + pd.Timedelta(days=i) for i in range(1, forecast_days + 1)]

            # The scaler expects all nine features, so place the targets into
            # a zero-filled row block, invert, and read the targets back.
            dummy_input = np.zeros((forecast_days, len(input_features)))
            dummy_input[:, target_indices] = pred_np
            pred_original = scaler.inverse_transform(dummy_input)[:, target_indices]

            forecast_df = pd.DataFrame(pred_original, columns=target_columns)
            forecast_df['time'] = forecast_dates
            forecast_df['city_id'] = city_id
            forecast_df = forecast_df[['time', 'city_id'] + target_columns]

            predictions[city_id] = forecast_df

    return predictions


In [197]:
# Reload the weights written by the training cell. No "best epoch" selection
# happens upstream: the file holds the state after the final epoch.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load('trained_weather_model.pth', map_location=device))

# Forecast for every city
weather_forecasts = predict_future_weather(
    model=model,
    scalers_dict=scalers,
    city_weather=city_weather,
    forecast_days=30
)

# Example: look up the forecast of a single city
city_id = 1  # example city_id
forecast_for_city = weather_forecasts[city_id]
print(forecast_for_city.head())

        time  city_id      tavg      tmin       tmax      prcp      wspd
0 2025-03-30        1  8.589891  3.606063  11.898775  0.943502  3.237224
1 2025-03-31        1  6.953579  3.728670  12.067101  1.773439  4.063935
2 2025-04-01        1  7.552721  3.664164  11.797680  1.528804  4.042269
3 2025-04-02        1  7.854909  3.684960  11.603023  1.717092  4.180740
4 2025-04-03        1  7.851921  3.913250  10.462488  1.615091  4.256344


In [198]:
# Show the last 40 rows of the frame stored under key 1 of `city_weather`.
city_weather[1].tail(n=40)

Unnamed: 0,time,city_id,tavg,tmin,tmax,prcp,wspd,month_sin,month_cos,doy_sin,doy_cos
1024,2025-02-18,1,-8.1,-11.1,-5.4,0.0,1.6,0.866025,0.5,0.746972,0.664855
1025,2025-02-19,1,-5.5,-7.9,-4.2,0.2,2.3,0.866025,0.5,0.758306,0.651899
1026,2025-02-20,1,-7.6,-8.9,-6.2,0.0,3.7,0.866025,0.5,0.769415,0.638749
1027,2025-02-21,1,-5.5,-8.7,-2.4,0.0,2.4,0.866025,0.5,0.780296,0.625411
1028,2025-02-22,1,-4.2,-6.4,-1.1,0.0,2.2,0.866025,0.5,0.790946,0.611886
1029,2025-02-23,1,-6.5,-10.2,-2.1,0.0,0.9,0.866025,0.5,0.801361,0.598181
1030,2025-02-24,1,-7.7,-13.2,-2.2,0.0,3.6,0.866025,0.5,0.811539,0.584298
1031,2025-02-25,1,-6.8,-12.6,-0.2,0.0,1.0,0.866025,0.5,0.821477,0.570242
1032,2025-02-26,1,-4.2,-8.6,0.1,0.0,1.9,0.866025,0.5,0.831171,0.556017
1033,2025-02-27,1,-3.8,-9.2,2.6,0.0,0.6,0.866025,0.5,0.840618,0.541628


In [199]:
def update_city_weather_with_forecasts(city_weather, weather_forecasts):
    """Append each city's forecast rows to its weather table.

    For every city in ``weather_forecasts`` the seasonal helper columns are
    dropped from the stored frame and the forecast, rounded to one decimal,
    is concatenated below it.

    Args:
        city_weather: ``city_id`` -> weather DataFrame. Updated in place.
        weather_forecasts: ``city_id`` -> forecast DataFrame. The frames are
            not modified; rounding happens on a copy.

    Returns:
        The same ``city_weather`` mapping, after the update.

    Calling the function again with the same forecasts replaces the rows for
    the forecast dates instead of appending them a second time.
    """
    seasonal_columns = ['month_sin', 'month_cos', 'doy_sin', 'doy_cos']
    target_columns = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd']

    for city_id, forecast_df in weather_forecasts.items():
        # Stored frame for the city, without the seasonal helper features.
        history = city_weather[city_id].drop(columns=seasonal_columns, errors='ignore')

        # Round on a copy so the caller's forecast frames keep full precision.
        rounded = forecast_df.copy()
        rounded[target_columns] = rounded[target_columns].round(1)

        # Rows already stored for the forecast dates are replaced rather than
        # duplicated, which keeps repeated calls idempotent.
        if 'time' in history.columns and 'time' in rounded.columns:
            history = history[~history['time'].isin(rounded['time'])]

        city_weather[city_id] = pd.concat([history, rounded], axis=0, ignore_index=True)

    return city_weather


In [200]:
# Rebind `city_weather` to the mapping returned after appending the forecasts,
# then show the last 40 rows of the frame stored under key 1.
city_weather = update_city_weather_with_forecasts(city_weather, weather_forecasts)
city_weather[1].tail(n=40)

Unnamed: 0,time,city_id,tavg,tmin,tmax,prcp,wspd
1054,2025-03-20,1,2.1,-1.4,8.6,1.1,4.7
1055,2025-03-21,1,3.8,-1.0,9.9,0.55,4.5
1056,2025-03-22,1,2.2,0.0,4.6,0.0,5.5
1057,2025-03-23,1,1.3,-4.7,5.7,0.0,2.9
1058,2025-03-24,1,4.6,-2.4,10.7,0.0,5.0
1059,2025-03-25,1,4.9,3.0,6.7,3.9,6.9
1060,2025-03-26,1,3.4,1.8,5.9,0.0,1.6
1061,2025-03-27,1,5.5,2.6,7.0,1.9,1.8
1062,2025-03-28,1,6.6,4.1,9.6,0.0,1.8
1063,2025-03-29,1,8.0,2.2,12.9,0.0,1.8
