In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Проверка доступности GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [8]:
df = pd.read_csv("weather_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1305820 entries, 0 to 1305819
Data columns (total 13 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   weather_id  1305820 non-null  int64  
 1   city_id     1305820 non-null  int64  
 2   time        1305820 non-null  object 
 3   tavg        1207223 non-null  float64
 4   tmin        1217650 non-null  float64
 5   tmax        1216483 non-null  float64
 6   prcp        1048725 non-null  float64
 7   snow        248792 non-null   float64
 8   wdir        651699 non-null   float64
 9   wspd        824380 non-null   float64
 10  wpgt        18196 non-null    float64
 11  pres        403416 non-null   float64
 12  tsun        0 non-null        float64
dtypes: float64(10), int64(2), object(1)
memory usage: 129.5+ MB


In [7]:
df[1300000:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5820 entries, 1300000 to 1305819
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   weather_id  5820 non-null   int64  
 1   city_id     5820 non-null   int64  
 2   time        5820 non-null   object 
 3   tavg        5812 non-null   float64
 4   tmin        5820 non-null   float64
 5   tmax        5820 non-null   float64
 6   prcp        5482 non-null   float64
 7   snow        1428 non-null   float64
 8   wdir        4779 non-null   float64
 9   wspd        5802 non-null   float64
 10  wpgt        0 non-null      float64
 11  pres        2140 non-null   float64
 12  tsun        0 non-null      float64
dtypes: float64(10), int64(2), object(1)
memory usage: 591.2+ KB


In [18]:
def clean_data(df):
    # Создаем копию DataFrame, чтобы избежать SettingWithCopyWarning
    df = df.copy()
    
    # 1. Обработка времени
    df['time'] = pd.to_datetime(df['time'])
    df['month'] = df['time'].dt.month
    df['day_of_year'] = df['time'].dt.dayofyear
    
    # 2. Удаление бесполезных столбцов
    df = df.drop(columns=['tsun', 'wpgt', 'wdir'])
    
    # 3. Заполнение пропусков
    df = df.sort_values(by=['city_id', 'time'])
    for col in ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd', 'pres']:
        df[col] = df.groupby('city_id')[col].transform(lambda x: x.interpolate(method='linear'))
        df[col] = df.groupby(['city_id', 'month'])[col].transform(lambda x: x.fillna(x.median()))
    
    # 4. Оптимизация типов данных
    # Приводим float-колонки к float32 (если не нужна высокая точность)
    float_cols = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd', 'pres']
    df[float_cols] = df[float_cols].astype('float32')
    
    # city_id и weather_id тоже можно привести к оптимальному типу
    if 'weather_id' in df.columns:
        df['weather_id'] = df['weather_id'].astype('int32')
    df['city_id'] = df['city_id'].astype('int32')
    
    # 5. Удаление строк с оставшимися пропусками
    df = df.dropna()
    
    return df

In [28]:
df_cleaned = clean_data(df[1200000:].copy())
df_cleaned.info()

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


<class 'pandas.core.frame.DataFrame'>
Index: 98520 entries, 1200000 to 1305819
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   weather_id   98520 non-null  int32         
 1   city_id      98520 non-null  int32         
 2   time         98520 non-null  datetime64[ns]
 3   tavg         98520 non-null  float32       
 4   tmin         98520 non-null  float32       
 5   tmax         98520 non-null  float32       
 6   prcp         98520 non-null  float32       
 7   snow         98520 non-null  float32       
 8   wspd         98520 non-null  float32       
 9   pres         98520 non-null  float32       
 10  month        98520 non-null  int32         
 11  day_of_year  98520 non-null  int32         
dtypes: datetime64[ns](1), float32(7), int32(4)
memory usage: 5.6 MB


In [29]:
# Предобработка данных
def prepare_data(df, sequence_length=7, target_length=7):
    # Разделение на обучающую и тестовую выборки
    train_size = int(0.8 * len(df))
    train_data = df.iloc[:train_size]
    test_data = df.iloc[train_size:]
    
    # Нормализация данных
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_data.drop(columns=['time', 'city_id', 'weather_id']))
    test_scaled = scaler.transform(test_data.drop(columns=['time', 'city_id', 'weather_id']))
    
    # Создание последовательностей
    def create_sequences(data, sequence_length, target_length):
        X, y = [], []
        for i in range(len(data) - sequence_length - target_length + 1):
            X.append(data[i:i+sequence_length])
            y.append(data[i+sequence_length:i+sequence_length+target_length])
        return np.array(X), np.array(y)
    
    X_train, y_train = create_sequences(train_scaled, sequence_length, target_length)
    X_test, y_test = create_sequences(test_scaled, sequence_length, target_length)
    
    return X_train, y_train, X_test, y_test, scaler

# Подготовка данных
X_train, y_train, X_test, y_test, scaler = prepare_data(df_cleaned)

# Преобразование в тензоры PyTorch
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).to(device)

# Создание DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [30]:
for a, b in train_loader:
    print(a.shape,b.shape)
    break

torch.Size([32, 7, 9]) torch.Size([32, 7, 9])


In [31]:
 # Определение модели
class WeatherLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, sequence_length):
        super(WeatherLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * sequence_length, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = out.reshape(out.size(0), -1)  # Выравнивание для полносвязного слоя
        out = self.fc(out)
        return out


In [32]:
# Параметры модели
input_size = X_train.shape[2]  # Количество признаков
hidden_size = 64
num_layers = 2
output_size = y_train.shape[1] * y_train.shape[2]  # Количество прогнозируемых значений
sequence_length = X_train.shape[1]

# Инициализация модели, функции потерь и оптимизатора
model = WeatherLSTM(input_size, hidden_size, num_layers, output_size, sequence_length).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [33]:
# Обучение модели
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.reshape(batch_y.size(0), -1))
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/50], Loss: 0.3336
Epoch [2/50], Loss: 0.2759
Epoch [3/50], Loss: 0.2513
Epoch [4/50], Loss: 0.2393
Epoch [5/50], Loss: 0.2530
Epoch [6/50], Loss: 0.1308
Epoch [7/50], Loss: 0.2700
Epoch [8/50], Loss: 0.1461
Epoch [9/50], Loss: 0.2504
Epoch [10/50], Loss: 0.3795
Epoch [11/50], Loss: 0.2230
Epoch [12/50], Loss: 0.4143
Epoch [13/50], Loss: 0.1432
Epoch [14/50], Loss: 0.2002
Epoch [15/50], Loss: 0.1444
Epoch [16/50], Loss: 0.1763
Epoch [17/50], Loss: 0.1479
Epoch [18/50], Loss: 0.1827
Epoch [19/50], Loss: 0.1978
Epoch [20/50], Loss: 0.2473
Epoch [21/50], Loss: 0.1700
Epoch [22/50], Loss: 0.3610
Epoch [23/50], Loss: 0.3043
Epoch [24/50], Loss: 0.1277
Epoch [25/50], Loss: 0.1947
Epoch [26/50], Loss: 0.2027
Epoch [27/50], Loss: 0.1488
Epoch [28/50], Loss: 0.2487
Epoch [29/50], Loss: 0.1832
Epoch [30/50], Loss: 0.1693
Epoch [31/50], Loss: 0.1765
Epoch [32/50], Loss: 0.1367
Epoch [33/50], Loss: 0.2950
Epoch [34/50], Loss: 0.1201
Epoch [35/50], Loss: 0.1548
Epoch [36/50], Loss: 0.2204
E

In [27]:
# Оценка модели
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test.reshape(y_test.size(0), -1))
    print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 1.0868
