In [3]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook.
plt.style.use('ggplot')

import datetime
import sys
# Extend the import path with the parent directory before the project-local
# imports below (presumably where `models` and `utils` live -- confirm).
sys.path.append('..')
import models
from utils.modeling_utils import SequenceGeneratorCVPCA

# Auto-reload external scripts without restarting the notebook.
%reload_ext autoreload
%autoreload 2

# Device string passed to SequenceGeneratorCVPCA further down.
DEVICE = "cpu"

# Data preparation: PCA example

In [6]:
# Read the six-city CSV, parse `dt` into datetimes, keep only the Warszawa
# rows and renumber the index from zero.
df = (
    pd.read_csv('../../data/csv/six_cities.csv')
    .assign(dt=lambda frame: pd.to_datetime(frame['dt']))
    .loc[lambda frame: frame['city'].isin(['Warszawa'])]
    .reset_index(drop=True)
)

In [11]:
# Predict the next two days (48 hours) from the preceding 96 hours.
# NOTE: these two widths are not passed to the generator in this cell.
input_width = 96
output_width = 48

# Columns handed over as `normalize_features`. Their distributions are very
# far from normal; the list could just as well be kept as a constant.
normalize_features = [
    'humidity', 'clouds.all', 'rain.1h', 'snow.1h',
    'co', 'no', 'no2', 'so2', 'pm2_5', 'pm10', 'nh3',
]

# Numeric variables we want to use in the model.
numeric_features = [
    'day_sin', 'day_cos',
    'week_sin', 'week_cos',
    'month_sin', 'month_cos',
    'co', 'no', 'temp', 'humidity',
]

# Categorical variables we want to use in the model.
categorical_features = ['state']

# Variables we want to predict.
output_columns = ['co', 'no']

seq_gen = SequenceGeneratorCVPCA(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    output_features=output_columns,
    normalize_features=normalize_features,
    device=DEVICE,
)
# Initialise the generator's preprocessor from the first 10 rows of `df`.
seq_gen.init_preprocessor(df[:10])

In [13]:
# NOTE(review): the positional arguments 5 and False are defined by
# SequenceGeneratorCVPCA.split_data -- 5 is presumably the fold count; confirm.
cities_dfs, cv_indices = seq_gen.split_data(df, 5, False)

# Only the first (train, validation) index pair is used: the loop body ends
# with `break`, so it runs at most once.
for idx_train, idx_val in cv_indices:
    # Stack the selected rows of every frame in `cities_dfs`, then call
    # fit_transform on the training part and transform on the validation part.
    df_train = seq_gen.preprocessor.fit_transform(
        pd.concat([frame.loc[idx_train, :] for frame in cities_dfs], axis=0)
    )
    df_val = seq_gen.preprocessor.transform(
        pd.concat([frame.loc[idx_val, :] for frame in cities_dfs], axis=0)
    )

    dataloader_train, dataloader_val = seq_gen.get_dataloaders(df_train, df_val)
    break

In [14]:
# Build the AR model from the configured sequence generator.
ar_model = models.ARmodel(seq_gen=seq_gen)

# Shape check: run the model on the first training batch only and print the
# input, prediction and target shapes side by side.
for X, y in dataloader_train:
    y_p = ar_model(X)
    print(
        X.shape,
        y_p.shape,
        y.shape,
    )
    break

torch.Size([144, 96, 11]) torch.Size([144, 48, 2]) torch.Size([144, 48, 2])


In [15]:
# Display the training frame returned by seq_gen.preprocessor.fit_transform.
df_train

Unnamed: 0,city,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,co,no,state_Masovian Voivodeship
0,Warszawa,-0.008603,3.151141,1.612999,0.758915,1.408700,-0.332637,0.518047,0.330480,0.659880,-0.402800,2.258471,1.987179,1.0
1,Warszawa,-0.638559,3.094522,1.591384,0.784115,1.445006,-0.366114,0.521170,0.150802,0.731470,-0.490261,2.290676,1.997972,1.0
2,Warszawa,-1.111343,3.045774,1.518638,0.815565,1.474274,-0.392869,0.542560,0.078988,0.657700,-0.576515,2.298846,1.999857,1.0
3,Warszawa,-1.693231,2.954408,1.327948,0.871642,1.507020,-0.414234,0.543939,-0.021327,0.692623,-0.630994,2.278772,1.991655,1.0
4,Warszawa,-1.899948,2.874522,1.131289,0.926430,1.519279,-0.427570,0.573947,0.051334,0.469887,-0.695645,2.206231,1.965591,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851,Warszawa,-0.472328,-1.720549,1.102393,-1.150213,1.054912,-0.510412,1.022675,0.244743,-0.036680,-0.370738,-0.788604,-1.164388,1.0
1852,Warszawa,-0.855199,-1.790900,0.868600,-1.084398,1.084165,-0.504207,1.051571,0.241565,-0.115864,-0.420113,-0.831032,-1.164388,1.0
1853,Warszawa,-1.109377,-1.884641,0.570408,-0.999899,1.108486,-0.492725,1.065158,0.235259,-0.216885,-0.446184,-0.918778,-1.164388,1.0
1854,Warszawa,-1.263076,-1.949570,0.226549,-0.906133,1.122771,-0.477079,1.086073,0.249867,-0.301821,-0.453866,-1.010586,-1.135279,1.0
