In [1]:
import pandas as pd
import numpy as np

from typing import Callable

In [2]:
data = pd.read_csv("train.csv")

targets = data.columns[-3:]

In [3]:
def window_sliding(data: pd.DataFrame, features: dict[str, list[tuple]], in_place=False):
    if not in_place:
        data = data.copy()
    
    for feature, functions in features.items():
        for func, sizes in functions.items():
            for w_size in sizes:
                rolling = data[feature].rolling(window=w_size, min_periods=1)
                data[f"{feature}_{func.__name__}_{w_size}h"] = rolling.agg(func)
    
    if not in_place:
        return data

In [4]:
data_cp = data.copy()

Зададим размеры окон в часах, основываясь на выделенных периодах. Воспользуемся функциями min, max, std. При необходимости можно изменить периоды или функции.

In [5]:
window_sliding(data_cp, dict(zip(list(targets), [{np.max: [24, 12, 8, 6], np.min: [24, 12, 8, 6], np.std: [24, 12, 8, 6]}, 
                                                           {np.max: [24, 12, 8, 6], np.min: [24, 12, 8, 6], np.std: [24, 12, 8, 6]}, 
                                                           {np.max: [24, 12, 8, 6], np.min: [24, 12, 8, 6], np.std: [24, 12, 8, 6]}])), in_place=True)

In [6]:
data_cp.head(5)

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,...,target_nitrogen_oxides_amax_8h,target_nitrogen_oxides_amax_6h,target_nitrogen_oxides_amin_24h,target_nitrogen_oxides_amin_12h,target_nitrogen_oxides_amin_8h,target_nitrogen_oxides_amin_6h,target_nitrogen_oxides_std_24h,target_nitrogen_oxides_std_12h,target_nitrogen_oxides_std_8h,target_nitrogen_oxides_std_6h
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,...,167.7,167.7,167.7,167.7,167.7,167.7,,,,
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,...,167.7,167.7,98.9,98.9,98.9,98.9,48.648947,48.648947,48.648947,48.648947
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,...,167.7,167.7,98.9,98.9,98.9,98.9,34.585739,34.585739,34.585739,34.585739
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,...,177.2,177.2,98.9,98.9,98.9,98.9,36.409923,36.409923,36.409923,36.409923
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,...,177.2,177.2,98.9,98.9,98.9,98.9,32.89123,32.89123,32.89123,32.89123


Также можно продифферецнировать ряд

In [7]:
def diff(targets_with_n: dict):
    for t, n in targets_with_n.items():
        data_cp[f'{t}_diff'] = data_cp[f'{t}'].diff(n)

In [8]:
diff(dict(zip(targets, [1, 1, 1])))

Также сделаем отдельные параметры для дня, месяца, года. Также сделаем отдельный параметр для сезонов, как это было сделано в EDA

In [9]:
data_cp.date_time = pd.to_datetime(data.date_time)

In [10]:
interval_1 = data.date_time < data.date_time[3450]
interval_2 = (data.date_time > data.date_time[3450]) & (data.date_time < data.date_time[4200])
interval_3 = data.date_time > data.date_time[4200]

data_cp["season"] = np.nan
data_cp.loc[interval_1, 'season'], data_cp.loc[interval_2, 'season'], data_cp.loc[interval_3, 'season'] = 1, 2, 3

In [11]:
data_cp["date_time_hour"] = data_cp.date_time.dt.hour
data_cp["date_time_day"] = data_cp.date_time.dt.day
data_cp["date_time_month"] = data_cp.date_time.dt.month
data_cp["date_time_year"] = data_cp.date_time.dt.year

In [12]:
data_cp.iloc[:, 9:].head(5)

Unnamed: 0,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,target_carbon_monoxide_amax_24h,target_carbon_monoxide_amax_12h,target_carbon_monoxide_amax_8h,target_carbon_monoxide_amax_6h,target_carbon_monoxide_amin_24h,target_carbon_monoxide_amin_12h,target_carbon_monoxide_amin_8h,...,target_nitrogen_oxides_std_8h,target_nitrogen_oxides_std_6h,target_carbon_monoxide_diff,target_benzene_diff,target_nitrogen_oxides_diff,season,date_time_hour,date_time_day,date_time_month,date_time_year
0,2.5,12.0,167.7,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,,,,,,1.0,18,10,3,2010
1,2.1,9.9,98.9,2.5,2.5,2.5,2.5,2.1,2.1,2.1,...,48.648947,48.648947,-0.4,-2.1,-68.8,1.0,19,10,3,2010
2,2.2,9.2,127.1,2.5,2.5,2.5,2.5,2.1,2.1,2.1,...,34.585739,34.585739,0.1,-0.7,28.2,1.0,20,10,3,2010
3,2.2,9.7,177.2,2.5,2.5,2.5,2.5,2.1,2.1,2.1,...,36.409923,36.409923,0.0,0.5,50.1,1.0,21,10,3,2010
4,1.5,6.4,121.8,2.5,2.5,2.5,2.5,1.5,1.5,1.5,...,32.89123,32.89123,-0.7,-3.3,-55.4,1.0,22,10,3,2010


Также нам может понадобиться масштабирование признаков. Если этого не сделать, то в случае применения регуляризации модель будет работать хуже, так как у малых по модулю признаков, возможно, будут большие веса, а регуляризация "штрафует" за это. Также масштабирование повысит интерпретируемость, так как веса можно будет понимать как важность признака.

In [13]:
features = list(data_cp.columns[1:9]) + list(data_cp.columns[12:])

In [14]:
from sklearn.preprocessing import StandardScaler

X, Y = data_cp.dropna()[features], data_cp.dropna()[targets]

scaler = StandardScaler()
features_scaled = scaler.fit_transform(X, Y)

In [22]:
%store features_scaled
%store Y

Stored 'features_scaled' (ndarray)
Stored 'Y' (DataFrame)


In [16]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from validation import TimeSeriesWalkingForwardCV, TimeSeriesWindowCV

optimizer_ridge = TimeSeriesWindowCV(Ridge, 24*30, 24*10)
optimizer_lasso = TimeSeriesWindowCV(Lasso, 24*30, 24*10)
optimizer_DT = TimeSeriesWindowCV(DecisionTreeRegressor, 24*30, 24*10)

In [17]:
optimizer_ridge.fit(features_scaled, np.array(Y))
optimizer_lasso.fit(features_scaled, np.array(Y))
optimizer_DT.fit(features_scaled, np.array(Y))

In [18]:
print(optimizer_ridge.mean_error(),
      optimizer_lasso.mean_error(),
      optimizer_DT.mean_error())

[1.72632949e-01 2.43265028e+00 2.97313736e+03] [1.41056028e+00 5.57926830e+00 3.32416983e+03] [5.86793269e-01 1.07003574e+01 8.46263260e+03]


In [19]:
optimizer_ridge = TimeSeriesWalkingForwardCV(Ridge, test_size=24*10, n_splits=10)
optimizer_lasso = TimeSeriesWalkingForwardCV(Lasso, test_size=24*10, n_splits=10)
optimizer_DT = TimeSeriesWalkingForwardCV(DecisionTreeRegressor, test_size=24*10, n_splits=10)

optimizer_ridge.fit(features_scaled, np.array(Y))
optimizer_lasso.fit(features_scaled, np.array(Y))
optimizer_DT.fit(features_scaled, np.array(Y))

In [20]:
print(optimizer_ridge.mean_error(),
      optimizer_lasso.mean_error(),
      optimizer_DT.mean_error())

[2.43362732e-01 3.50172389e+00 7.05859469e+03] [2.94978265e+00 8.24545517e+00 8.26215260e+03] [1.01380417e+00 2.21091167e+01 1.39082563e+04]


In [21]:
optimizer_ridge.errors

[array([1.28124947e-01, 2.85165764e+00, 3.89915802e+03]),
 array([1.06179639e-01, 2.78537481e+00, 2.18558490e+03]),
 array([1.43330883e-01, 3.80790257e+00, 2.09702405e+03]),
 array([1.33790713e-01, 2.67561347e+00, 4.72870642e+03]),
 array([1.64626099e-01, 3.74577438e+00, 6.57530096e+03]),
 array([1.63526608e-01, 2.23798424e+00, 8.37091976e+03]),
 array([4.75526300e-01, 8.19280866e+00, 1.59468584e+04]),
 array([1.72658176e-01, 3.66617998e+00, 4.51670657e+03]),
 array([7.09392163e-01, 2.38652503e+00, 1.80156329e+04]),
 array([2.36471798e-01, 2.66741816e+00, 4.25005491e+03])]