### Дополняем таблицу продаж недостающими парами «дата × store_item_code»

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned training table; 'date' is parsed to datetime64 on read.
train_df = pd.read_csv('./data/train-clean.csv', parse_dates=['date'])

# Preview the first rows.
display(train_df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): doing so renders a stray "None" under the report.
train_df.info()

Unnamed: 0,date,store_code,store_item_code,store_nbr,item_nbr,units,units_yesterday,units_prev_week,rolling_mean_4w
0,2012-01-01,s1,1-28,1,28,2,0.0,0.0,2.0
1,2012-01-01,s1,1-40,1,40,0,0.0,0.0,0.0
2,2012-01-01,s1,1-47,1,47,0,0.0,0.0,0.0
3,2012-01-01,s1,1-51,1,51,1,0.0,0.0,1.0
4,2012-01-01,s1,1-89,1,89,0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236038 entries, 0 to 236037
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             236038 non-null  datetime64[ns]
 1   store_code       236038 non-null  object        
 2   store_item_code  236038 non-null  object        
 3   store_nbr        236038 non-null  int64         
 4   item_nbr         236038 non-null  int64         
 5   units            236038 non-null  int64         
 6   units_yesterday  236038 non-null  float64       
 7   units_prev_week  236038 non-null  float64       
 8   rolling_mean_4w  236038 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(2)
memory usage: 16.2+ MB


None

In [28]:
# Build the complete "date x store_item_code" grid: every calendar day between
# the first and last date in train_df, paired with every store-item code that
# occurs in train_df.
all_days = pd.date_range(
    start=train_df['date'].min(),
    end=train_df['date'].max(),
    freq='D',
)
all_codes = train_df['store_item_code'].unique()

calendar_index = pd.MultiIndex.from_product(
    [all_days, all_codes],
    names=['date', 'store_item_code'],
)
full_calendar = calendar_index.to_frame(index=False)

In [31]:
# Split the composite "<store>-<item>" code back into its two integer parts so
# the calendar carries the same key columns as the sales table.
code_parts = full_calendar['store_item_code'].str.extract(r'(\d+)-(\d+)')
full_calendar['store_nbr'] = code_parts[0].astype(int)
full_calendar['item_nbr'] = code_parts[1].astype(int)

In [32]:
# Left-join the sales rows onto the full calendar: (date, code) pairs that are
# absent from train_df are kept, with NaN in every column coming from train_df.
join_keys = ['date', 'store_item_code', 'store_nbr', 'item_nbr']
merged = pd.merge(full_calendar, train_df, how='left', on=join_keys)

# Order by date, then store, then item, and renumber the rows from zero.
merged = merged.sort_values(['date', 'store_nbr', 'item_nbr'])
merged = merged.reset_index(drop=True)

In [34]:
# Rows introduced by the calendar join have no sales data: wherever the unit
# count or one of the lag / rolling features is missing, set it to zero.
num_cols = ['units', 'units_yesterday', 'units_prev_week', 'rolling_mean_4w']
zero_filled = {
    col: merged[col].fillna(0)
    for col in num_cols
    if col in merged.columns
}

# store_code is rebuilt for every row as "s<store_nbr>", which also fills it
# in for the rows added by the join.
merged = merged.assign(
    **zero_filled,
    store_code='s' + merged['store_nbr'].astype(str),
)

# Report the row count before and after, then save the completed table.
print(f"Строк было: {len(train_df):,}  →  стало: {len(merged):,}")
merged.to_csv('./data/train_additional.csv', index=False)

Строк было: 236,038  →  стало: 263,925


### Базовая погода

In [9]:
import pandas as pd

# Load the cleaned weather table; 'date' is parsed to datetime64 on read.
weather_df = pd.read_csv('./data/weather-clean.csv', parse_dates=['date'])

# Order rows by station and date so that each station's rolling window runs
# over that station's rows in date order.
weather_df = weather_df.sort_values(['station_nbr', 'date'])


def _roll14_mean(series):
    """Return the trailing 14-row mean of ``series``; shorter leading windows are allowed."""
    return series.rolling(14, min_periods=1).mean()


# Per-station 14-row rolling means of the temperature columns.
temp_cols = ['tmax', 'tmin', 'tavg']
for temp_col in temp_cols:
    weather_df[f'{temp_col}_roll14'] = (
        weather_df.groupby('station_nbr')[temp_col].transform(_roll14_mean)
    )

# "Baseline" scenario without precipitation: snowfall and total precipitation
# are set to zero and the textual weather code is replaced by a "-" marker.
weather_base = weather_df.assign(snowfall=0, preciptotal=0, codesum='-')

# Keep the smoothed temperatures in place of the raw ones, plus the remaining
# weather columns listed below.
base_columns = [
    'station_nbr', 'date',
    'tmax_roll14', 'tmin_roll14', 'tavg_roll14',
    'snowfall', 'preciptotal',
    'sunrise', 'sunset',
    'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
    'stnpressure', 'sealevel',
    'resultspeed', 'resultdir', 'avgspeed',
    'codesum',
]
smoothed_to_plain = {
    'tmax_roll14': 'tmax',
    'tmin_roll14': 'tmin',
    'tavg_roll14': 'tavg',
}
weather_base = weather_base[base_columns].rename(columns=smoothed_to_plain)

# Save the baseline weather table and report its shape.
weather_base.to_csv('./data/weather_base.csv', index=False)
print('weather_base готов:', weather_base.shape)


weather_base готов: (20517, 20)


<span style="color: red;">Перегенерировать файл weather_base_full.csv</span>

### Делаем большую прогнозную таблицу
Здесь соединяем данные с таблицей train_additional.csv, в которой заполнены пропуски по датам.

In [16]:
import pandas as pd
import numpy as np

# Inputs for the scoring table. No parse_dates is passed, so 'date' stays a
# plain string column in every frame that has one.
train_df = pd.read_csv('./data/train_additional.csv')   # sales table with date gaps filled
key_df = pd.read_csv('./data/key.csv')                  # store <-> station mapping
weather_df = pd.read_csv('./data/weather-clean.csv')    # cleaned weather observations

In [35]:
# Step 1: attach the weather-station id to every sales row through the
# store <-> station mapping held in key_df.
train_plus_station = pd.merge(train_df, key_df, how='left', on='store_nbr')

# Step 2: attach the weather row for that station and date.
# 'date' has to be present in both frames.
weather_join_keys = ['station_nbr', 'date']
train_weather_df = pd.merge(
    train_plus_station, weather_df, how='left', on=weather_join_keys
)

# NOTE: an explicit cols_X feature list used to be spelled out here (commented
# out); the prediction cells select features via model.feature_names_ instead.

# Column groups used when exporting the predictions.
cols_Y = ['units', 'units_pred']
cat_cols = ['store_code', 'store_item_code']   # categorical columns
service_cols = ['date']


In [36]:
from catboost import CatBoostRegressor

# Load the saved CatBoost model file into a fresh regressor instance.
model_path = "../ml-models/CatBoost v1.cbm"
model = CatBoostRegressor()
model.load_model(model_path)


<catboost.core.CatBoostRegressor at 0x7f9285b8da60>

In [19]:
# Show the feature names stored in the loaded model.
print(model.feature_names_)

['store_code', 'store_item_code', 'units_yesterday', 'units_prev_week', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir', 'avgspeed', 'year', 'week', 'BCFG', 'BLDU', 'BLSN', 'BR', 'DU', 'DZ', 'FG', 'FU', 'FZDZ', 'FZFG', 'FZRA', 'GR', 'GS', 'HZ', 'MIFG', 'PL', 'PRFG', 'RA', 'SG', 'SN', 'SQ', 'TS', 'TSRA', 'TSSN', 'UP', 'VCFG', 'VCTS', 'day_of_week', 'month', 'is_weekend', 'is_holiday', 'rain_streak', 'dry_streak', 'avg_temp_next_day', 'rain_next_day', 'days_to_holiday']


In [34]:
# Preview the joined sales + weather table with all of its columns.
train_weather_df  # index with [model.feature_names_] to see only the model inputs

Unnamed: 0,date,store_item_code,store_nbr,item_nbr,store_code,units,units_yesterday,units_prev_week,rolling_mean_4w,station_nbr,...,day_of_week,month,is_weekend,is_holiday,rain_streak,dry_streak,avg_temp_next_day,rain_next_day,days_to_holiday,units_pred
0,2012-01-01,1-9,1,9,s1,29.0,0.0,0.0,29.000000,1,...,6.0,1.0,1.0,0.0,0.0,0.0,50.0,0.0,1.0,132.212478
1,2012-01-01,1-28,1,28,s1,2.0,0.0,0.0,2.000000,1,...,6.0,1.0,1.0,0.0,0.0,0.0,50.0,0.0,1.0,132.212478
2,2012-01-01,1-40,1,40,s1,0.0,0.0,0.0,0.000000,1,...,6.0,1.0,1.0,0.0,0.0,0.0,50.0,0.0,1.0,132.212478
3,2012-01-01,1-47,1,47,s1,0.0,0.0,0.0,0.000000,1,...,6.0,1.0,1.0,0.0,0.0,0.0,50.0,0.0,1.0,132.212478
4,2012-01-01,1-51,1,51,s1,1.0,0.0,0.0,1.000000,1,...,6.0,1.0,1.0,0.0,0.0,0.0,50.0,0.0,1.0,132.212478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263920,2014-10-31,45-9,45,9,s45,10.0,12.0,7.0,19.642857,16,...,4.0,10.0,0.0,0.0,0.0,8.0,47.0,1.0,365.0,75.291906
263921,2014-10-31,45-22,45,22,s45,0.0,2.0,0.0,1.000000,16,...,4.0,10.0,0.0,0.0,0.0,8.0,47.0,1.0,365.0,68.575376
263922,2014-10-31,45-26,45,26,s45,1.0,3.0,0.0,1.250000,16,...,4.0,10.0,0.0,0.0,0.0,8.0,47.0,1.0,365.0,68.645795
263923,2014-10-31,45-34,45,34,s45,0.0,0.0,0.0,0.071429,16,...,4.0,10.0,0.0,0.0,0.0,8.0,47.0,1.0,365.0,67.553880


In [38]:
# Show the rows in which at least one model input column is missing.
feature_frame = train_weather_df[model.feature_names_]
has_missing = feature_frame.isna().any(axis=1)
feature_frame[has_missing]

Unnamed: 0,store_code,store_item_code,units_yesterday,units_prev_week,tmax,tmin,tavg,depart,dewpoint,wetbulb,...,VCTS,day_of_week,month,is_weekend,is_holiday,rain_streak,dry_streak,avg_temp_next_day,rain_next_day,days_to_holiday
192,s35,35-16,0.0,0.0,,,,,,,...,,,,,,,,,,
193,s35,35-24,0.0,0.0,,,,,,,...,,,,,,,,,,
194,s35,35-49,0.0,0.0,,,,,,,...,,,,,,,,,,
195,s35,35-50,0.0,0.0,,,,,,,...,,,,,,,,,,
196,s35,35-63,0.0,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155235,s35,35-50,0.0,0.0,,,,,,,...,,,,,,,,,,
155236,s35,35-63,0.0,0.0,,,,,,,...,,,,,,,,,,
155237,s35,35-66,0.0,0.0,,,,,,,...,,,,,,,,,,
155238,s35,35-93,0.0,0.0,,,,,,,...,,,,,,,,,,


In [39]:
# Drop the rows that lack a value in any model input column. The NaN check is
# restricted to model.feature_names_: a bare dropna() would also discard rows
# whose only gap is in a column the model never reads.
train_weather_df = train_weather_df.dropna(subset=model.feature_names_)

# Predict for every remaining row, passing the columns in the order given by
# model.feature_names_.
Y_production_proba = model.predict(train_weather_df[model.feature_names_])

In [40]:
# Attach the predictions to the scored rows.
train_weather_df = train_weather_df.assign(units_pred=Y_production_proba)

# Export the date, the model inputs, and the columns listed in cols_Y.
export_cols = service_cols + model.feature_names_ + cols_Y
train_weather_df[export_cols].to_csv('./data/prediction_real_weather.csv', index=False)

### Аналогичный датасет, но с дефолтной погодой
-----------

In [53]:
import pandas as pd
import numpy as np

# Inputs for the baseline-weather scoring table. No parse_dates is passed, so
# 'date' stays a plain string column in every frame that has one.
train_df = pd.read_csv('./data/train_additional.csv')          # sales table with date gaps filled
key_df = pd.read_csv('./data/key.csv')                         # store <-> station mapping
weather_base_df = pd.read_csv('./data/weather_base_full.csv')  # baseline weather scenario

In [54]:
# Step 1: attach the weather-station id to every sales row through the
# store <-> station mapping held in key_df.
train_plus_station = pd.merge(train_df, key_df, how='left', on='store_nbr')

# Step 2: attach the baseline weather row for that station and date.
# 'date' has to be present in both frames.
train_weather_df2 = pd.merge(
    train_plus_station, weather_base_df, how='left', on=['station_nbr', 'date']
)

# These weather-code flag columns are set to 0 on every row, and the textual
# weather code is blanked.
zeroed_flags = [
    'BCFG', 'BLDU', 'BLSN', 'FZDZ', 'FZFG', 'FZRA',
    'MIFG', 'PRFG', 'TSRA', 'TSSN', 'VCFG', 'VCTS',
]
train_weather_df2 = train_weather_df2.assign(
    **dict.fromkeys(zeroed_flags, 0),
    codesum='',
)


In [55]:
# Drop the rows that lack a value in any model input column. The NaN check is
# restricted to model.feature_names_: a bare dropna() would also discard rows
# whose only gap is in a column the model never reads.
# (The NaN-row selection that used to open this cell was removed: it was not
# the cell's last expression, so its result was computed and thrown away.)
train_weather_df2 = train_weather_df2.dropna(subset=model.feature_names_)

# Predict for every remaining row, passing the columns in the order given by
# model.feature_names_.
Y_production_proba2 = model.predict(train_weather_df2[model.feature_names_])

In [15]:
# Attach the baseline-weather predictions to the scored rows.
train_weather_df2 = train_weather_df2.assign(units_pred=Y_production_proba2)

# Export the date, the model inputs, and the columns listed in cols_Y.
export_cols2 = service_cols + model.feature_names_ + cols_Y
train_weather_df2[export_cols2].to_csv('./data/prediction_base_weather.csv', index=False)