# Работа с временными рядами

**Временной ряд** - последовательно измеренные через некоторые (зачастую равные) промежутки времени данные.

Обычно в задачах анализа данных предполагается независимость наблюдений. Во временных рядах, напротив, наблюдения зависят друг от друга во времени, поэтому прогноз строится на основе наблюдений из прошлого.

Компоненты временного ряда:
- Тренд - систематическая линейная или нелинейная компонента, изменяющаяся во времени.
- Сезонность - периодические колебания уровней временного ряда (например, внутри года).
- Цикл - периодические колебания, период которых, как правило, больше одного сезонного периода; не имеют определённой продолжительности.
- Ошибка - непрогнозируемая компонента.
- Уровень - среднее значение временного ряда.

<img width = '800px' src="images/img-2022-02-14-14-55-39.png">

In [2]:
import pandas as pd 
import datetime
import numpy as np 
import matplotlib.pyplot as plt 
import statsmodels.api as sm
from pylab import rcParams
import warnings
from pandas.core.nanops import nanmean as pd_nanmean

from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')
%matplotlib inline

In [276]:
# Load the training data indexed by date and ordered chronologically.
data = pd.read_csv(
    'train.csv',
    sep=',',
    parse_dates=['date'],
    index_col='date',
).sort_index()

# Keep only the rows dated after 2021-04-01 and display them.
X_test = data.loc[data.index > '2021-04-01']
X_test

Unnamed: 0_level_0,warehouse_id,product_id,quantity,id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-02,1,88360,1,65975
2021-04-02,1,95230,1,80331
2021-04-02,0,87980,2,20579
2021-04-02,1,96850,2,82369
2021-04-02,1,94815,4,79600
...,...,...,...,...
2021-04-08,0,96105,1,30789
2021-04-08,0,96275,1,30855
2021-04-08,0,96345,1,30881
2021-04-08,0,95750,1,30377


In [268]:
# Re-read the training data, this time leaving `date` as an ordinary column
# instead of using it as the index.
data = pd.read_csv('train.csv', sep=',', parse_dates=['date'])

In [263]:
# Load the rows to forecast, indexed by date and ordered chronologically.
# `test_X` and `data_test` refer to the same DataFrame.
test_X = data_test = pd.read_csv(
    'test.csv',
    sep=',',
    parse_dates=['date'],
    index_col='date',
).sort_index()
test_X

Unnamed: 0_level_0,product_id,warehouse_id,id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-04-09,71165,0,0
2021-04-09,71165,1,1
2021-04-09,71170,0,2
2021-04-09,71170,1,3
2021-04-09,71185,0,4
...,...,...,...
2021-04-15,98615,1,46027
2021-04-15,98620,0,46028
2021-04-15,98620,1,46029
2021-04-15,98635,0,46030


In [251]:
# Probe a (warehouse, product) pair with no stored series; this lookup raises KeyError.
data_products[1][71350]

KeyError: 71350

In [271]:
# Forecast the quantity for every row of the test set with simple exponential
# smoothing (alpha = 0.1) over that product's stored daily series.
y_test = []
for ind, row in test_X.iterrows():
    # Only the dictionary lookup can legitimately raise KeyError, so it is the
    # only statement guarded by the try block.
    try:
        history = data_products[row['warehouse_id']][row['product_id']]
    except KeyError:
        # No stored series for this (warehouse, product) pair: predict zero.
        res = 0
    else:
        res = exponential_smoothing(history, 0.1)
    # The former `np.append(series, res)` call was removed: np.append returns
    # a new array and its result was discarded, so it never had any effect.
    y_test.append(res)

In [272]:
# Sanity check: total of all values currently held in y_test.
sum(y_test)

9525.21531226523

In [260]:
# Same total as the check above, kept from an earlier execution of the notebook.
sum(y_test)

9525.21531226523

In [274]:
import csv  # unused in this cell; kept in case other cells rely on it

# Write the submission file: a header followed by one "id,quantity" line per
# forecast. The `with` block guarantees the file is flushed and closed, which
# the previous bare open() never did.
with open('test_ans2.csv', 'w') as f:
    f.write("id,quantity\n")
    # Iterate over the forecasts themselves rather than a hard-coded row count
    # (previously range(0, 46032)), so the file always matches len(y_test).
    # NOTE(review): assumes the i-th forecast belongs to test id i - confirm.
    for i, value in enumerate(y_test):
        # int() truncates toward zero, so forecasts below 1 are written as 0.
        f.write(str(i) + ',' + str(int(value)) + "\n")

In [None]:
# Debug loop: print each row's quantity followed by a marker line.
for row_label, record in X_test.iterrows():
    print(record['quantity'])
    print('hello')

In [None]:
# Select the rows of warehouse 0, then print the subset for product 71170.
data_0 = data.loc[data['warehouse_id'] == 0]
print(data_0.loc[data_0['product_id'] == 71170])

In [147]:
# Display the date column of the warehouse-0 subset.
data_0['date']

id
0       2020-12-02
1       2020-12-03
2       2020-12-07
3       2020-12-08
4       2020-12-09
           ...    
31805   2021-04-07
31812   2021-04-06
31816   2021-04-07
31817   2021-04-08
31822   2021-04-06
Name: date, Length: 29792, dtype: datetime64[ns]

In [266]:
# data_products[warehouse_id][product_id] is a NumPy array holding one
# quantity value per calendar day.
data_products = {}


def separate_products(i, frame=None, start='2020-12-02', end='2021-04-08'):
    """Build a dense daily quantity series for every product of warehouse ``i``.

    The series are stored in the module-level ``data_products[i]`` dict, keyed
    by product id. Days without a record are filled with 0. Nothing is
    returned; the warehouse id is printed when the work is done.

    Args:
        i: Warehouse id, compared against the ``warehouse_id`` column.
        frame: DataFrame with ``warehouse_id``, ``product_id``, ``date`` and
            ``quantity`` columns. Defaults to the global ``data``.
        start: First calendar day of every series (inclusive).
        end: Last calendar day of every series (inclusive).
    """
    source = data if frame is None else frame
    calendar = pd.date_range(start=start, end=end, freq='D')

    data_products[i] = {}
    warehouse_rows = source[source['warehouse_id'] == i]

    # Group once per product. The previous loop walked every *row's*
    # product_id and therefore rebuilt the same series once per row.
    for product_id, rows in warehouse_rows.groupby('product_id', sort=False):
        # Sum rows that share a date so the index is unique before reindexing;
        # reindex raises on duplicate labels. With one row per day this sum is
        # a no-op.
        daily = rows.groupby('date')['quantity'].sum()
        # Always stored as float, as the series were whenever a gap was filled.
        data_products[i][product_id] = (
            daily.reindex(calendar, fill_value=0).to_numpy(dtype=float)
        )
    print(i)

In [85]:
from tqdm import tqdm
from multiprocessing.dummy import Pool, Queue

In [269]:
# Build the per-product series for warehouses 0 and 1, in that order.
for warehouse_id in (0, 1):
    separate_products(warehouse_id)

0
1


In [219]:
# Display the daily series of product 71170 in warehouse 0.
# NOTE(review): separate_products stores the result of df['quantity'].to_numpy(),
# and NumPy arrays have no .to_numpy() method - this cell appears to predate
# that change; confirm before re-running.
data_products[0][71170].to_numpy()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 3., 3., 3., 0., 2., 1., 3., 2., 4., 0., 2., 0., 0., 1., 0.,
       0., 1., 2., 3., 2., 1., 2., 3., 0., 7., 4., 3., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 6., 4., 5., 5., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [270]:
def exponential_smoothing(series, alpha):
    """Return the final level of simple exponential smoothing over ``series``.

    The running level starts at ``series[0]`` and, for each later observation,
    becomes ``alpha * observation + (1 - alpha) * level``.

    Args:
        series: Non-empty sequence of observations, indexable by 0..len-1.
        alpha: Smoothing factor, i.e. the weight of the newest observation.

    Returns:
        The smoothed level after the last observation.
    """
    level = series[0]
    count = len(series)
    position = 1
    while position < count:
        level = alpha * series[position] + (1 - alpha) * level
        position += 1
    return level

In [227]:
# Start from an empty list of predictions.
y_pred = []

In [238]:
# Back-test: forecast each hold-out row from its product's daily series with
# the final 7 entries removed, so the forecast never sees the hold-out period.
y_pred = []
for ind, row in X_test.iterrows():
    history = data_products[row['warehouse_id']][row['product_id']][:-7]
    res = exponential_smoothing(history, 0.1)
    # The former `np.append(series, res)` call was removed: np.append returns
    # a new array and its result was discarded, so it never had any effect.
    y_pred.append(res)

In [226]:
# Actual quantities of the hold-out rows, as a NumPy array.
# NOTE: the name y_test is also used elsewhere in this notebook for the list
# of test-set forecasts; the meaning depends on which cell ran last.
y_test = X_test['quantity'].to_numpy()

In [230]:
# Number of values in y_test.
len(y_test)

5209

In [229]:
# Number of predictions in y_pred.
len(y_pred)

5209

In [239]:
# Mean absolute error on the hold-out rows. Arguments follow the documented
# (y_true, y_pred) order; MAE is symmetric, so the value is unchanged.
mean_absolute_error(y_test, y_pred)

1.2132366145364113

In [194]:
# Smooth one product's series (last entry dropped) with alpha = 0.4.
# NOTE(review): the ['quantity'] lookup presumably dates from when
# data_products held DataFrames; separate_products stores plain NumPy arrays,
# which cannot be indexed by a column name - confirm before re-running.
exponential_smoothing(data_products[0][71170]['quantity'][:-1], 0.4)

0.01856359036868414