In [134]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import os
import os.path as path
import sklearn.metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mape
from sklearn.metrics import mean_squared_error as mse
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_features, extract_relevant_features, select_features


%matplotlib inline

In [2]:
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (7, 5), 'figure.dpi': 120})

In [5]:
df = pd.read_csv('../data/raw/stoloto_data.csv',
                       sep=',',
                       parse_dates=['date'], index_col=0)
df.head()

Unnamed: 0,ops_num,game_code,circulation,ufps_num,postamt_num,sales,date
0,117463,7105,216,2,165,57,2017-01-15
2,117218,7105,217,2,165,17,2017-01-22
4,109518,7105,218,2,484,16,2017-01-29
8,119311,7105,209,2,306,22,2016-11-27
13,105043,7105,212,2,224,25,2016-12-18


In [6]:
# planning horizon
ph = 14

In [23]:
timeseries = pd.DataFrame(df.iloc[:, :2])
timeseries['id'] = 0
timeseries["7101_105005"]

0      19.250000
1      19.611111
2      19.000000
3      19.500000
4      20.333333
         ...    
113    21.857143
114    23.000000
115    25.000000
116    20.000000
117    22.666667
Name: 7101_105005, Length: 118, dtype: float64

In [7]:
# timeseries = pd.DataFrame(df.iloc[:, :1])
# timeseries['id'] = 0

extracted_features = extract_features(df, column_id="ops_num", column_kind="game_code", column_value="sales", column_sort="date", impute_function=impute) 
extracted_features.head()

Feature Extraction: 100%|██████████| 10/10 [00:20<00:00,  2.05s/it]


Unnamed: 0,7105__variance_larger_than_standard_deviation,7105__has_duplicate_max,7105__has_duplicate_min,7105__has_duplicate,7105__sum_values,7105__abs_energy,7105__mean_abs_change,7105__mean_change,7105__mean_second_derivative_central,7105__median,...,7103__fourier_entropy__bins_2,7103__fourier_entropy__bins_3,7103__fourier_entropy__bins_5,7103__fourier_entropy__bins_10,7103__fourier_entropy__bins_100,7103__permutation_entropy__dimension_3__tau_1,7103__permutation_entropy__dimension_4__tau_1,7103__permutation_entropy__dimension_5__tau_1,7103__permutation_entropy__dimension_6__tau_1,7103__permutation_entropy__dimension_7__tau_1
103132,,,,,,,,,,,...,,,,,,,,,,
105005,1.0,0.0,0.0,1.0,5324.0,757384.0,29.414414,0.189189,-0.059091,36.0,...,0.372481,0.766798,1.032496,1.655561,3.477232,1.787136,3.105722,4.134775,4.556225,4.650361
105043,1.0,0.0,0.0,1.0,2917.0,153701.0,15.051546,-0.082474,-0.010417,23.0,...,0.617242,0.830027,1.262577,1.867884,3.404463,1.762421,2.99861,4.014683,4.383525,4.549908
105066,1.0,0.0,0.0,1.0,1390.0,53322.0,12.209677,0.048387,0.02459,19.0,...,0.132691,0.354599,0.793748,1.331704,3.144011,1.741195,2.987269,3.923088,4.127134,4.110874
105264,1.0,0.0,1.0,1.0,1960.0,106936.0,12.192308,0.173077,-0.004854,14.0,...,0.593615,0.837499,1.235145,1.822476,3.538273,1.766637,3.083514,4.277879,4.605115,4.672829


In [105]:
df = pd.read_csv('../data/preprocessed/knn_imputated_ts.csv',
                       sep=',',
                       parse_dates=['date'], index_col=0)
df.head()

Unnamed: 0_level_0,7101_105005,7101_105043,7101_105264,7101_105523,7101_105554,7101_105568,7101_107589,7101_109263,7101_109382,7101_109383,...,7175_119311,7175_119619,7175_121374,7175_124460,7175_125222,7175_125239,7175_127204,7175_127276,7175_129110,7175_129164
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-11-06,19.25,23.0,17.0,21.0,28.0,15.0,18.0,13.0,29.222222,17.0,...,14.0,18.0,19.0,15.0,28.0,42.0,15.0,29.0,34.0,17.0
2016-11-13,19.611111,23.0,17.0,21.0,26.814815,18.5,17.777778,13.0,16.0,17.0,...,14.0,18.0,19.0,15.0,28.0,42.0,15.0,29.0,34.0,17.0
2016-11-20,19.0,23.0,17.0,21.0,30.0,18.0,18.0,13.0,33.0,17.0,...,14.0,18.0,19.0,15.0,28.0,42.0,15.0,29.0,34.0,17.0
2016-11-27,19.5,23.0,17.0,21.0,26.0,21.0,18.0,13.0,38.666667,17.0,...,14.0,18.0,19.0,15.0,28.0,42.0,15.0,29.0,34.0,17.0
2016-12-04,20.333333,23.0,16.333333,20.5,24.444444,20.0,17.333333,13.0,16.0,17.0,...,14.0,18.0,19.0,15.0,28.0,42.0,15.0,29.0,34.0,17.0


In [129]:
X = pd.DataFrame(df.iloc[:, 0].shift(14)).dropna()
y = df.iloc[14:, 0]
X['mean_w4'] = X.rolling(4).mean()
X['mean_w8'] = X['7101_105005'].rolling(8).mean()
X['mean_w12'] = X['7101_105005'].rolling(12).mean()
X['deviation'] = X['7101_105005'] - X['7101_105005'].mean()
X['previous'] = X['7101_105005'].shift(1)
X = X.bfill()
X = X.drop(['7101_105005'], axis=1)
X.reset_index(inplace=True)

extracted_features = extract_features(X, column_id='date', column_sort="date", impute_function=impute) 

X_filtered = select_features(extracted_features, y, ml_task='regression')

X_train = extracted_features.iloc[:-14, :]
X_test = extracted_features.iloc[-14:, :]
y_train = y[:-14]
y_test = y[-14:]

classifier_filtered = DecisionTreeRegressor()
classifier_filtered.fit(X_train, y_train)
print(classification_report(y_test, classifier_filtered.predict(X_test)))

Feature Extraction: 100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


ValueError: Unknown label type: 'continuous'

In [140]:
classifier_filtered = DecisionTreeRegressor()
classifier_filtered.fit(X_train, y_train)
#pred = classifier_filtered.predict(X_test)
#print(type(pred))
print(mse(y_test.to_numpy(), classifier_filtered.predict(X_test), squared=False))

7.571485153598529


In [138]:
print(y_test.to_numpy())

[17.         15.         20.         15.         19.         15.
 16.         22.         32.         21.85714286 23.         25.
 20.         22.66666667]


In [139]:

print(pred)

[19. 19. 19. 19. 19. 19. 15. 15. 15. 13. 17. 15. 20. 16.]


In [131]:
y_train = y[:-14]
print(type(y_train))

<class 'pandas.core.series.Series'>


In [117]:
X = pd.DataFrame(df.iloc[:, 0].shift(14))
print(X)

            7101_105005
date                   
2016-11-06          NaN
2016-11-13          NaN
2016-11-20          NaN
2016-11-27          NaN
2016-12-04          NaN
...                 ...
2019-01-06         21.0
2019-01-13         18.0
2019-01-20         18.0
2019-01-27         16.0
2019-02-03         21.0

[118 rows x 1 columns]


In [121]:

print(X_train)

Empty DataFrame
Columns: []
Index: [2017-02-12 00:00:00, 2017-02-19 00:00:00, 2017-02-26 00:00:00, 2017-03-05 00:00:00, 2017-03-12 00:00:00, 2017-03-19 00:00:00, 2017-03-26 00:00:00, 2017-04-02 00:00:00, 2017-04-09 00:00:00, 2017-04-16 00:00:00, 2017-04-23 00:00:00, 2017-04-30 00:00:00, 2017-05-07 00:00:00, 2017-05-14 00:00:00, 2017-05-21 00:00:00, 2017-05-28 00:00:00, 2017-06-04 00:00:00, 2017-06-11 00:00:00, 2017-06-18 00:00:00, 2017-06-25 00:00:00, 2017-07-02 00:00:00, 2017-07-09 00:00:00, 2017-07-16 00:00:00, 2017-07-23 00:00:00, 2017-07-30 00:00:00, 2017-08-06 00:00:00, 2017-08-13 00:00:00, 2017-08-20 00:00:00, 2017-08-27 00:00:00, 2017-09-03 00:00:00, 2017-09-10 00:00:00, 2017-09-17 00:00:00, 2017-09-24 00:00:00, 2017-10-01 00:00:00, 2017-10-08 00:00:00, 2017-10-15 00:00:00, 2017-10-22 00:00:00, 2017-10-29 00:00:00, 2017-11-05 00:00:00, 2017-11-12 00:00:00, 2017-11-19 00:00:00, 2017-11-26 00:00:00, 2017-12-03 00:00:00, 2017-12-10 00:00:00, 2017-12-17 00:00:00, 2017-12-24 00:00:00

In [65]:
y_test = df.iloc[-14:, 0]
print(y_test)

date
2018-11-04    17.000000
2018-11-11    15.000000
2018-11-18    20.000000
2018-11-25    15.000000
2018-12-02    19.000000
2018-12-09    15.000000
2018-12-16    16.000000
2018-12-23    22.000000
2018-12-30    32.000000
2019-01-06    21.857143
2019-01-13    23.000000
2019-01-20    25.000000
2019-01-27    20.000000
2019-02-03    22.666667
Name: 7101_105005, dtype: float64


In [66]:
from tsfresh.examples import robot_execution_failures

robot_execution_failures.download_robot_execution_failures()
df1, y = robot_execution_failures.load_robot_execution_failures()
print(y)

1      True
2      True
3      True
4      True
5      True
      ...  
84    False
85    False
86    False
87    False
88    False
Length: 88, dtype: bool


In [68]:
df1.shape

(1320, 8)

In [70]:
df1.head()

Unnamed: 0,id,time,F_x,F_y,F_z,T_x,T_y,T_z
0,1,0,-1,-1,63,-3,-1,0
1,1,1,0,0,62,-3,-1,0
2,1,2,-1,-1,61,-3,0,0
3,1,3,-1,-1,63,-2,-1,0
4,1,4,-1,-1,63,-3,-1,0


In [71]:
df1.id.value_counts()

88    15
87    15
24    15
25    15
26    15
      ..
61    15
62    15
63    15
64    15
1     15
Name: id, Length: 88, dtype: int64