In [1]:
from modules import transformers
from modules.subset_extraction import ExpandedWindowIterator, EntityIterator, SampleIterator

import pandas as pd
import numpy as np

In [2]:
import os
from typing import Iterable
import pandas as pd

class DatasetProvider:
    def __init__(self, file_formats: Iterable[str]=['csv']):
        self.cwd = os.getcwd()
        self.file_list = []
        for dirname, _, filenames in os.walk(self.cwd):
            for filename in filenames:
                if filename.split('.')[-1] in file_formats:
                    self.file_list.append(os.path.join(dirname, filename))
        return None

    def get_dataset(self, path: str=None):
        if path == None:
            self._frames = {}
            for file in self.file_list:
                _ds_name = file.split('/')[-1]
                self._frames[_ds_name] = pd.read_csv(file)
        else:
            self._frames = pd.read_csv(path)
        return self._frames


class DatasetUploader:
    def __init__(self, path) -> None:
        self._path = path

    def save(self, dataset: pd.DataFrame, filename: str) -> None:
        try:
            with open(os.path.join(self._path, filename), 'w+') as writer:
                dataset.to_csv(writer)
        except:
            raise

In [3]:
provider = DatasetProvider()

In [4]:
dataset_paths = [
    '/home/denissimo/Repo/fs_project/datasets/sample_submission.csv',
    '/home/denissimo/Repo/fs_project/datasets/test.csv',
    '/home/denissimo/Repo/fs_project/datasets/shops.csv',
    '/home/denissimo/Repo/fs_project/datasets/item_categories.csv',
    '/home/denissimo/Repo/fs_project/datasets/sales_train.csv',
    '/home/denissimo/Repo/fs_project/datasets/items.csv'
]

In [5]:
provider.file_list = dataset_paths

datasets = provider.get_dataset()

In [6]:
datasets.keys()

dict_keys(['sample_submission.csv', 'test.csv', 'shops.csv', 'item_categories.csv', 'sales_train.csv', 'items.csv'])

In [7]:
sales = datasets['sales_train.csv']
items = datasets['items.csv']
categories = datasets['item_categories.csv']
shops = datasets['shops.csv']

In [8]:
id_map = sales[['item_id', 'shop_id']] \
    .value_counts() \
    .reset_index() \
    .sort_values(['item_id', 'shop_id']) \
    .reset_index() \
    .drop(0, axis=1) \
    .rename(columns={'index': 'id'})

In [9]:
id_map

Unnamed: 0,id,item_id,shop_id
0,297282,0,54
1,123367,1,55
2,258299,2,54
3,258298,3,54
4,359057,4,54
...,...,...,...
424119,258296,22168,12
424120,307028,22168,16
424121,307027,22168,42
424122,307026,22168,43


In [10]:
sales.merge(id_map)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,id
0,02.01.2013,0,59,22154,999.00,1.0,307100
1,03.01.2013,0,25,2552,899.00,1.0,265625
2,05.01.2013,0,25,2552,899.00,-1.0,265625
3,06.01.2013,0,25,2554,1709.05,1.0,355658
4,15.01.2013,0,25,2555,1099.00,1.0,72451
...,...,...,...,...,...,...,...
2935844,22.10.2015,33,25,7327,349.00,1.0,247014
2935845,24.10.2015,33,25,7315,399.00,1.0,329059
2935846,31.10.2015,33,25,7409,299.00,1.0,212603
2935847,09.10.2015,33,25,7409,299.00,1.0,212603


In [11]:
transformers.aggregate(
    sales, 
    [
        'item_id', 'shop_id', 'date_block_num'
    ], 
    'item_cnt_day', 
    [
        np.sum, np.std, np.mean, np.median
    ]
)['mean'].idxmax()

(20949, 12, 24)

In [12]:
big_df = sales.copy()

In [13]:
full_monthly_dataset = big_df.groupby(['shop_id', 'item_id', 'date_block_num']) \
.item_cnt_day \
.sum() \
.unstack() \
.fillna(0) \
.stack() \
.reset_index() \
.rename(columns={0:'item_cnt_month'}) \
.merge(id_map) \
.astype({
    'shop_id': 'int8',
    'shop_id': 'int16',
    'date_block_num': 'int8',
    'item_cnt_month': 'float32',
    'id': 'int32',
})

In [14]:
full_monthly_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14420216 entries, 0 to 14420215
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   shop_id         int16  
 1   item_id         int64  
 2   date_block_num  int8   
 3   item_cnt_month  float32
 4   id              int32  
dtypes: float32(1), int16(1), int32(1), int64(1), int8(1)
memory usage: 371.3 MB


In [15]:
def target_extractor(month):
    return full_monthly_dataset[full_monthly_dataset.date_block_num == month][['id', 'item_cnt_month']].set_index('id').sort_index()

In [16]:
iteration_rule = ExpandedWindowIterator('date_block_num', target_extractor, min_idx=24, step=1, max_idx=32)

In [17]:
subset_extractor = SampleIterator('id', frac=0.05)

In [18]:
#  for window, split_x, split_y in iteration_rule(full_monthly_dataset):
for _id, subset in subset_extractor(full_monthly_dataset):
    print(_id)

396409
245562
227392
366802
60222
156423
57034
286537
80817
313875
255473
162931
138616
381110
392138
154130
140399
128014
152143
347236
359652
422465
161343
317440
411376
157028
196207
86949
149908
150999
8738
77824
264444
316936
39249
191118
259144
38752
51243
246102
334044
204194
147778
73900
255241
323967
400237
169691
289458
327758
218153
145149
226348
123568
320174
346553
214395
196174
367301
190528
173937
188105
67520
257964
103241
178538
57897
128378
199839
412148
3629
94406
226005
237957
392506
31987
120874
6626
288311
300243
343113
117880
288399
81128
214305
71544
73515
378181
223877
304123
141498
59403
344210
282274
45758
136872
57050
378901
49035
16242
300436
247117
413251
224732
329887
347937
35186
152572
381592
310135
334174
63393
376508
41582
259278
307555
62847
168263
170422
326520
144142
225243
15484
26829
165708
236997
177005
287111
233207
423351
8165
75588
401772
8306
403528
309241
125688
29490
170413
29719
162993
28462
343567
385705
328697
281292
223409
45859
120720

KeyboardInterrupt: 