In [22]:
from lib import *

Pipeline = pipeline.Pipeline
RegressionValidator = pipeline.RegressionValidator

FeatureExtractor = extractor.FeatureExtractor

DatasetProvider = provider.DatasetProvider
DatasetUploader = provider.DatasetUploader
ExpandedWindowIterator = subset_extraction.ExpandedWindowIterator
EntityIterator = subset_extraction.EntityIterator

In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm._tqdm_notebook import tqdm_notebook
from tqdm._tqdm_notebook import tqdm
tqdm_notebook.pandas()

import pickle


pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline


tr = transformers
agg = aggregators

In [24]:
CONFIG = {
    'FETCH_DATA': True,
}

In [25]:
df_provider = DatasetProvider()

In [26]:
dataset_paths = [
    '/home/denissimo/Repo/fs_project/datasets/sample_submission.csv',
    '/home/denissimo/Repo/fs_project/datasets/test.csv',
    '/home/denissimo/Repo/fs_project/datasets/shops.csv',
    '/home/denissimo/Repo/fs_project/datasets/item_categories.csv',
    '/home/denissimo/Repo/fs_project/datasets/sales_train.csv',
    '/home/denissimo/Repo/fs_project/datasets/items.csv'
]

In [27]:
df_provider.file_list = dataset_paths

datasets = df_provider.get_dataset()

In [28]:
sales = datasets['sales_train.csv']
items = datasets['items.csv']
categories = datasets['item_categories.csv']
shops = datasets['shops.csv']

In [29]:
sales.date = sales.date.astype('datetime64[ns]')

print("Before:", sales.shape)

from datetime import date

sales = sales.loc[sales.date < np.datetime64(date(2015, 11, 1))]

sales_train = sales[
    (sales["item_cnt_day"] < 1000)
    & (sales["item_price"] > 0)
    & (sales["item_price"] < 60000)
].copy()
print("After:", sales_train.shape)

sales.head()

Before: (2935849, 6)
After: (2896778, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [30]:
import pickle

def load_pickle(dataset, name):
    dataset.to_pickle(name + '.pkl')

def from_pickle(path):
    with open(path, 'rb') as reader:
        return pickle.load(reader)

In [31]:
index_preprocessing = {}

index_preprocessing['id_merging_stage'] = lambda dataset: dataset.merge(
    shops,
    how='cross'
).merge(
    categories,
    on='item_category_id'
).reset_index().rename({'index': 'id'}, axis=1)



In [32]:
task_df = {}
task_df['idx'] = index_preprocessing['id_merging_stage'](items)

In [33]:
data_preprocessing = {}

# Add corresponding category and shop id's to each sale
data_preprocessing['id_merging_stage'] = lambda dataset: dataset.merge(
    items, 
    on='item_id'
).merge(
    shops,
    on='shop_id'
).merge(
    categories,
    on='item_category_id'
)

# Add summary among shop_id and category_id above similar time periods (daily intervals)
data_preprocessing['summarizing_and_name_merging_stage'] = lambda dataset: dataset.groupby(
    ['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_id']
).agg({'item_cnt_day': np.sum, 'item_price': np.mean}).reset_index().sort_values('date')

data_preprocessing['date_block_num_renaming'] = lambda dataset: dataset.rename(columns={'date_block_num': 'month_block'}, inplace=True)

data_preprocessing['object_id_encoding'] = lambda dataset: dataset.merge(
    task_df['idx'][['id', 'shop_id', 'item_id']], 
    on=['shop_id', 'item_id'])

In [34]:
pipeline = Pipeline(
    tasks=data_preprocessing, 
    task_queue = [
        'id_merging_stage',
        'summarizing_and_name_merging_stage',
        'date_block_num_renaming',
        'object_id_encoding',
    ]
)

In [35]:
pipeline_test = pipeline(sales)
pipeline_train = pipeline(sales_train)

if not CONFIG['FETCH_DATA']:
    _ = pipeline_test.proceed_all()
    _ = pipeline_train.proceed_all()

In [36]:
if CONFIG['FETCH_DATA'] == False:
    task_df['test'] = pipeline_test.result_storage['object_id_encoding']
    task_df['train'] = pipeline_train.result_storage['object_id_encoding']
    load_pickle(task_df['test'], 'tmp/task_df_test')
    load_pickle(task_df['train'], 'tmp/task_df_train')
else:
    task_df['test'] = from_pickle('tmp/task_df_test.pkl')
    task_df['train'] = from_pickle('tmp/task_df_train.pkl')

## Import necessairy methods for extraction

In [37]:
# transformers.py
diff = tr.diff
make_transformer = tr.make_transformer

# aggregators.py
take_subseries = agg.take_subseries
reorder_columns = agg.reorder_columns
aggregate_window_serieses = agg.aggregate_window_serieses
create_aggregation_pipeline = agg.create_aggregation_pipeline

## Configuration

In [38]:
AGGREGATION_CFG = {
    'target': {
        'from': ['id_sales'],
        'func_name': take_subseries,
        'params': {
            'columns': slice(-2, -1),
            'new_name': ['target'],
        },
    },
    'lags': {
        'from': ['lags'],
        'func_name': take_subseries,
        'params': {
            'columns': [f'lag_{i}' for i in [1, 2, 3, 4, 6, 12]],
            'new_name': [f'lag_{i}' for i in [1, 2, 3, 4, 6, 12]],
        },
    },
    'acf_lags': {
        'from': ['lags_acf'],
        'func_name': take_subseries,
        'params': {
            'columns': [f'acf_lag_{i}' for i in [1, 2, 3, 4, 6, 12]],
            'new_name': [f'acf_lag_{i}' for i in [1, 2, 3, 4, 6, 12]],
        },
    },
    'dynamic_aggregation': {
        'from': ['train_series'],
        'func_name': aggregate_window_serieses,
        'params': {
            'funcs': [
                'mean', 
                'std',
                'min',
                'max',
                make_transformer(np.percentile, q=20),
                make_transformer(np.percentile, q=80),
                ],
            'windows': [2, 4, 6, 12],
            'func_names': [
                'mean',
                'std',
                'min',
                'max',
                'percentile_20',
                'percentile_80',
                ]
        },
    },
    'diff_1_aggregation': {
        'from': ['diff_1'],
        'func_name': aggregate_window_serieses,
        'params': {
            'funcs': [
                'mean', 
                'std',
                'min',
                'max',
                ],
            'windows': [2, 4, 6, 12],
            'func_names': [
                'mean',
                'std',
                'min',
                'max',
                ]
        },
    },
    'diff_2_aggregation': {
        'from': ['diff_2'],
        'func_name': aggregate_window_serieses,
        'params': {
            'funcs': [
                'mean', 
                'std',
                'min',
                'max',
                ],
            'windows': [2, 4, 6, 12],
            'func_names': [
                'mean',
                'std',
                'min',
                'max',
                ]
        },
    },
}

In [39]:
GENERATOIN_CFG = {
    'id_sales': {
        'series_order': [],
        'func_name': from_pickle,
        'params': {
            'path': 'tmp/id_sales.pkl'
        }
    },
    'train_series': {
        'series_order': ['id_sales'],
        'func_name': take_subseries,
        'params': { 
            'columns': slice(None,-2), 
            'new_name': [i for i in range(32)]
        }
    },
    'diff_1': {
        'series_order': ['train_series'], 
        'func_name': diff,
        'params': {
            'order': 1
        }
    },
    'diff_2': {
        'series_order': ['train_series'], 
        'func_name': diff,
        'params': {
            'order': 2
        }
    },
    'acf': {
        'series_order': [],
        'func_name': from_pickle,
        'params': {
            'path': 'tmp/acf.pkl'
        }
    },
    'lags': {
        'series_order': ['train_series'],
        'func_name': create_aggregation_pipeline,
        'params': {
            'func_queue': [
                make_transformer(
                    take_subseries, 
                    columns=slice(-13, -1), 
                    new_name=[f'lag_{13 - i}' for i in range(1, 13)]),
                make_transformer(
                    reorder_columns,
                    new_order=[f'lag_{i}' for i in range(1, 13)])
            ]
        }
    },
    'lags_12': {
        'series_order': ['train_series'],
        'func_name': create_aggregation_pipeline,
        'params': {
            'func_queue': [
                make_transformer(
                    take_subseries, 
                    columns=slice(-25, -13), 
                    new_name=[f'lag_12_{13 - i}' for i in range(1, 13)]),
                make_transformer(
                    reorder_columns,
                    new_order=[f'lag_12_{i}' for i in range(1, 13)])
            ]
        }
    },
    'lags_acf_raw': {
        'series_order': ['lags', 'acf'],
        'func_name': np.multiply,
        'params': {}
    },
    'lags_acf': {
        'series_order': ['lags_acf_raw'],
        'func_name': take_subseries,
        'params': {
            'columns':slice(None, None, None), 
            'new_name':[f'acf_lag_{i}' for i in range(1, 13)],
        }
    },
}

In [40]:
baseline_extractor = FeatureExtractor(GENERATOIN_CFG, AGGREGATION_CFG, ['id'])

baseline_train = baseline_extractor({
    'original': task_df['train']
})

  return func(*frames, **kwargs)


In [41]:
with open('tmp/train_extracted.pkl', 'wb+') as writer:
    pickle.dump(baseline_train['features'], writer)

In [42]:
serieses = task_df['train'].groupby(['month_block', 'id']).item_cnt_day.sum().reset_index().set_index('id').index.value_counts()