In [6]:
## Version futuresales lib required = 0.1.8

# !pip install -i https://test.pypi.org/simple/ futuresales_denissimo==0.1.8

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm._tqdm_notebook import tqdm_notebook
from tqdm._tqdm_notebook import tqdm
tqdm_notebook.pandas()

import pickle

import futuresales as fs

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [8]:
from futuresales.utils import load_credentials
import neptune.new as neptune

cred = load_credentials('../credentials.json')['projects']['fs-feature-space']

run = neptune.init(
    api_token=cred['api_token'],
    project=cred['project'],
    tags=['extended', 'baseline']
)  # your credentials


Info (NVML): Driver Not Loaded. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 


https://app.neptune.ai/denissimo/fs-feature-space/e/FSFEAT-4
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [9]:
CONFIG = {
    'FETCH_DATA': True
}

In [10]:
run['dataset_dependecy'] = {
    'original': 'kaggle competitions download -c competitive-data-science-predict-future-sales',
}

In [11]:
df_provider = fs.distribution.DatasetProvider()

df_provider.file_list = [
    '/home/denissimo/Repo/fs_project/datasets/sample_submission.csv',
    '/home/denissimo/Repo/fs_project/datasets/test.csv',
    '/home/denissimo/Repo/fs_project/datasets/shops.csv',
    '/home/denissimo/Repo/fs_project/datasets/item_categories.csv',
    '/home/denissimo/Repo/fs_project/datasets/sales_train.csv',
    '/home/denissimo/Repo/fs_project/datasets/items.csv'
]

datasets = df_provider.get_dataset()

In [12]:
sales = datasets['sales_train.csv']
items = datasets['items.csv']
categories = datasets['item_categories.csv']
shops = datasets['shops.csv']

sales.date = sales.date.astype('datetime64[ns]')


from datetime import date
sales = sales.loc[sales.date < np.datetime64(date(2015, 11, 1))]

print("Before:", sales.shape)

sales_train = sales[
    (sales["item_cnt_day"] < 550)
    & (sales["item_price"] > 0)
    & (sales["item_price"] < 60000)
].copy()
print("After:", sales_train.shape)

sales.head()

Before: (2896782, 6)
After: (2896775, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [13]:
from extraction_configs.pipeline_configs import make_indexes, make_task_df
from extraction_configs.generators_config import make_baseline_train_generator
from extraction_configs.aggregation_configs import make_baseline_train_aggregation, make_baseline_submission_aggregation
from futuresales.distribution import to_pickle, from_pickle
from futuresales.features import FeatureExtractor

pipelines = {
    'idx': make_indexes(shops, categories),
}

task_df = {}
task_df['idx'] = pipelines['idx']['id_merging_stage'](items)

task_df['idx'].to_csv('../tmp/idx.csv')

pipelines['task_df'] = make_task_df(items, shops, categories, task_df['idx'])

In [14]:
from futuresales.pipeline import Pipeline

pipeline = Pipeline(
    tasks=pipelines['task_df'], 
    task_queue = [
        'id_merging_stage',
        'summarizing_and_name_merging_stage',
        'date_block_num_renaming',
        'object_id_encoding',
    ]
)

pipeline_test = pipeline(sales)
pipeline_train = pipeline(sales_train)

if not CONFIG['FETCH_DATA']:
    _ = pipeline_test.proceed_all()
    _ = pipeline_train.proceed_all()
    task_df['test'] = pipeline_test.result_storage['object_id_encoding']
    task_df['train'] = pipeline_train.result_storage['object_id_encoding']
    task_df['test'].to_csv('../tmp/task_df_test.csv')
    task_df['train'].to_csv('../tmp/task_df_train.csv')
else:
    task_df['test'] = pd.read_csv('../tmp/task_df_test.csv')
    task_df['train'] = pd.read_csv('../tmp/task_df_train.csv')

In [15]:
def dyn_agg(months):
    return {f'dynamic_aggregation_{k}_win_{months}':f'dynamic_aggregation_{k}_all' for k in ['mean', 'std', 'min', 'max']}

baseline = {}

file_map = {
    30: 'train_set',
    31: 'test_set',
    32: 'validation_set',
}

train_set = None

for train_max in range(19, 31):
    print(f'\nProcessing train set month - {train_max}')
    baseline_extractor = FeatureExtractor(
        make_baseline_train_generator(train_max), 
        make_baseline_train_aggregation(train_max), 
        ['id'])
    baseline[train_max] = baseline_extractor({
        'original': task_df['train'],
        'test': task_df['test']
        })
    features = baseline[train_max]['features']
    for key in features:
        if not key in ['valid_target', 'target', 'lags']:
            features[key].columns = [
                f'{key}_{agg_type}' for agg_type in features[key].columns.values
                ]
    features = pd.concat(
        features.values(), axis=1, join='inner')
    features = features.rename(dyn_agg(train_max + 1), axis=1)
    features['month'] = train_max % 12
    features['num_month'] = train_max
    if train_set is None:
        train_set = features
    else:
        train_set = pd.concat([train_set, features], axis=0)
    from gc import collect
    collect()

train_set.to_csv(f'../tmp/ext_baseline_{file_map[30]}.csv')


Processing train set month - 19
Subseries stage: id_sales
Subseries stage: id_sales_test
Subseries stage: train_series
Subseries stage: diff_1
Subseries stage: diff_2
Subseries stage: lags
Subseries stage: lags_12

Processing train set month - 20
Subseries stage: id_sales
Subseries stage: id_sales_test
Subseries stage: train_series
Subseries stage: diff_1
Subseries stage: diff_2
Subseries stage: lags
Subseries stage: lags_12

Processing train set month - 21
Subseries stage: id_sales
Subseries stage: id_sales_test
Subseries stage: train_series
Subseries stage: diff_1
Subseries stage: diff_2
Subseries stage: lags
Subseries stage: lags_12

Processing train set month - 22
Subseries stage: id_sales
Subseries stage: id_sales_test
Subseries stage: train_series
Subseries stage: diff_1
Subseries stage: diff_2
Subseries stage: lags
Subseries stage: lags_12

Processing train set month - 23
Subseries stage: id_sales
Subseries stage: id_sales_test
Subseries stage: train_series
Subseries stage: dif

In [1]:
baseline

NameError: name 'baseline' is not defined

In [None]:
for train_max in range(31, 33):
    print(f'\nProcessing set - {file_map[train_max]}')
    baseline_extractor = FeatureExtractor(
        make_baseline_train_generator(train_max), 
        make_baseline_train_aggregation(train_max), 
        ['id'])
    baseline[file_map[train_max]] = baseline_extractor({
        'original': task_df['train'],
        'test': task_df['test']
        })
    features = baseline[file_map[train_max]]['features']
    for key in features:
        if not key in ['valid_target', 'target', 'lags']:
            features[key].columns = [
                f'{key}_{agg_type}' for agg_type in features[key].columns.values
                ]
    features = pd.concat(
        features.values(), axis=1, join='inner')
    features = features.rename(dyn_agg(train_max + 1), axis=1)
    features['month'] = train_max % 12
    features['num_month'] = train_max
    features.to_csv(f'../tmp/ext_baseline_{file_map[train_max]}.csv')

NameError: name 'file_map' is not defined

In [None]:
feature_info = {
    key: baseline[file_map[30]]['features'][key].columns.to_list() for key in baseline[file_map[30]]['features'].keys()
}

In [None]:
run['features'] = feature_info
run['extraction_configs'].upload_files([
    '../featuring/extraction_configs/aggregation_configs.py',
    '../featuring/extraction_configs/pipeline_configs.py',
    '../featuring/extraction_configs/generators_config.py',
    ])

In [None]:
run['extraction_notebook'].upload_files('../featuring/baseline_extraction.ipynb')

In [None]:
run.stop()

Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.


All 1 operations synced, thanks for waiting!
