Adapted from: https://www.kaggle.com/ragnar123/very-fst-model

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder

In [None]:
from project_lib import Project
# Obtain a handle to the enclosing project via project_lib.
# NOTE(review): `project` is not referenced anywhere in the visible part of
# this notebook; presumably it is used by later cells - confirm.
project = Project.access()

### Read input data

In [None]:
# Load the four raw input tables (sales history, calendar, prices and the
# sample submission) from the project's data-asset directory, in that order.
sales_train_validation, calendar, sell_prices, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/project_data/data_asset/sales_train_validation.csv',
        '/project_data/data_asset/calendar.csv',
        '/project_data/data_asset/sell_prices.csv',
        '/project_data/data_asset/sample_submission.csv',
    )
)

In [None]:
# One row per distinct combination of the six identifier columns; used later
# to attach the identifier columns to the submission frames.
products = sales_train_validation.loc[
    :, ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
].drop_duplicates()

### Reduce memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered; every other column is left untouched.

    Integer columns are cast to the first of int8, int16, int32, int64 whose
    range contains both the column minimum and maximum (bounds inclusive, so
    a column whose maximum is exactly 127 still fits int8). Float columns are
    cast to float16 or float32 by the same range test and otherwise to
    float64.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, to allow ``df = reduce_mem_usage(df)``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. an empty column) no candidate matches
            # and the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the choice is made on value *range* only. float16 keeps
            # roughly three significant decimal digits, so values such as
            # prices may be rounded by this downcast.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink every loaded table (reduce_mem_usage works in place and returns the
# same frame), then ask the garbage collector to release the freed buffers.
sales_train_validation, calendar, sell_prices, submission, products = [
    reduce_mem_usage(frame)
    for frame in (sales_train_validation, calendar, sell_prices, submission, products)
]
gc.collect()

### Define data preparation pipeline

In [None]:
def melt_sales(df):
    """Reshape a wide sales table into long format.

    The six identifier columns are kept as-is; every other column becomes one
    row per (identifier, column) pair, with the former column name stored in
    ``day`` and its value in ``demand``.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    return df.melt(id_vars=id_columns, var_name='day', value_name='demand')

In [None]:
def merge_with_calendar(df, calendar):
    """Left-join calendar attributes onto ``df`` and drop the join keys.

    Rows are matched on ``df['day']`` == ``calendar['d']``. Both key columns
    are removed from the result; rows without a calendar match keep NaN in
    the calendar columns.
    """
    joined = df.merge(calendar, how="left", left_on="day", right_on="d")
    return joined.drop(columns=['d', 'day'])

In [None]:
def merge_with_prices(df, sell_prices):
    """Left-join ``sell_prices`` onto ``df`` by store, item and week key.

    Rows of ``df`` without a matching (store_id, item_id, wm_yr_wk) entry are
    kept, with NaN in the columns coming from ``sell_prices``.
    """
    join_keys = ['store_id', 'item_id', 'wm_yr_wk']
    return pd.merge(df, sell_prices, how='left', on=join_keys)

In [None]:
def fill_na_feats(df, nan_features):
    """Replace missing values with the string ``'unknown'`` in the given columns.

    Args:
        df: DataFrame to modify in place.
        nan_features: Iterable of column names whose NaN/None entries are
            replaced.

    Returns:
        None; ``df`` is updated in place.
    """
    for feature in nan_features:
        # Assign the filled column back instead of calling
        # ``df[feature].fillna(..., inplace=True)``: that form operates on a
        # column selection and does not update ``df`` under pandas
        # Copy-on-Write.
        df[feature] = df[feature].fillna('unknown')

Ideas:

* 
*



In [None]:
def feature_engineering(data):
    """Add lagged-demand, price and event features to ``data`` in place.

    All lag and rolling statistics are computed separately for each ``id``,
    in the current row order of ``data`` (rows of one id are expected to be
    in chronological order already).

    Columns added:
        * ``lag_t28``, ``lag_t29``, ``lag_t30``: demand shifted by 28/29/30 rows.
        * ``rolling_mean_t{7,30,90,180}``, ``rolling_std_t{7,30}``: rolling
          statistics of the demand shifted by 28 rows.
        * ``price_change_t1``: relative change of ``sell_price`` versus the
          previous row.
        * ``price_change_t365``: relative change of ``sell_price`` versus the
          maximum of the previous 365 rows.
        * ``rolling_price_std_t7``, ``rolling_price_std_t30``: rolling
          standard deviation of ``sell_price``.

    Columns replaced:
        * ``event_name_1`` / ``event_name_2`` become the 0/1 indicators
          ``is_event_1`` / ``is_event_2`` (1 where an event name is present).

    Args:
        data: DataFrame with at least ``id``, ``demand``, ``sell_price``,
            ``event_name_1`` and ``event_name_2`` columns.

    Returns:
        The same ``data`` object with the columns described above.
    """
    demand_by_id = data.groupby(['id'])['demand']
    price_by_id = data.groupby(['id'])['sell_price']

    # demand features
    data['lag_t28'] = demand_by_id.shift(28)
    data['lag_t29'] = demand_by_id.shift(29)
    data['lag_t30'] = demand_by_id.shift(30)
    # Rolling statistics are taken over the demand shifted by 28 rows, so each
    # value only uses demand observed at least 28 rows earlier.
    data['rolling_mean_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).mean())
    data['rolling_std_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).std())
    data['rolling_mean_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).mean())
    data['rolling_mean_t90'] = demand_by_id.transform(lambda x: x.shift(28).rolling(90).mean())
    data['rolling_mean_t180'] = demand_by_id.transform(lambda x: x.shift(28).rolling(180).mean())
    data['rolling_std_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).std())

    # price features (intermediate series stay local; they are not stored)
    lag_price_t1 = price_by_id.shift(1)
    data['price_change_t1'] = (lag_price_t1 - data['sell_price']) / lag_price_t1
    rolling_price_max_t365 = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (rolling_price_max_t365 - data['sell_price']) / rolling_price_max_t365
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # event features: 1 when an event name is present, 0 otherwise
    data["event_name_1"] = data["event_name_1"].notna().astype(int)
    data["event_name_2"] = data["event_name_2"].notna().astype(int)
    # rename() returns a new frame unless inplace=True; without it the
    # indicator columns would silently keep their old names.
    data.rename(columns={"event_name_1": "is_event_1",
                         "event_name_2": "is_event_2"},
                inplace=True)
    return data

In [None]:
class CategoricalEncoder:
    """Label-encode a fixed set of DataFrame columns and allow decoding them.

    One ``LabelEncoder`` is fitted per column on ``encode`` and kept in
    ``encoder_dict`` so that ``decode`` can map the integer codes back to the
    original labels.
    """

    def __init__(self, cat_columns):
        """Store the names of the columns to encode.

        Args:
            cat_columns: Iterable of column names handled by this encoder.
        """
        self.encoder_dict = {}  # column name -> fitted LabelEncoder
        self.cat_columns = cat_columns
        # Kept under a private name so it does not shadow the public
        # ``is_encoded`` accessor below.
        self._is_encoded = False

    def encode(self, df):
        """Fit one encoder per configured column and encode ``df`` in place."""
        # Use the columns given to the constructor, not a module-level
        # ``cat_columns`` variable that may or may not exist.
        for column in self.cat_columns:
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            self.encoder_dict[column] = encoder
        self._is_encoded = True

    def decode(self, df):
        """Map the encoded columns of ``df`` back to their labels in place.

        Raises:
            KeyError: If ``encode`` has not been called for a configured column.
        """
        for column in self.cat_columns:
            encoder = self.encoder_dict[column]
            df[column] = encoder.inverse_transform(df[column])

    @property
    def is_encoded(self):
        """bool: whether ``encode`` has been called on this instance."""
        return self._is_encoded

In [None]:
def prepare_data(df):
    """Run the full preparation pipeline on a melted sales frame.

    Steps, in order: join the module-level ``calendar`` table, join the
    module-level ``sell_prices`` table, fill missing event types with
    'unknown', then add the engineered features. Garbage collection is
    triggered after each join.
    """
    merged = merge_with_calendar(df, calendar)
    gc.collect()

    merged = merge_with_prices(merged, sell_prices)
    gc.collect()

    fill_na_feats(merged, ['event_type_1', 'event_type_2'])
    return feature_engineering(merged)

### Split validation and evaluation submissions

In [None]:
# Split the sample submission by whether the id contains 'validation' or
# 'evaluation'.
is_validation = submission['id'].map(lambda sub_id: 'validation' in sub_id).astype(bool)
is_evaluation = submission['id'].map(lambda sub_id: 'evaluation' in sub_id).astype(bool)
validation_rows = submission.loc[is_validation, 'id'].tolist()
evaluation_rows = submission.loc[is_evaluation, 'id'].tolist()
submission_validation = submission.loc[is_validation]
submission_evaluation = submission.loc[is_evaluation]

# Rename the 28 forecast columns to day labels: d_1914..d_1941 for the
# validation part and d_1942..d_1969 for the evaluation part.
submission_validation.columns = ['id'] + ['d_{}'.format(day) for day in range(1914, 1942)]
submission_evaluation.columns = ['id'] + ['d_{}'.format(day) for day in range(1942, 1970)]

# The combined frame is no longer needed once both parts exist.
del submission
gc.collect()

In [None]:
# Attach the product identifier columns to both submission parts. The right
# join keeps every row of `products`, matched on 'id'.
# NOTE(review): `products` ids come from sales_train_validation; if they all
# carry the validation suffix, the evaluation join matches nothing and yields
# NaN forecast columns - confirm this is intended.
submission_validation = pd.merge(submission_validation, products, on='id', how='right')
submission_evaluation = pd.merge(submission_evaluation, products, on='id', how='right')
gc.collect()

### Apply data preparation pipeline to full dataset

In [None]:
# Convert the training table and both submission parts from wide (one column
# per day) to long (one row per id and day) format.
sales_train_validation = sales_train_validation.pipe(melt_sales)
submission_validation = submission_validation.pipe(melt_sales)
submission_evaluation = submission_evaluation.pipe(melt_sales)

In [None]:
# Tag every row with the part it came from, then stack the three parts into a
# single frame (training rows first, then validation, then evaluation).
for frame, label in (
    (sales_train_validation, 'train'),
    (submission_validation, 'test1'),
    (submission_evaluation, 'test2'),
):
    frame['part'] = label

data = pd.concat(
    [sales_train_validation, submission_validation, submission_evaluation],
    axis=0,
)

In [None]:
# Run the preparation pipeline (calendar join, price join, NaN filling and
# feature engineering) on the combined train/test frame.
data = prepare_data(data)

In [None]:
# Encode categorical features
# Columns that are replaced by integer labels; the fitted encoders stay on
# `encoder` so the labels can be decoded later.
cat_columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_type_1', 'event_type_2']
encoder = CategoricalEncoder(cat_columns)
# encode() modifies `data` in place.
encoder.encode(data)

In [None]:
# Remove columns that should not be part of the saved dataset.
drop_columns = ["weekday"]
data.drop(columns=drop_columns, inplace=True)

In [None]:
# Persist the fully prepared frame as a pickle file for later cells/notebooks.
data.to_pickle("/home/wsuser/work/project_data_assets/data_asset/full_data.pkl")

## Denoising techniques

#### Wavelet denoising