In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit,StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold,train_test_split,GroupShuffleSplit,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error,log_loss,confusion_matrix
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
#from bayes_opt import BayesianOptimization
import re
from string import punctuation
from scipy.spatial import Voronoi
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
from tqdm.notebook import tqdm
from numba import jit
from collections import Counter
import json
import joblib
import multiprocessing
import time

In [3]:
c = pd.DataFrame()

In [None]:
c.drop_duplicates()

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int32', 'int64', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
sample_submission = pd.read_csv('../data/sample_submission.csv')
sell_prices = pd.read_csv('../data/sell_prices.csv')
sales_train = pd.read_csv('../data/sales_train_validation.csv')
calendar = pd.read_csv('../data/calendar.csv')

In [4]:
####扩展sales_train df方便后续
#sales_train = pd.read_csv('../data/sales_train_validation.csv')
for _ in [f'd_{i}' for i in range(1914,1970)]:
    sales_train[_] = np.nan

In [5]:
sales_train_long_format = pd.melt(sales_train,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],var_name = 'day_num',value_name='sale')

In [6]:
def transform_day_to_num(str1):
    return int(str1[2:])
sales_train_long_format['day_num'] = sales_train_long_format['day_num'].map(transform_day_to_num)
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['day_num'] = calendar['d'].map(transform_day_to_num)
map_day_date = calendar[['date','day_num']].set_index('day_num')['date']
sales_train_long_format['date'] = sales_train_long_format['day_num'].map(map_day_date)
list1 = ['wm_yr_wk','event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA', 'snap_TX', 'snap_WI','day_num']
sales_train_long_format = sales_train_long_format.merge(calendar[list1],on='day_num',how='left')

In [7]:
sales_train_long_format = sales_train_long_format.merge(sell_prices,how='left',on = ['store_id','item_id','wm_yr_wk'])
###把没卖的变成nan
sales_train_long_format.loc[pd.isna(sales_train_long_format.sell_price),'sale'] = np.nan
sales_train_long_format = reduce_mem_usage(sales_train_long_format)

Mem. usage decreased to 6526.92 Mb (25.0% reduction)


In [8]:
truncated_data = sales_train_long_format.copy()

### part1

In [9]:
#### lag系列
def lag_sale_1(x):
    return x.shift(1)
def lag_sale_2(x):
    return x.shift(2)
def lag_sale_3(x):
    return x.shift(3)
def lag_sale_4(x):
    return x.shift(4)
def lag_sale_5(x):
    return x.shift(5)
def lag_sale_6(x):
    return x.shift(6)
def lag_sale_7(x):
    return x.shift(7)
#### 
def rolling_sale_3(x):
    return x.shift(1).rolling(3,3).agg(['mean'])
def rolling_sale_7(x):
    return x.shift(1).rolling(7,int(7/2)).agg(['mean','std','median'])
def rolling_sale_14(x):
    return x.shift(1).rolling(14,int(28/2)).agg(['mean','std','median'])
def rolling_sale_28(x):
    return x.shift(1).rolling(28,int(28/2)).agg(['mean','std','median'])
def rolling_sale_91(x):
    return x.shift(1).rolling(91,int(91/2)).agg(['mean','std','median'])
def rolling_sale_182(x):
    return x.shift(1).rolling(182,50).agg(['mean','std','median'])
def rolling_sale_364(x):
    return x.shift(1).rolling(364,50).agg(['mean','std','median'])


def rolling_sale_7_quantile25(x):
    return x.shift(1).rolling(7,int(7/2)).quantile(0.25)
def rolling_sale_14_quantile25(x):
    return x.shift(1).rolling(14,int(14/2)).quantile(0.25)
def rolling_sale_28_quantile25(x):
    return x.shift(1).rolling(28,int(28/2)).quantile(0.25)
def rolling_sale_91_quantile25(x):
    return x.shift(1).rolling(91,int(91/2)).quantile(0.25)
def rolling_sale_182_quantile25(x):
    return x.shift(1).rolling(182,50).quantile(0.25)
def rolling_sale_364_quantile25(x):
    return x.shift(1).rolling(364,50).quantile(0.25)

def rolling_sale_7_quantile75(x):
    return x.shift(1).rolling(7,int(7/2)).quantile(0.75)
def rolling_sale_14_quantile75(x):
    return x.shift(1).rolling(14,int(14/2)).quantile(0.75)
def rolling_sale_28_quantile75(x):
    return x.shift(1).rolling(28,int(28/2)).quantile(0.75)
def rolling_sale_91_quantile75(x):
    return x.shift(1).rolling(91,int(91/2)).quantile(0.75)
def rolling_sale_182_quantile75(x):
    return x.shift(1).rolling(182,50).quantile(0.75)
def rolling_sale_364_quantile75(x):
    return x.shift(1).rolling(364,50).quantile(0.75)
####
def rolling_sale_dayofweek_2(x):
    return x.shift(1).rolling(2,1).agg(['mean'])
def rolling_sale_dayofweek_4(x):
    return x.shift(1).rolling(4,1).agg(['mean','std'])
def rolling_sale_dayofweek_8(x):
    return x.shift(1).rolling(8,2).agg(['mean','std','median'])
def rolling_sale_dayofweek_12(x):
    return x.shift(1).rolling(12,4).agg(['mean','std','median'])
def rolling_sale_dayofweek_24(x):
    return x.shift(1).rolling(24,8).agg(['mean','std','median'])
def rolling_sale_dayofweek_52(x):
    return x.shift(1).rolling(52,8).agg(['mean','std','median'])

def rolling_sale_dayofweek_8_quantile25(x):
    return x.shift(1).rolling(8,2).quantile(0.25)
def rolling_sale_dayofweek_12_quantile25(x):
    return x.shift(1).rolling(12,4).quantile(0.25)
def rolling_sale_dayofweek_24_quantile25(x):
    return x.shift(1).rolling(24,8).quantile(0.25)
def rolling_sale_dayofweek_52_quantile25(x):
    return x.shift(1).rolling(52,16).quantile(0.25)

def rolling_sale_dayofweek_8_quantile75(x):
    return x.shift(1).rolling(8,2).quantile(0.75)
def rolling_sale_dayofweek_12_quantile75(x):
    return x.shift(1).rolling(12,4).quantile(0.75)
def rolling_sale_dayofweek_24_quantile75(x):
    return x.shift(1).rolling(24,8).quantile(0.75)
def rolling_sale_dayofweek_52_quantile75(x):
    return x.shift(1).rolling(52,16).quantile(0.75)

In [10]:
def parall_group_by_id_sale(df,func,n_cores,name_feature):
    with multiprocessing.Pool(n_cores) as p:
        ret_list = list(tqdm(p.imap(func, [group for name, group in df.groupby(['id'])['sale']]), total=30490))
        temp = pd.concat(ret_list,axis=0).sort_index()
        print(f'Features {name_feature} over!!')
    return temp

def parall_group_by_id_dayofweek_sale(df,func,n_cores,name_feature):
    with multiprocessing.Pool(n_cores) as p:
        ret_list = list(tqdm(p.imap(func, [group for name, group in df.groupby(['id','dayofweek'])['sale']]), total=30490*7))
        temp = pd.concat(ret_list,axis=0).sort_index()
        print(f'Features {name_feature} over!!')
    return temp

In [11]:
def feature_engineer(df,num_core,date1):
    """
    df format:sales_train_long_format
    """
    df = df.copy()
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    #df['week'] = df['date'].dt.week
    #df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    #df['is_quarter_start'] = df['date'].dt.is_quarter_start.astype(int)
    #df['is_quarter_end'] = df['date'].dt.is_quarter_end.astype(int)
    #df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    #df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    ### shift by day
    df['lag_sale_1'] = parall_group_by_id_sale(df,lag_sale_1,num_core,'lag_sale_1')
    df['lag_sale_2'] = parall_group_by_id_sale(df,lag_sale_2,num_core,'lag_sale_2')
    df['lag_sale_3'] = parall_group_by_id_sale(df,lag_sale_3,num_core,'lag_sale_3')
    df['lag_sale_4'] = parall_group_by_id_sale(df,lag_sale_4,num_core,'lag_sale_4')
    df['lag_sale_5'] = parall_group_by_id_sale(df,lag_sale_5,num_core,'lag_sale_5')
    df['lag_sale_6'] = parall_group_by_id_sale(df,lag_sale_6,num_core,'lag_sale_6')
    df['lag_sale_7'] = parall_group_by_id_sale(df,lag_sale_7,num_core,'lag_sale_7')
    ### rolling
    df['rolling_sale_3_mean'] = parall_group_by_id_sale(df,rolling_sale_3,num_core,'rolling_sale_3')
    df[['rolling_sale_7_mean','rolling_sale_7_std','rolling_sale_7_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_7,num_core,'rolling_sale_7')
    df[['rolling_sale_14_mean','rolling_sale_14_std','rolling_sale_14_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_14,num_core,'rolling_sale_14')
    df[['rolling_sale_28_mean','rolling_sale_28_std','rolling_sale_28_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_28,num_core,'rolling_sale_28')
    df[['rolling_sale_91_mean','rolling_sale_91_std','rolling_sale_91_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_91,num_core,'rolling_sale_91')
    df[['rolling_sale_182_mean','rolling_sale_182_std','rolling_sale_182_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_182,num_core,'rolling_sale_182')
    df[['rolling_sale_364_mean','rolling_sale_364_std','rolling_sale_364_median']] \
                    = parall_group_by_id_sale(df,rolling_sale_364,num_core,'rolling_sale_364')
    for num in [7,14,28,91,182,364]:
        df[f'rolling_sale_{str(num)}_cv'] = df[f'rolling_sale_{str(num)}_std']/df[f'rolling_sale_{str(num)}_mean'] 
    ### quantile 25
    df['rolling_sale_7_quantile25'] = parall_group_by_id_sale(df,rolling_sale_7_quantile25,num_core,'rolling_sale_7_quantile25')
    df['rolling_sale_14_quantile25'] = parall_group_by_id_sale(df,rolling_sale_14_quantile25,num_core,'rolling_sale_14_quantile25')
    df['rolling_sale_28_quantile25'] = parall_group_by_id_sale(df,rolling_sale_28_quantile25,num_core,'rolling_sale28_quantile25')
    df['rolling_sale_91_quantile25'] = parall_group_by_id_sale(df,rolling_sale_91_quantile25,num_core,'rolling_sale_91_quantile25')
    df['rolling_sale_182_quantile25'] = parall_group_by_id_sale(df,rolling_sale_182_quantile25,num_core,'rolling_sale_182_quantile25')
    df['rolling_sale_364_quantile25'] = parall_group_by_id_sale(df,rolling_sale_364_quantile25,num_core,'rolling_sale_364_quantile25')
    ### quantile 75
    df['rolling_sale_7_quantile75'] = parall_group_by_id_sale(df,rolling_sale_7_quantile75,num_core,'rolling_sale_7_quantile75')
    df['rolling_sale_14_quantile75'] = parall_group_by_id_sale(df,rolling_sale_14_quantile75,num_core,'rolling_sale_14_quantile75')
    df['rolling_sale_28_quantile75'] = parall_group_by_id_sale(df,rolling_sale_28_quantile75,num_core,'rolling_sale28_quantile75')
    df['rolling_sale_91_quantile75'] = parall_group_by_id_sale(df,rolling_sale_91_quantile75,num_core,'rolling_sale_91_quantile75')
    df['rolling_sale_182_quantile75'] = parall_group_by_id_sale(df,rolling_sale_182_quantile75,num_core,'rolling_sale_182_quantile75')
    df['rolling_sale_364_quantile75'] = parall_group_by_id_sale(df,rolling_sale_364_quantile75,num_core,'rolling_sale_364_quantile75')
    df = reduce_mem_usage(df,False)
    ### dayofweek mean std sum
    df['rolling_sale_dayofweek_2_mean'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_2,num_core,'rolling_sale_dayofweek_2')
    df[['rolling_sale_dayofweek_4_mean','rolling_sale_dayofweek_4_std']] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_4,num_core,'rolling_sale_dayofweek_4')
    df[['rolling_sale_dayofweek_8_mean','rolling_sale_dayofweek_8_std','rolling_sale_dayofweek_8_median']] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_8,num_core,'rolling_sale_dayofweek_8')
    df[['rolling_sale_dayofweek_12_mean','rolling_sale_dayofweek_12_std','rolling_sale_dayofweek_12_median']] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_12,num_core,'rolling_sale_dayofweek_12')
    df[['rolling_sale_dayofweek_24_mean','rolling_sale_dayofweek_24_std','rolling_sale_dayofweek_24_median']] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_24,num_core,'rolling_sale_dayofweek_24')
    df[['rolling_sale_dayofweek_52_mean','rolling_sale_dayofweek_52_std','rolling_sale_dayofweek_52_median']] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_52,num_core,'rolling_sale_dayofweek_52')
    for num in [4,8,12,24,52]:
        df[f'rolling_sale_dayofweek_{str(num)}_cv'] = df[f'rolling_sale_dayofweek_{str(num)}_std']/df[f'rolling_sale_dayofweek_{str(num)}_mean']
        
    df = reduce_mem_usage(df,False)
    df['rolling_sale_dayofweek_8_quantile25'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_8_quantile25,num_core,'rolling_sale_dayofweek_8_quantile25')
    df['rolling_sale_dayofweek_12_quantile25'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_12_quantile25,num_core,'rolling_sale_dayofweek_12_quantile25')
    df['rolling_sale_dayofweek_24_quantile25'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_24_quantile25,num_core,'rolling_sale_dayofweek_24_quantile25')
    df['rolling_sale_dayofweek_52_quantile25'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_52_quantile25,num_core,'rolling_sale_dayofweek_52_quantile25')
    
    df['rolling_sale_dayofweek_8_quantile75'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_8_quantile75,num_core,'rolling_sale_dayofweek_8_quantile75')
    df['rolling_sale_dayofweek_12_quantile75'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_12_quantile75,num_core,'rolling_sale_dayofweek_12_quantile75')
    df['rolling_sale_dayofweek_24_quantile75'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_24_quantile75,num_core,'rolling_sale_dayofweek_24_quantile75')
    df['rolling_sale_dayofweek_52_quantile75'] = \
                parall_group_by_id_sale(df,rolling_sale_dayofweek_52_quantile75,num_core,'rolling_sale_dayofweek_52_quantile75')
    
    df = reduce_mem_usage(df,False)
    ### price
    ### shift 1
    for j in [1]:
        df[f'lag_price{j}'] = df.groupby(['id'])['sell_price'].transform(lambda x:x.shift(j*7))
        df[f'ratio_now_lag{j}price'] = df['sell_price']/df[f'lag_price{j}']
        print(j)


    return df

In [12]:
print(1)
train1 = feature_engineer(truncated_data,12,None)

1


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_1 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_2 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_3 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_4 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_5 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_6 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features lag_sale_7 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_3 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_7 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_28 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_91 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_182 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_364 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_7_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_14_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale28_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_91_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_182_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_364_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_7_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_14_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale28_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_91_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_182_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_364_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_2 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_4 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_8 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_12 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_24 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_52 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_8_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_12_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_24_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_52_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_8_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_12_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_24_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_sale_dayofweek_52_quantile75 over!!
1


In [22]:
gc.collect()

129

In [13]:
error_fea = ['rolling_sale_dayofweek_8_quantile25','rolling_sale_dayofweek_12_quantile25','rolling_sale_dayofweek_24_quantile25',
            'rolling_sale_dayofweek_52_quantile25','rolling_sale_dayofweek_8_quantile75','rolling_sale_dayofweek_12_quantile75',
            'rolling_sale_dayofweek_24_quantile75','rolling_sale_dayofweek_52_quantile75']

In [14]:
len(error_fea)

8

In [15]:
train1.drop(error_fea,axis=1,inplace=True)

In [18]:
def feature_engineer(df,num_core,date1):
    """
    df format:sales_train_long_format
    """
    
    df = df.copy()
    df['dayofweek'] = df['date'].dt.dayofweek
    df['rolling_sale_dayofweek_8_quantile25'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_8_quantile25,num_core,'rolling_sale_dayofweek_8_quantile25')
    df['rolling_sale_dayofweek_12_quantile25'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_12_quantile25,num_core,'rolling_sale_dayofweek_12_quantile25')
    df['rolling_sale_dayofweek_24_quantile25'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_24_quantile25,num_core,'rolling_sale_dayofweek_24_quantile25')
    df['rolling_sale_dayofweek_52_quantile25'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_52_quantile25,num_core,'rolling_sale_dayofweek_52_quantile25')
    
    df['rolling_sale_dayofweek_8_quantile75'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_8_quantile75,num_core,'rolling_sale_dayofweek_8_quantile75')
    df['rolling_sale_dayofweek_12_quantile75'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_12_quantile75,num_core,'rolling_sale_dayofweek_12_quantile75')
    df['rolling_sale_dayofweek_24_quantile75'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_24_quantile75,num_core,'rolling_sale_dayofweek_24_quantile75')
    df['rolling_sale_dayofweek_52_quantile75'] = \
                parall_group_by_id_dayofweek_sale(df,rolling_sale_dayofweek_52_quantile75,num_core,'rolling_sale_dayofweek_52_quantile75')
    
    df = reduce_mem_usage(df,False)



    return df

In [19]:
print(1)
train2 = feature_engineer(truncated_data,12,None)

1


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_8_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_12_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_24_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_52_quantile25 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_8_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_12_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_24_quantile75 over!!


HBox(children=(FloatProgress(value=0.0, max=213430.0), HTML(value='')))


Features rolling_sale_dayofweek_52_quantile75 over!!


In [20]:
train2[error_fea + ['id','day_num']].head()

Unnamed: 0,rolling_sale_dayofweek_8_quantile25,rolling_sale_dayofweek_12_quantile25,rolling_sale_dayofweek_24_quantile25,rolling_sale_dayofweek_52_quantile25,rolling_sale_dayofweek_8_quantile75,rolling_sale_dayofweek_12_quantile75,rolling_sale_dayofweek_24_quantile75,rolling_sale_dayofweek_52_quantile75,id,day_num
0,,,,,,,,,HOBBIES_1_001_CA_1_validation,1
1,,,,,,,,,HOBBIES_1_002_CA_1_validation,1
2,,,,,,,,,HOBBIES_1_003_CA_1_validation,1
3,,,,,,,,,HOBBIES_1_004_CA_1_validation,1
4,,,,,,,,,HOBBIES_1_005_CA_1_validation,1


In [None]:
pd.join()

In [22]:
del sales_train_long_format

In [23]:
gc.collect()

3098

In [24]:
del truncated_data

In [25]:
gc.collect()

2564

In [27]:
train1.shape

(60034810, 88)

In [28]:
train2.shape

(60034810, 27)

In [33]:
np.sum(train1.id == train2.id)

60034810

In [34]:
np.sum(train1.day_num == train2.day_num)

60034810

In [36]:
for i in error_fea:
    print(i)
    train1[i] = train2[i].copy()

rolling_sale_dayofweek_8_quantile25
rolling_sale_dayofweek_12_quantile25
rolling_sale_dayofweek_24_quantile25
rolling_sale_dayofweek_52_quantile25
rolling_sale_dayofweek_8_quantile75
rolling_sale_dayofweek_12_quantile75
rolling_sale_dayofweek_24_quantile75
rolling_sale_dayofweek_52_quantile75


In [None]:
train1.to_pickle('data_part1.pkl')

In [None]:
train2 = pd.read_pickle('data_part1.pkl')