In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit,StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold,train_test_split,GroupShuffleSplit,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,mean_squared_error,mean_absolute_error,log_loss,confusion_matrix
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
#from bayes_opt import BayesianOptimization
import re
from string import punctuation
from scipy.spatial import Voronoi
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
from tqdm.notebook import tqdm
from numba import jit
from collections import Counter
import json
import joblib
import multiprocessing
import time

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return the same frame.

    Only columns whose dtype is int32, int64 or float64 are touched.
    Integer columns move to the first of int16 / int32 / int64 whose range
    strictly contains the column's min and max. float64 columns become float32
    when their min and max lie strictly inside the float32 range, otherwise
    they stay float64.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    verbose : when true, print the final memory footprint and the saving.

    Returns
    -------
    The same ``df`` object.
    """
    numerics = ['int32', 'int64', 'float64']
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    # Tried from narrowest to widest; the first dtype that fits wins.
    int_targets = (np.int16, np.int32, np.int64)
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            for target in int_targets:
                bounds = np.iinfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            bounds = np.finfo(np.float32)
            fits_float32 = c_min > bounds.min and c_max < bounds.max
            df[col] = df[col].astype(np.float32 if fits_float32 else np.float64)
    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [3]:
# Load the four raw CSV tables from the shared ../data directory.
calendar = pd.read_csv('../data/calendar.csv')
sell_prices = pd.read_csv('../data/sell_prices.csv')
sales_train = pd.read_csv('../data/sales_train_validation.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [4]:
# Extend sales_train with empty (NaN) columns for days d_1914 .. d_1969 so that
# the later melt also produces rows for those days (presumably the forecast
# horizon -- confirm against the competition setup).
# The loop variable is deliberately not `_`: that name is IPython's
# "last output" variable and the original loop both used and clobbered it.
#sales_train = pd.read_csv('../data/sales_train_validation.csv')
future_day_cols = [f'd_{day}' for day in range(1914,1970)]
for day_col in future_day_cols:
    sales_train[day_col] = np.nan

In [5]:
# Reshape from wide (one column per day) to long: one row per series id and
# day label, with the day label in 'day_num' and the value in 'sale'.
sales_train_long_format = sales_train.melt(
    id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],
    var_name='day_num',
    value_name='sale',
)

In [6]:
def transform_day_to_num(str1):
    """Return the integer that follows the first two characters of ``str1``.

    Used on day labels such as ``'d_17'`` (-> 17); the two-character prefix is
    dropped without being checked.
    """
    day_digits = str1[2:]
    return int(day_digits)
# Prepare the calendar: real datetimes plus an integer day number per row.
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['day_num'] = calendar['d'].map(transform_day_to_num)

# Replace the 'd_<n>' labels in the long table by their integer day number.
sales_train_long_format['day_num'] = sales_train_long_format['day_num'].map(transform_day_to_num)

# day_num -> date lookup, used to stamp every sales row with its date.
map_day_date = calendar.set_index('day_num')['date']
sales_train_long_format['date'] = sales_train_long_format['day_num'].map(map_day_date)

# Calendar attributes carried over to each sales row (joined on day_num).
list1 = ['wm_yr_wk','event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA', 'snap_TX', 'snap_WI','day_num']
sales_train_long_format = pd.merge(
    sales_train_long_format,
    calendar[list1],
    how='left',
    on='day_num',
)

In [7]:
# Attach the sell price of each (store, item, week) to every sales row.
sales_train_long_format = pd.merge(
    sales_train_long_format,
    sell_prices,
    how='left',
    on=['store_id','item_id','wm_yr_wk'],
)
# Rows without a price are treated as "item not on sale": blank out their sales.
sales_train_long_format.loc[sales_train_long_format['sell_price'].isna(), 'sale'] = np.nan
sales_train_long_format = reduce_mem_usage(sales_train_long_format)

Mem. usage decreased to 6526.92 Mb (25.0% reduction)


### Part 2
- non-zero features
- cnt_zeros (count of zeros)
- non-zeros
- non-zero intervals (jiange) — already done

In [8]:
def _parall_group_apply(df, keys, func, n_cores, name_feature):
    """Run ``func`` on the 'sale' series of every ``keys`` group in a process pool.

    The groups are materialised once so that the progress bar total is the
    real number of groups rather than a hard-coded constant.
    """
    groups = [group for _, group in df.groupby(keys)['sale']]
    with multiprocessing.Pool(n_cores) as p:
        # imap yields results in submission order; list() drains it before the
        # pool is torn down on leaving the with-block.
        ret_list = list(tqdm(p.imap(func, groups), total=len(groups)))
    # Sorting by index puts the concatenated pieces back in index order
    # (this lines up with df's rows only if df's index is unique and sorted --
    # TODO confirm for the caller's frame).
    temp = pd.concat(ret_list,axis=0).sort_index()
    print(f'Features {name_feature} over!!')
    return temp

def parall_group_by_id_sale(df,func,n_cores,name_feature):
    """Apply ``func`` in parallel to the 'sale' series of each 'id' group.

    Parameters
    ----------
    df : DataFrame with at least the 'id' and 'sale' columns.
    func : callable taking one group's 'sale' Series and returning a Series;
        it is sent to worker processes, so it must be picklable.
    n_cores : number of worker processes in the pool.
    name_feature : label used only in the completion message.

    Returns
    -------
    The per-group results concatenated and sorted by index.
    """
    return _parall_group_apply(df, ['id'], func, n_cores, name_feature)

def parall_group_by_id_dayofweek_sale(df,func,n_cores,name_feature):
    """Apply ``func`` in parallel to the 'sale' series of each ('id', 'dayofweek') group.

    Same contract as ``parall_group_by_id_sale``; ``df`` additionally needs a
    'dayofweek' column.
    """
    return _parall_group_apply(df, ['id','dayofweek'], func, n_cores, name_feature)

In [9]:
def non_zero_features_mean(array,lag_days):
    """Mean of the last ``lag_days`` non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.mean(recent)

def non_zero_features_std(array,lag_days):
    """Population standard deviation (``np.std``) of the last ``lag_days``
    non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.std(recent)

def non_zero_features_quantile25(array,lag_days):
    """0.25 quantile of the last ``lag_days`` non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.quantile(recent, 0.25)

def non_zero_features_quantile75(array,lag_days):
    """0.75 quantile of the last ``lag_days`` non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.quantile(recent, 0.75)

def non_zero_features_min(array,lag_days):
    """Smallest of the last ``lag_days`` non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.min(recent)

def non_zero_features_max(array,lag_days):
    """Largest of the last ``lag_days`` non-zero, non-NaN entries of ``array``.

    Returns NaN for an empty input or when at most 3 such entries exist.
    """
    if len(array) == 0:
        return np.nan
    # Drop zeros and missing values in one pass.
    unusable = (array == 0) | pd.isna(array)
    non_zero = array[~unusable]
    if len(non_zero) <= 3:
        return np.nan
    recent = non_zero[-lag_days:]
    return np.max(recent)


In [10]:
# Per-id rolling statistics over up to the last 7 non-zero sales.
# Each function shifts the series by one row (so the current row is excluded),
# slides a 112-row window that needs at least 10 non-NaN rows, and passes each
# raw window plus the lag to the matching non_zero_features_* helper.
def rolling_id_non_zero_mean_7(x):
    """Rolling mean of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_mean, raw=True, args=(7,))

def rolling_id_non_zero_std_7(x):
    """Rolling std of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_std, raw=True, args=(7,))

def rolling_id_non_zero_quantile25_7(x):
    """Rolling 25% quantile of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile25, raw=True, args=(7,))

def rolling_id_non_zero_quantile75_7(x):
    """Rolling 75% quantile of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile75, raw=True, args=(7,))

def rolling_id_non_zero_max_7(x):
    """Rolling max of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_max, raw=True, args=(7,))

def rolling_id_non_zero_min_7(x):
    """Rolling min of the last 7 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_min, raw=True, args=(7,))

# Per-id rolling statistics over up to the last 14 non-zero sales.
# Same recipe as the other per-id families: shift by one row, 112-row window
# with at least 10 non-NaN rows, raw window handed to non_zero_features_*.
def rolling_id_non_zero_mean_14(x):
    """Rolling mean of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_mean, raw=True, args=(14,))

def rolling_id_non_zero_std_14(x):
    """Rolling std of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_std, raw=True, args=(14,))

def rolling_id_non_zero_quantile25_14(x):
    """Rolling 25% quantile of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile25, raw=True, args=(14,))

def rolling_id_non_zero_quantile75_14(x):
    """Rolling 75% quantile of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile75, raw=True, args=(14,))

def rolling_id_non_zero_max_14(x):
    """Rolling max of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_max, raw=True, args=(14,))

def rolling_id_non_zero_min_14(x):
    """Rolling min of the last 14 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_min, raw=True, args=(14,))

# Per-id rolling statistics over up to the last 28 non-zero sales.
# Same recipe as the other per-id families: shift by one row, 112-row window
# with at least 10 non-NaN rows, raw window handed to non_zero_features_*.
def rolling_id_non_zero_mean_28(x):
    """Rolling mean of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_mean, raw=True, args=(28,))

def rolling_id_non_zero_std_28(x):
    """Rolling std of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_std, raw=True, args=(28,))

def rolling_id_non_zero_quantile25_28(x):
    """Rolling 25% quantile of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile25, raw=True, args=(28,))

def rolling_id_non_zero_quantile75_28(x):
    """Rolling 75% quantile of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_quantile75, raw=True, args=(28,))

def rolling_id_non_zero_max_28(x):
    """Rolling max of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_max, raw=True, args=(28,))

def rolling_id_non_zero_min_28(x):
    """Rolling min of the last 28 non-zero values (112-row lagged window)."""
    return x.shift(1).rolling(112, 10).apply(non_zero_features_min, raw=True, args=(28,))

In [11]:
# Rolling statistics over up to the last 6 non-zero sales, meant for the
# per-(id, dayofweek) series built by parall_group_by_id_dayofweek_sale.
# Each function shifts by one row, slides a 56-row window that needs at least
# 10 non-NaN rows, and passes the raw window to non_zero_features_*.
def rolling_id_non_zero_dayofweek_mean_6(x):
    """Rolling mean of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_mean, raw=True, args=(6,))

def rolling_id_non_zero_dayofweek_std_6(x):
    """Rolling std of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_std, raw=True, args=(6,))

def rolling_id_non_zero_dayofweek_quantile25_6(x):
    """Rolling 25% quantile of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile25, raw=True, args=(6,))

def rolling_id_non_zero_dayofweek_quantile75_6(x):
    """Rolling 75% quantile of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile75, raw=True, args=(6,))

def rolling_id_non_zero_dayofweek_max_6(x):
    """Rolling max of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_max, raw=True, args=(6,))

def rolling_id_non_zero_dayofweek_min_6(x):
    """Rolling min of the last 6 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_min, raw=True, args=(6,))

# Rolling statistics over up to the last 12 non-zero sales for the
# per-(id, dayofweek) series: shift by one row, 56-row window with at least
# 10 non-NaN rows, raw window handed to non_zero_features_*.
def rolling_id_non_zero_dayofweek_mean_12(x):
    """Rolling mean of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_mean, raw=True, args=(12,))

def rolling_id_non_zero_dayofweek_std_12(x):
    """Rolling std of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_std, raw=True, args=(12,))

def rolling_id_non_zero_dayofweek_quantile25_12(x):
    """Rolling 25% quantile of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile25, raw=True, args=(12,))

def rolling_id_non_zero_dayofweek_quantile75_12(x):
    """Rolling 75% quantile of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile75, raw=True, args=(12,))

def rolling_id_non_zero_dayofweek_max_12(x):
    """Rolling max of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_max, raw=True, args=(12,))

def rolling_id_non_zero_dayofweek_min_12(x):
    """Rolling min of the last 12 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_min, raw=True, args=(12,))

# Rolling statistics over up to the last 18 non-zero sales for the
# per-(id, dayofweek) series: shift by one row, 56-row window with at least
# 10 non-NaN rows, raw window handed to non_zero_features_*.
def rolling_id_non_zero_dayofweek_mean_18(x):
    """Rolling mean of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_mean, raw=True, args=(18,))

def rolling_id_non_zero_dayofweek_std_18(x):
    """Rolling std of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_std, raw=True, args=(18,))

def rolling_id_non_zero_dayofweek_quantile25_18(x):
    """Rolling 25% quantile of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile25, raw=True, args=(18,))

def rolling_id_non_zero_dayofweek_quantile75_18(x):
    """Rolling 75% quantile of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_quantile75, raw=True, args=(18,))

def rolling_id_non_zero_dayofweek_max_18(x):
    """Rolling max of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_max, raw=True, args=(18,))

def rolling_id_non_zero_dayofweek_min_18(x):
    """Rolling min of the last 18 non-zero values (56-row lagged window)."""
    return x.shift(1).rolling(56, 10).apply(non_zero_features_min, raw=True, args=(18,))

In [12]:
def zero_transform_ratio(array):
    """Share of consecutive pairs in ``array`` that switch between zero and positive.

    NaN entries are removed first, so pairs are formed between neighbouring
    non-NaN values. A pair counts when it goes 0 -> positive or positive -> 0.

    Parameters
    ----------
    array : 1-D numpy array of numbers, possibly containing NaN.

    Returns
    -------
    float in [0, 1], or NaN when fewer than two non-NaN values remain
    (no pair can be formed).
    """
    array = array[~pd.isna(array)]
    # With 0 or 1 observations there is no pair to compare; returning early
    # avoids numpy's "Mean of empty slice" warning on an empty comparison.
    if len(array) < 2:
        return np.nan
    previous, following = array[:-1], array[1:]
    became_positive = (previous == 0) & (following > 0)
    became_zero = (previous > 0) & (following == 0)
    return np.mean(became_positive | became_zero)
# Rolling zero/positive switch ratio: the series is shifted by one row and
# zero_transform_ratio is evaluated on each raw window.
# (window, min_periods) are (7, 3), (14, 6), (28, 10) and (56, 10).
# NOTE(review): the 56-row variant uses min_periods=10 while
# rolling_sale_ratio_zero_56 uses 21 -- confirm this difference is intended.
def rolling_sale_zero_trans_7(x):
    """zero_transform_ratio over a lagged 7-row window (min 3 observations)."""
    return x.shift(1).rolling(7, 3).apply(zero_transform_ratio, raw=True)

def rolling_sale_zero_trans_14(x):
    """zero_transform_ratio over a lagged 14-row window (min 6 observations)."""
    return x.shift(1).rolling(14, 6).apply(zero_transform_ratio, raw=True)

def rolling_sale_zero_trans_28(x):
    """zero_transform_ratio over a lagged 28-row window (min 10 observations)."""
    return x.shift(1).rolling(28, 10).apply(zero_transform_ratio, raw=True)

def rolling_sale_zero_trans_56(x):
    """zero_transform_ratio over a lagged 56-row window (min 10 observations)."""
    return x.shift(1).rolling(56, 10).apply(zero_transform_ratio, raw=True)


In [13]:
def cnt_zero_ratio(array):
    """Fraction of the non-NaN entries of ``array`` that are exactly zero.

    Returns NaN when no non-NaN entry is left.
    """
    observed = array[~pd.isna(array)]
    if len(observed) == 0:
        return np.nan
    is_zero = observed == 0
    return np.mean(is_zero)
# Rolling share of zero sales: the series is shifted by one row and
# cnt_zero_ratio is evaluated on each raw window.
# (window, min_periods) are (7, 3), (14, 6), (28, 10) and (56, 21).
def rolling_sale_ratio_zero_7(x):
    """cnt_zero_ratio over a lagged 7-row window (min 3 observations)."""
    return x.shift(1).rolling(7, 3).apply(cnt_zero_ratio, raw=True)

def rolling_sale_ratio_zero_14(x):
    """cnt_zero_ratio over a lagged 14-row window (min 6 observations)."""
    return x.shift(1).rolling(14, 6).apply(cnt_zero_ratio, raw=True)

def rolling_sale_ratio_zero_28(x):
    """cnt_zero_ratio over a lagged 28-row window (min 10 observations)."""
    return x.shift(1).rolling(28, 10).apply(cnt_zero_ratio, raw=True)

def rolling_sale_ratio_zero_56(x):
    """cnt_zero_ratio over a lagged 56-row window (min 21 observations)."""
    return x.shift(1).rolling(56, 21).apply(cnt_zero_ratio, raw=True)


In [14]:
def feature_engineer(df,num_core,date1):
    """
    Add calendar parts and rolling non-zero-sale statistics to a long-format frame.

    df format:sales_train_long_format
    (the 'date' column is read here; 'id' and 'sale' are read by the helpers).

    Parameters
    ----------
    df : DataFrame; it is copied first, so the caller's frame is left untouched.
    num_core : number of worker processes passed to the parall_group_by_* helpers.
    date1 : not used anywhere in this function; kept so existing calls still work.

    Returns
    -------
    The copy with 'year', 'month', 'dayofweek' and 24 rolling feature columns
    (each named after the function that produced it), downcast by reduce_mem_usage.

    Feature families defined in this notebook but currently switched off here:
    the per-id lag-7 statistics, the per-(id, dayofweek) lag-18 statistics,
    rolling_sale_zero_trans_* and rolling_sale_ratio_zero_*.
    """
    df = df.copy()
    dates = df['date']
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['dayofweek'] = dates.dt.dayofweek

    per_id_lag14 = (
        rolling_id_non_zero_mean_14,
        rolling_id_non_zero_std_14,
        rolling_id_non_zero_quantile25_14,
        rolling_id_non_zero_quantile75_14,
        rolling_id_non_zero_max_14,
        rolling_id_non_zero_min_14,
    )
    per_id_lag28 = (
        rolling_id_non_zero_mean_28,
        rolling_id_non_zero_std_28,
        rolling_id_non_zero_quantile25_28,
        rolling_id_non_zero_quantile75_28,
        rolling_id_non_zero_max_28,
        rolling_id_non_zero_min_28,
    )
    per_weekday_lag6 = (
        rolling_id_non_zero_dayofweek_mean_6,
        rolling_id_non_zero_dayofweek_std_6,
        rolling_id_non_zero_dayofweek_quantile25_6,
        rolling_id_non_zero_dayofweek_quantile75_6,
        rolling_id_non_zero_dayofweek_max_6,
        rolling_id_non_zero_dayofweek_min_6,
    )
    per_weekday_lag12 = (
        rolling_id_non_zero_dayofweek_mean_12,
        rolling_id_non_zero_dayofweek_std_12,
        rolling_id_non_zero_dayofweek_quantile25_12,
        rolling_id_non_zero_dayofweek_quantile75_12,
        rolling_id_non_zero_dayofweek_max_12,
        rolling_id_non_zero_dayofweek_min_12,
    )

    def add_features(frame, group_runner, feature_funcs):
        # One new column per function; the column (and the progress label)
        # carries the function's own name.
        for feature_func in feature_funcs:
            column = feature_func.__name__
            frame[column] = group_runner(frame, feature_func, num_core, column)

    add_features(df, parall_group_by_id_sale, per_id_lag14)
    # Downcast between batches to keep the frame's footprint down.
    df = reduce_mem_usage(df,False)
    add_features(df, parall_group_by_id_sale, per_id_lag28)

    add_features(df, parall_group_by_id_dayofweek_sale, per_weekday_lag6)
    df = reduce_mem_usage(df,False)
    add_features(df, parall_group_by_id_dayofweek_sale, per_weekday_lag12)

    df = reduce_mem_usage(df,False)
    return df

In [15]:
# Progress marker, then build all rolling features with 8 worker processes.
print(1)
train1 = feature_engineer(df=sales_train_long_format, num_core=8, date1=None)

1


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_mean_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_std_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_quantile25_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_quantile75_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_max_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_min_14 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Features rolling_id_non_zero_mean_28 over!!


HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))

Process ForkPoolWorker-64:
Process ForkPoolWorker-61:
Process ForkPoolWorker-62:
Process ForkPoolWorker-58:
Process ForkPoolWorker-59:
Process ForkPoolWorker-60:
Process ForkPoolWorker-57:
Process ForkPoolWorker-63:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-10-3a77ddf5cb63>", line 30, in rolling_id_non_zero_std_28
    return x.shift(1).rolling(112,10).apply(lambda x:non_zero_features_std(x,28),raw=True)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/window.py", line 1703, in apply
    func, raw=raw, args=args, kwargs=kwargs)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/window.py", line 1012, in apply
    c




  File "/opt/conda/lib/python3.7/site-packages/pandas/core/window.py", line 880, in _apply
    result = calc(values)
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/window.py", line 874, in calc
    closed=self.closed)
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-10-3a77ddf5cb63>", line 30, in rolling_id_non_zero_std_28
    return x.shift(1).rolling(112,10).apply(lambda x:non_zero_features_std(x,28),raw=True)
  File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*a

  File "/opt/conda/lib/python3.7/site-packages/numpy/core/fromnumeric.py", line 3242, in std
    **kwargs)
  File "<ipython-input-10-3a77ddf5cb63>", line 30, in <lambda>
    return x.shift(1).rolling(112,10).apply(lambda x:non_zero_features_std(x,28),raw=True)
  File "<ipython-input-9-ad84adc2329d>", line 15, in non_zero_features_std
    return np.std(array[-lag_days:])
  File "/opt/conda/lib/python3.7/site-packages/numpy/core/fromnumeric.py", line 3242, in std
    **kwargs)
  File "<ipython-input-9-ad84adc2329d>", line 15, in non_zero_features_std
    return np.std(array[-lag_days:])
  File "/opt/conda/lib/python3.7/site-packages/numpy/core/_methods.py", line 140, in _std
    keepdims=keepdims)
  File "/opt/conda/lib/python3.7/site-packages/numpy/core/fromnumeric.py", line 3242, in std
    **kwargs)
  File "<ipython-input-9-ad84adc2329d>", line 15, in non_zero_features_std
    return np.std(array[-lag_days:])
  File "/opt/conda/lib/python3.7/site-packages/numpy/core/_methods.py", line

KeyboardInterrupt: 

In [None]:
# Persist the engineered frame in the working directory for later notebooks.
pd.to_pickle(train1, 'data_part3.pkl')