In [4]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [5]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [6]:
pd.options.display.max_columns = 50

In [7]:
h = 28 
max_lags = 57

tr_last = 1913 + 28

fday = datetime(2016,4, 25) + timedelta(days= 28)
fday,tr_last,max_lags

(datetime.datetime(2016, 5, 23, 0, 0), 1941, 57)

In [21]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    prices = pd.read_csv("C://Users/acherif003/Desktop/tp5/sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv("C://Users/acherif003/Desktop/tp5/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})

    dt = pd.read_csv("C://Users/acherif003/Desktop/tp5/sales_train_evaluation.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt

In [22]:
def create_fea(dt):
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }
        
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [23]:
FIRST_DAY = 350

In [26]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

Wall time: 30.1 s


(41571939, 22)

In [27]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.5
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 22 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory us

In [29]:
%%time

create_fea(df)
df.shape

  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


Wall time: 1min 55s


(41571939, 31)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 31 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
 22  lag_7         float32       
 23  lag_28        float32       
 

In [31]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.5,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77,,,,,,,2,1,13


In [32]:
df.dropna(inplace = True)
df.shape

(39894989, 31)

In [33]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[train_cols]
y_train = df["sales"]

In [35]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)

Wall time: 21.2 s


In [36]:
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

25

In [37]:
params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [38]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[LightGBM] [Info] Total Bins 4602
[LightGBM] [Info] Number of data points in the train set: 37894989, number of used features: 25




[LightGBM] [Info] Start training from score 0.305582
[20]	valid_0's rmse: 2.95888
[40]	valid_0's rmse: 2.63236
[60]	valid_0's rmse: 2.54629
[80]	valid_0's rmse: 2.52169
[100]	valid_0's rmse: 2.51112
[120]	valid_0's rmse: 2.50086
[140]	valid_0's rmse: 2.49124
[160]	valid_0's rmse: 2.48516
[180]	valid_0's rmse: 2.4768
[200]	valid_0's rmse: 2.47117
[220]	valid_0's rmse: 2.46399
[240]	valid_0's rmse: 2.45845
[260]	valid_0's rmse: 2.45222
[280]	valid_0's rmse: 2.44791
[300]	valid_0's rmse: 2.44403
[320]	valid_0's rmse: 2.43979
[340]	valid_0's rmse: 2.4361
[360]	valid_0's rmse: 2.43245
[380]	valid_0's rmse: 2.42863
[400]	valid_0's rmse: 2.42423
[420]	valid_0's rmse: 2.42244
[440]	valid_0's rmse: 2.41935
[460]	valid_0's rmse: 2.41715
[480]	valid_0's rmse: 2.41414
[500]	valid_0's rmse: 2.41136
[520]	valid_0's rmse: 2.40997
[540]	valid_0's rmse: 2.40803
[560]	valid_0's rmse: 2.40701
[580]	valid_0's rmse: 2.4053
[600]	valid_0's rmse: 2.40375
[620]	valid_0's rmse: 2.40108
[640]	valid_0's rmse: 2.

In [39]:
m_lgb.save_model("model.lgb")

<lightgbm.basic.Booster at 0x2442146f2b0>

In [43]:
%%time

alphas = [1.023, 1.018, 1.013] 
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst)

    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

sub2 = pd.read_csv("C://Users/acherif003/Desktop/tp5/sales_train_evaluation.csv", usecols = ["id"]+ [f"d_{i}" for i in range(1914, 1914+28)])
sub2.rename(columns = {f"d_{i}": f'F{i-1913}' for i in range(1914, 1914+28)}, inplace=True)
sub2["id"] = sub2["id"].str.replace("evaluation", "validation")

sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("Output.csv",index=False)
print(sub.shape)

0 2016-05-23 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


1 2016-05-24 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


2 2016-05-25 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


3 2016-05-26 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


4 2016-05-27 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


5 2016-05-28 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


6 2016-05-29 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


7 2016-05-30 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


8 2016-05-31 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


9 2016-06-01 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


10 2016-06-02 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


11 2016-06-03 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


12 2016-06-04 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


13 2016-06-05 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


14 2016-06-06 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


15 2016-06-07 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


16 2016-06-08 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


17 2016-06-09 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


18 2016-06-10 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


19 2016-06-11 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


20 2016-06-12 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


21 2016-06-13 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


22 2016-06-14 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


23 2016-06-15 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


24 2016-06-16 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


25 2016-06-17 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


26 2016-06-18 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


27 2016-06-19 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


0 1.023 0.3333333333333333
0 2016-05-23 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


1 2016-05-24 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


2 2016-05-25 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


3 2016-05-26 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


4 2016-05-27 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


5 2016-05-28 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


6 2016-05-29 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


7 2016-05-30 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


8 2016-05-31 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


9 2016-06-01 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


10 2016-06-02 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


11 2016-06-03 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


12 2016-06-04 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


13 2016-06-05 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


14 2016-06-06 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


15 2016-06-07 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


16 2016-06-08 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


17 2016-06-09 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


18 2016-06-10 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


19 2016-06-11 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


20 2016-06-12 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


21 2016-06-13 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


22 2016-06-14 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


23 2016-06-15 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


24 2016-06-16 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


25 2016-06-17 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


26 2016-06-18 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


27 2016-06-19 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


1 1.018 0.3333333333333333
0 2016-05-23 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


1 2016-05-24 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


2 2016-05-25 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


3 2016-05-26 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


4 2016-05-27 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


5 2016-05-28 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


6 2016-05-29 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


7 2016-05-30 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


8 2016-05-31 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


9 2016-06-01 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


10 2016-06-02 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


11 2016-06-03 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


12 2016-06-04 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


13 2016-06-05 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


14 2016-06-06 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


15 2016-06-07 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


16 2016-06-08 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


17 2016-06-09 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


18 2016-06-10 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


19 2016-06-11 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


20 2016-06-12 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


21 2016-06-13 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


22 2016-06-14 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


23 2016-06-15 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


24 2016-06-16 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


25 2016-06-17 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


26 2016-06-18 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


27 2016-06-19 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


2 1.013 0.3333333333333333
(60980, 29)
Wall time: 44min 24s


In [44]:
sub.head(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,0.8659,0.779055,0.769768,0.840996,0.89504,1.096308,0.955326,0.829218,0.789299,0.814155,0.827505,1.056479,1.173147,1.103428,0.826162,0.796831,0.791655,0.805542,0.952256,1.142077,1.085972,0.790117,0.754593,0.763986,0.764514,0.866599,1.093809,0.991384
1,FOODS_1_001_CA_2_evaluation,0.793123,0.808093,0.724637,0.703203,0.906018,1.238061,0.901941,0.912592,0.774597,0.864552,0.928318,1.10941,1.229035,1.147578,0.916493,0.922017,0.915004,0.927963,1.086141,1.321644,1.125077,0.796598,0.792116,0.800336,0.757669,0.905769,1.187528,1.084849
2,FOODS_1_001_CA_3_evaluation,0.832919,0.790864,0.772218,0.724634,1.109491,1.258044,1.252173,0.976988,0.957538,0.96967,0.988214,1.138735,1.244104,1.241635,0.989509,0.9928,0.974018,0.980863,1.08186,1.345778,1.256159,1.00829,1.159358,1.178789,0.953808,0.986663,1.223144,1.221938
3,FOODS_1_001_CA_4_evaluation,0.298126,0.295922,0.316381,0.334413,0.345907,0.374495,0.333168,0.298876,0.321787,0.331031,0.329837,0.384783,0.382452,0.381945,0.339943,0.327976,0.339271,0.359506,0.376145,0.415718,0.421917,0.337733,0.33755,0.356003,0.364019,0.369468,0.392044,0.362384
4,FOODS_1_001_TX_1_evaluation,0.484101,0.686303,0.461622,0.571332,0.679852,0.73984,0.744418,0.611777,0.540403,0.691124,0.663461,0.760012,0.86711,0.810766,0.670739,0.671556,0.66881,0.730181,0.766826,0.948054,0.864947,0.709272,0.699444,0.702335,0.698409,0.760584,0.936244,0.895738
5,FOODS_1_001_TX_2_evaluation,0.401292,0.386367,0.381627,0.340092,0.406053,0.480342,0.457993,0.372534,0.383351,0.456865,0.43989,0.56539,0.581329,0.502756,0.426738,0.420675,0.412262,0.444567,0.47211,0.585332,0.496193,0.433239,0.414244,0.407627,0.407409,0.458267,0.526023,0.457287
6,FOODS_1_001_TX_3_evaluation,0.420917,0.385279,0.433967,0.429826,0.484719,0.516253,0.445542,0.390729,0.404836,0.522198,0.508594,0.597796,0.584945,0.55084,0.447284,0.447931,0.471883,0.466875,0.510211,0.573242,0.523884,0.444768,0.434753,0.465112,0.417649,0.468962,0.525011,0.458974
7,FOODS_1_001_WI_1_evaluation,0.483635,0.462978,0.469422,0.47091,0.561769,0.629048,0.548744,0.456479,0.473542,0.521089,0.528193,0.656301,0.690126,0.632182,0.483312,0.505332,0.51029,0.524832,0.633027,0.766535,0.692904,0.521105,0.505051,0.517021,0.530618,0.614515,0.659239,0.547168
8,FOODS_1_001_WI_2_evaluation,0.432714,0.441159,0.613764,0.553207,0.69241,0.749952,0.71234,0.53621,0.584919,0.729156,0.790405,1.017887,0.970879,0.846698,0.757457,0.683392,0.780547,0.804006,0.78888,0.881303,0.803928,0.656911,0.764046,0.799845,0.702475,0.758308,0.833904,0.769908
9,FOODS_1_001_WI_3_evaluation,0.278774,0.280854,0.284412,0.308033,0.352317,0.370674,0.324702,0.274988,0.30116,0.34543,0.337359,0.446665,0.429557,0.387248,0.316527,0.302452,0.327821,0.334349,0.37514,0.430318,0.393836,0.313082,0.33702,0.364711,0.339579,0.365402,0.384073,0.312037


In [45]:
sub.id.nunique(), sub["id"].str.contains("evaluation$").sum()

(60980, 30490)