In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler
# import os
# print(os.getcwd())

from tqdm.auto import tqdm
import lightgbm as lgb

In [3]:
# Load the competition files and tag every row with its origin
# (train=1 for the training file, train=0 for the test file).
train = pd.read_csv('../input/sales-predict/train.csv').assign(train=1)
test = pd.read_csv('../input/sales-predict/test.csv').assign(train=0)
sample_submit = pd.read_csv('../input/sales-predict/sample_submit.csv')

# Stack both files row-wise; each frame keeps its own row labels.
df = pd.concat([train, test])
print(train.shape, test.shape, df.shape)

In [4]:
# Build `df2`: same rows as `df`, but with missing item prices replaced by
# the median price observed for the same item_id.
df2 = df.copy()
group = (
    df2.groupby('item_id', as_index=False)['item_price']
       .median()
       .rename(columns={'item_price': 'price_median'})
)
df2 = df2.merge(group, on='item_id', how='left')
# pop() hands back the helper column and removes it from df2 in one step.
df2['item_price'] = df2['item_price'].fillna(df2.pop('price_median'))
df2

In [5]:
def lag_feature_adv(df, lags, col, time_col='week', group_cols=('shop_id', 'item_id')):
    """Append lagged copies of ``col`` as new ``<col>_lag_<n>`` columns.

    For every ``n`` in ``lags``, the value of ``col`` recorded ``n`` periods
    earlier for the same ``group_cols`` key is left-joined onto each row.
    Rows without a matching earlier record get NaN in the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_col``, every column in ``group_cols`` and ``col``.
    lags : iterable of int
        Offsets (in units of ``time_col``) to look back.
    col : str
        Name of the column to lag.
    time_col : str, default 'week'
        Column holding the period number.
    group_cols : sequence of str, default ('shop_id', 'item_id')
        Columns that, together with ``time_col``, identify a record.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with one extra column per lag.
        Because it is produced by ``merge``, it carries a fresh 0..n-1 index.
    """
    keys = [time_col, *group_cols]
    base = df[keys + [col]]
    for lag in lags:
        lag_col = f'{col}_lag_{lag}'
        shifted = base.copy()
        # Moving the period forward by `lag` lines each record up with the
        # row that should see it as "the value `lag` periods ago".
        shifted[time_col] += lag
        shifted = shifted.rename(columns={col: lag_col})
        df = df.merge(shifted, on=keys, how='left')
    return df
# Add 1- to 4-week sales lags to both frames.
# NOTE: `df` / `df2` are rebound here, so re-running this cell on the
# already-lagged frames would merge the lag columns in a second time.
df, df2 = (
    lag_feature_adv(frame, [1, 2, 3, 4], 'weekly_sales')
    for frame in (df, df2)
)

df

In [20]:
# Training window: rows whose week is strictly greater than 3 and
# strictly less than 32, re-indexed from zero.
train = df.loc[df['week'].gt(3) & df['week'].lt(32)].reset_index(drop=True)
train2 = df2.loc[df2['week'].gt(3) & df2['week'].lt(32)].reset_index(drop=True)

train

In [7]:
# Hold-out window: rows for week 32 only, re-indexed from zero.
test = df.loc[df['week'].eq(32)].reset_index(drop=True)
test2 = df2.loc[df2['week'].eq(32)].reset_index(drop=True)

test

In [8]:
seed0 = 2021

# LightGBM settings for the baseline regressor.
# Disabled overrides kept for reference (none of these are active):
#   max_depth=-1, max_bin=100, min_data_in_leaf=500, subsample=0.72,
#   subsample_freq=4, feature_fraction=0.5, lambda_l1=0.5, lambda_l2=1.0,
#   device='gpu'
params0 = dict(
    objective='mse',
    boosting_type='gbdt',
    learning_rate=0.05,
    # every seed-type entry is set to the same value
    **{
        key: seed0
        for key in (
            'seed',
            'feature_fraction_seed',
            'bagging_seed',
            'drop_seed',
            'data_random_seed',
        )
    },
    n_jobs=-1,
    verbose=-1,
)


In [9]:
def train_and_evaluate_lgb(train, test, params, n_splits=5, seed=2021):
    """Train LightGBM with shuffled K-fold CV and average the test predictions.

    Every column of ``train`` except ``'train'`` and ``'weekly_sales'`` is used
    as a feature; ``'weekly_sales'`` is the target. One model is trained per
    fold with early stopping on that fold's validation split.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain ``'weekly_sales'`` and the categorical
        columns ``'shop_id'``, ``'item_id'`` and ``'item_category_id'``.
    test : pandas.DataFrame
        Rows to predict; must contain the same feature columns as ``train``.
    params : dict
        Parameters forwarded to ``lgb.train``.
    n_splits : int, default 5
        Number of CV folds. Also used as the divisor when averaging the
        per-fold test predictions, so the two can no longer drift apart.
    seed : int, default 2021
        ``random_state`` of the shuffled ``KFold``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``test_predictions`` (mean over folds, one value per ``test`` row) and
        ``oof_predictions`` (out-of-fold prediction for every ``train`` row).

    Side effects: prints per-fold progress and the out-of-fold MSE, and draws
    a feature-importance plot for the model of the LAST fold only.
    """
    features = [col for col in train.columns if col not in {'train', 'weekly_sales'}]
    y = train['weekly_sales']
    # Out-of-fold predictions for the training rows
    oof_predictions = np.zeros(train.shape[0])
    # Running mean of the per-fold predictions for the test rows
    test_predictions = np.zeros(test.shape[0])

    kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = train.iloc[trn_ind], train.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        train_dataset = lgb.Dataset(x_train[features], y_train)
        val_dataset = lgb.Dataset(x_val[features], y_val)
        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; LightGBM 4.x expects
        # callbacks instead -- confirm against the installed version.
        model = lgb.train(params=params,
                          num_boost_round=10000,
                          train_set=train_dataset,
                          valid_sets=[train_dataset, val_dataset],
                          categorical_feature=['shop_id', 'item_id', 'item_category_id'],
                          verbose_eval=200,
                          early_stopping_rounds=100)
        # Store this fold's validation predictions in their original positions
        oof_predictions[val_ind] = model.predict(x_val[features])
        # Each fold contributes an equal share to the final test prediction
        test_predictions += model.predict(test[features]) / n_splits
    mse_score = mean_squared_error(y, oof_predictions)
    print(f'Our out of folds MSE is {mse_score}')
    # `model` is whatever the final loop iteration left behind
    lgb.plot_importance(model, max_num_features=20)
    return test_predictions, oof_predictions
# Train and evaluate the LightGBM baseline
test_pred_lgb1, valid_pred_lgb1 = train_and_evaluate_lgb(
    train=train,
    test=test,
    params=params0,
)

In [10]:
# Hold-out MSE of the LightGBM baseline. Arguments follow sklearn's
# (y_true, y_pred) order, matching the other metric cells in this notebook.
mean_squared_error(test['weekly_sales'], test_pred_lgb1)

# TabNet

In [11]:
!pip install pytorch-tabnet

In [12]:
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

In [13]:
# Model inputs: every column except the origin flag and the target.
features = [
    name
    for name in train.columns
    if name not in ('train', 'weekly_sales')
]
features

In [21]:
# Feature / target split for TabNet, taken from the price-imputed frames.
X, y = train2.drop(columns=['train', 'weekly_sales']), train2['weekly_sales']
X_test, y_test = test2.drop(columns=['train', 'weekly_sales']), test2['weekly_sales']
X

In [22]:
# Prepare the TabNet inputs: integer-encode the ID columns (their positions and
# class counts are passed to TabNet through cat_idxs / cat_dims) and
# standardise every other column with statistics from the training rows.
categorical_columns = []
categorical_dims = {}
for col in X.columns:
    if col in ['shop_id', 'item_id', 'item_category_id']:
        l_enc = LabelEncoder()
        # Fit on training and hold-out values together: an encoder fitted on
        # the training rows alone makes transform() raise ValueError for any
        # ID that occurs only in the hold-out rows.
        l_enc.fit(np.concatenate([X[col].values, X_test[col].values]))
        X[col] = l_enc.transform(X[col].values)
        X_test[col] = l_enc.transform(X_test[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # NOTE(review): StandardScaler keeps NaN as NaN; the lag columns come
        # from left merges and may contain NaN -- confirm none reach TabNet.
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1)).ravel()
        X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1)).ravel()

# Column positions and class counts of the categorical features, in column order.
cat_idxs = [i for i, f in enumerate(X.columns) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in X.columns if f in categorical_columns]

In [23]:
# Show, for each categorical feature, its number of classes and its column position.
cat_dims,cat_idxs

In [24]:
# Inspect the feature frame after label encoding and scaling.
X

In [37]:
# Constructor arguments shared by the TabNetRegressor of every fold.
# Disabled overrides kept for reference: pretraining_ratio=0.8, device_name='cpu'.
tabnet_params = {
    # categorical feature layout produced by the encoding cell
    'cat_idxs': cat_idxs,
    'cat_dims': cat_dims,
    'cat_emb_dim': 1,
    # network size
    'n_d': 16,
    'n_a': 16,
    'n_steps': 2,
    'gamma': 2,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 0,
    # optimisation
    'optimizer_fn': Adam,
    'optimizer_params': {'lr': 2e-2},
    'mask_type': 'entmax',  # alternative: 'sparsemax'
    'scheduler_params': {
        'T_0': 200,
        'T_mult': 1,
        'eta_min': 1e-4,
        'last_epoch': -1,
        'verbose': False,
    },
    'scheduler_fn': CosineAnnealingWarmRestarts,
    'seed': 42,
    'verbose': 1,
}

In [43]:
# # TabNetPretrainer
# unsupervised_model = TabNetPretrainer(
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
    
#     mask_type='entmax' # "sparsemax"
# )

# unsupervised_model.fit(
#     X_train=X,
#     eval_set=[X_test],
#     pretraining_ratio=0.8,
# )

In [38]:
# K-fold cross-validated TabNet training.
# Per fold: fit on the training split with early stopping on the validation
# split, save the model to ./fold<k>, collect explanation matrices / masks,
# store out-of-fold predictions and add this fold's share to the averaged
# hold-out prediction.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
# Single source of truth for the averaging divisor used below.
n_folds = kfold.get_n_splits()

# Out-of-fold predictions, one row per training sample
oof_predictions = np.zeros((X.shape[0], 1))
test_predictions_tabnet = np.zeros(X_test.shape[0])
feature_importances = pd.DataFrame()
feature_importances["feature"] = X.columns.tolist()
explain_matrices = []
masks_ = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X)):
    print(f'Training fold {fold + 1}')
    X_train, X_val = X.iloc[trn_ind].values, X.iloc[val_ind].values
    # Targets are reshaped to a single column before being passed to fit()
    y_train = y.iloc[trn_ind].values.reshape(-1, 1)
    y_val = y.iloc[val_ind].values.reshape(-1, 1)

    clf = TabNetRegressor(**tabnet_params)
    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        max_epochs=300,
        patience=20,
        batch_size=1024 * 10,
        virtual_batch_size=128 * 10,
        drop_last=False,
        eval_metric=['mse'],
    )

    saving_path_name = f"./fold{fold}"
    saved_filepath = clf.save_model(saving_path_name)

    explain_matrix, masks = clf.explain(X_val)
    explain_matrices.append(explain_matrix)
    # One mask per decision step; follow the configured step count instead of
    # hard-coding indices 0 and 1.
    masks_.extend(masks[step] for step in range(tabnet_params['n_steps']))

    oof_predictions[val_ind] = clf.predict(X_val)
    test_predictions_tabnet += clf.predict(X_test.values).flatten() / n_folds
    # Fixed: "+ 1" now sits inside the braces, so the columns are named
    # importance_fold1..N instead of the literal "importance_fold0+1".
    feature_importances[f"importance_fold{fold + 1}"] = clf.feature_importances_
    

In [39]:
# Out-of-fold MSE of TabNet over all training rows.
print(f'OOF score across folds: {mean_squared_error(y, oof_predictions.ravel())}')

In [40]:
# Fold-averaged TabNet predictions for the hold-out rows.
test_predictions_tabnet

In [42]:
# Hold-out MSE of the fold-averaged TabNet predictions.
mean_squared_error(y_true=y_test, y_pred=test_predictions_tabnet)