In [1]:
!pip install statsmodels catboost xgboost lightgbm -q

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.contrib import tzip
from tqdm.auto import tqdm
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from catboost import Pool, cv, CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings
import joblib
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
tqdm.pandas()
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Load data

In [3]:
# Competition inputs: the training table (Date, Company_ID, Product_ID and Target
# are the columns used below) and the submission template (Id, Target).
train_data = pd.read_csv('/kaggle/input/ioai-contest-1/train.csv')
sample_sub = pd.read_csv('/kaggle/input/ioai-contest-1/sample_submission.csv')

In [4]:
# Inspect the submission template; each Id is "<company>_<product>_<YYYY-MM>".
sample_sub

Unnamed: 0,Id,Target
0,0_1_2024-01,0
1,0_2_2024-01,0
2,0_3_2024-01,0
3,0_5_2024-01,0
4,0_6_2024-01,0
...,...,...
113851,3_14664_2024-03,0
113852,3_14665_2024-03,0
113853,3_14666_2024-03,0
113854,3_14667_2024-03,0


# Data preprocessing

We can't convert `Date` to a pandas datetime because of 29 February 😭, so instead we split it into `year` (years since 2019) and `month` (zero-based) and build a running month index, `all_date = year * 12 + month`.

In [5]:
# Split 'Date' on '-' once and derive zero-based calendar features from its
# first two components: years since 2019 and month of year (0 = January).
_date_parts = train_data['Date'].str.split('-')
train_data['year'] = _date_parts.str[0].astype(int) - 2019
train_data['month'] = _date_parts.str[1].astype(int) - 1
# Running month counter: 0 is 2019-01, 12 is 2020-01, and so on.
train_data['all_date'] = train_data['year'] * 12 + train_data['month']

## Train and val split

In [6]:
def prepare_data(sample_submit):
    """Expand submission Ids into the key and calendar columns used for training.

    Each ``Id`` is split on ``'_'`` into company, product and date; the date is
    then split on ``'-'`` into year and month.

    Parameters
    ----------
    sample_submit : pandas.DataFrame
        Frame with a string ``Id`` column shaped like ``"0_1_2024-01"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_ID``, ``Product_ID``, ``Date``, ``year`` (years since
        2019), ``month`` (zero-based) and ``all_date`` (``year * 12 + month``),
        on the same index as ``sample_submit``.
    """
    id_parts = sample_submit['Id'].str.split('_')
    date = id_parts.str[2]
    date_parts = date.str.split('-')

    year = date_parts.str[0].astype(int) - 2019
    month = date_parts.str[1].astype(int) - 1

    return pd.DataFrame({
        'Company_ID': id_parts.str[0].astype(int),
        'Product_ID': id_parts.str[1].astype(int),
        'Date': date,
        'year': year,
        'month': month,
        'all_date': year * 12 + month,
    })

# Month indices (all_date) of the two labelled windows: with
# all_date = (year - 2019) * 12 + (month - 1), 54..56 is 2023-07..2023-09
# and 57..59 is 2023-10..2023-12.
valid_days = [57,58,59]
train_days = [54,55,56]


def _monthly_target_sums(frame, months):
    """Sum Target per (Company_ID, Product_ID, all_date) over the given months."""
    in_window = frame['all_date'].isin(months)
    keys = ['Company_ID', 'Product_ID', 'all_date']
    return frame[in_window].groupby(keys)['Target'].sum().reset_index()


val_targets = _monthly_target_sums(train_data, valid_days)
train_targets = _monthly_target_sums(train_data, train_days)
# The test frame has no labels; it is rebuilt from the submission Ids.
all_targets = prepare_data(sample_sub)

In [7]:
# Each history window spans dey_to_lags + 1 consecutive months.
dey_to_lags = 50


def _history_months(first_target_month):
    """Return the dey_to_lags + 1 month indices just before first_target_month."""
    return list(range(first_target_month - dey_to_lags - 1, first_target_month))


an_train_days = _history_months(min(train_days))
an_val_days = _history_months(min(valid_days))
an_all_days = _history_months(60)  # 60 is the all_date index of 2024-01

_lag_keys = ['Company_ID', 'Product_ID', 'all_date']


def _history_sums(months):
    """Sum Target per (company, product, month) of train_data within months."""
    in_window = train_data['all_date'].isin(months)
    return train_data.loc[in_window].groupby(_lag_keys)['Target'].sum().reset_index()


train_feature_data = _history_sums(an_train_days)
val_feature_data = _history_sums(an_val_days)
all_feature_data = _history_sums(an_all_days)

## Fill missing rows with zeros

In [8]:
def zero_fill(data,id_maper):
    """Add a zero-Target row for every known product missing from a company-month.

    For each (Company_ID, all_date) pair present in ``data``, every product id
    listed for that company in ``id_maper`` that has no row in that month gets
    a new row with ``Target = 0``. Columns other than the four keys written
    here are left as NaN on the added rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``Company_ID``, ``Product_ID`` and ``all_date``.
    id_maper : dict
        Maps a company id to an iterable of product ids expected for it.
        Companies absent from the mapping get no fill rows.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the fill rows, sorted by all_date, Company_ID, Product_ID.
    """
    grp_data = data.groupby(['Company_ID','all_date'])['Product_ID'].apply(list)
    fill_rows = []
    for (cmp_id, date_id), products in tzip(grp_data.index,grp_data):
        # Set membership keeps this linear; testing against the list was
        # quadratic in the number of products per company.
        present = set(products)
        fill_rows.extend(
            {
                'Company_ID': cmp_id,
                'Product_ID': i,
                'all_date': date_id,
                'Target': 0
            }
            for i in id_maper.get(cmp_id, ())
            if i not in present
        )

    if fill_rows:
        data = pd.concat([data, pd.DataFrame(fill_rows)], axis=0, ignore_index=True)
    else:
        # Nothing to add: skip concatenating an empty frame but still hand
        # back a fresh positional index, as the concat path does.
        data = data.reset_index(drop=True)
    return data.sort_values(by=['all_date','Company_ID','Product_ID'])

def _products_by_company(frame):
    """Map each Company_ID to the list of distinct Product_IDs seen in frame."""
    return (
        frame.groupby('Company_ID')['Product_ID']
        .apply(lambda ids: list(set(ids)))
        .to_dict()
    )


# Each split only "knows" the products seen strictly before its first target
# month; the test mapper uses the whole training table.
_before_train = train_data['all_date'] < min(train_days)
_before_valid = train_data['all_date'] < min(valid_days)

id_maper_train = _products_by_company(train_data[_before_train])
id_maper_val = _products_by_company(train_data[_before_valid])
id_maper_all = _products_by_company(train_data)

# Target frames first ...
val_targets = zero_fill(val_targets, id_maper_val)
train_targets = zero_fill(train_targets, id_maper_train)
all_targets = zero_fill(all_targets, id_maper_all)

# ... then the history frames the lag features are read from.
train_feature_data = zero_fill(train_feature_data, id_maper_train)
val_feature_data = zero_fill(val_feature_data, id_maper_val)
all_feature_data = zero_fill(all_feature_data, id_maper_all)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

In [9]:
def _rebase_and_index(frame):
    """Shift all_date so the earliest month is 0, then index by the three keys."""
    frame['all_date'] -= frame['all_date'].min()
    return frame.set_index(['Company_ID', 'Product_ID', 'all_date'])


# After this, all_date inside each history frame is the lag position.
train_feature_data = _rebase_and_index(train_feature_data)
val_feature_data = _rebase_and_index(val_feature_data)
all_feature_data = _rebase_and_index(all_feature_data)

## Get lags

In [10]:
def get_lags(targets_data, feature_data,lags_range=[0,1,2,3,4,5,6,7,8,9,10],fil_val=-100):
    """Look up lag features row by row (slow reference implementation).

    For every (Company_ID, Product_ID) row of ``targets_data`` the history of
    that pair is read from ``feature_data`` and, for each ``i`` in
    ``lags_range``, the ``Target`` stored under ``all_date == i`` becomes
    ``lag_i``. Months, or whole pairs, that are missing get ``fil_val``.

    Parameters
    ----------
    targets_data : pandas.DataFrame
        Frame with ``Company_ID`` and ``Product_ID`` columns.
    feature_data : pandas.DataFrame
        Frame indexed by (Company_ID, Product_ID, all_date) with a ``Target``
        column.
    lags_range : sequence of int
        ``all_date`` values to extract, one output column each.
    fil_val : scalar
        Placeholder for missing history.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``targets_data`` (in the same order) with columns
        ``lag_<i>``; an empty frame with those columns for empty input.
    """
    lags_range = list(lags_range)
    lags_df = []
    for cmp_id, product_id in tzip(targets_data['Company_ID'],targets_data['Product_ID']):
        try:
            history = feature_data.loc[(cmp_id,product_id)]
        except KeyError:
            # Pair has no history at all; any other error is a real bug and
            # must not be silently converted into filler values.
            row = {}
        else:
            row = history['Target'].to_dict()
        lags_df.append({i: row.get(i, fil_val) for i in lags_range})

    # Passing columns= keeps the frame well-formed when there are no rows.
    lags_df = pd.DataFrame(lags_df, columns=lags_range)
    lags_df.columns = [f'lag_{i}' for i in lags_range]
    return lags_df

def get_fast_lags(targets_data, feature_data,lags_range=[0,1,2,3,4,5,6,7,8,9,10],fil_val=-100, how='inner'):
    """Vectorised lag lookup: pivot the history wide and merge it onto the targets.

    ``lag_i`` holds the ``Target`` stored under ``all_date == i`` for the
    row's (Company_ID, Product_ID), the same value-based rule ``get_lags``
    uses. Columns are selected by ``all_date`` value rather than by position,
    so a month absent from ``feature_data`` yields a column of ``fil_val``
    instead of a length-mismatch error or shifted labels.

    Parameters
    ----------
    targets_data : pandas.DataFrame
        Frame with ``Company_ID`` and ``Product_ID`` columns.
    feature_data : pandas.DataFrame
        Frame indexed by (Company_ID, Product_ID, all_date) with a ``Target``
        column.
    lags_range : sequence of int
        ``all_date`` values to expose, one ``lag_<i>`` column each.
    fil_val : scalar
        Placeholder for missing history.
    how : str, default 'inner'
        Merge type. The default drops target rows whose pair has no history at
        all (the original behaviour); ``'left'`` keeps them with every lag set
        to ``fil_val``.

    Returns
    -------
    pandas.DataFrame
        ``targets_data`` with the ``lag_<i>`` columns appended.
    """
    lags_range = list(lags_range)
    lag_names = [f'lag_{i}' for i in lags_range]
    keys = ['Company_ID', 'Product_ID']

    wide = pd.pivot_table(
        feature_data.reset_index(),
        index=keys,
        columns='all_date',
        values='Target',
        aggfunc='first',
        fill_value=fil_val,
    )
    # Select and order columns by all_date value; create missing months.
    wide = wide.reindex(columns=lags_range, fill_value=fil_val)
    wide.columns = lag_names

    merged = targets_data.merge(wide.reset_index(), on=keys, how=how)
    if lag_names:
        # Only non-inner merges can leave gaps (pairs without any history).
        merged[lag_names] = merged[lag_names].fillna(fil_val)
    return merged

In [11]:
# One lag column per month of the history window: lag_0 .. lag_<dey_to_lags>.
lags_range = list(range(dey_to_lags+1))
# Attach each split's own history window to its target rows.
train_targets = get_fast_lags(train_targets,train_feature_data,lags_range=lags_range)
val_targets = get_fast_lags(val_targets,val_feature_data,lags_range=lags_range)
all_targets = get_fast_lags(all_targets,all_feature_data,lags_range=lags_range)

In [12]:
# Rebase all_date within each split so it starts at 0, i.e. it becomes the
# offset of the target month inside that split's window.
train_targets['all_date'] -= train_targets['all_date'].min()
val_targets['all_date'] -= val_targets['all_date'].min()
all_targets['all_date'] -= all_targets['all_date'].min()

# Model training

In [13]:
# CatBoost settings: optimise and report MAE. use_best_model is off, so the
# final model keeps every iteration rather than truncating at the best
# eval-set score.
cb_params = {
    'iterations': 3000,
    'learning_rate': 0.05,
    'loss_function': 'MAE',
    'max_depth': 5,
    'eval_metric': 'MAE',
    'use_best_model': False,
    'task_type': 'CPU',
    'random_seed': 56,
}

# Columns handed to CatBoost as categorical features.
cat_cols = ['Company_ID','all_date']
label_col = 'Target'
# Columns removed from the feature frames before building the pools.
drop_cols = [
    'Product_ID'
]
text_features = None

In [14]:
# Training pool: every column except Product_ID and the label is a feature.
train_pool = Pool(
    data = train_targets.drop(drop_cols+[label_col],axis=1),
    label = train_targets[label_col],
    cat_features=cat_cols,
    text_features=text_features
)

# Validation pool built the same way from the validation window.
eval_pool = Pool(
    data = val_targets.drop(drop_cols+[label_col],axis=1),
    label = val_targets[label_col],
    cat_features=cat_cols,
    text_features=text_features
)

# Test pool: no label is passed; the Date/year/month helper columns created by
# prepare_data are dropped so the remaining columns match the training features.
test_pool = Pool(
    data = all_targets.drop(drop_cols+['Date','year','month'],axis=1),
    cat_features=cat_cols,
    text_features=text_features
)

In [15]:
# Fit CatBoost on the training window, logging train/validation MAE every
# 100 iterations.
cbm = CatBoostRegressor(**cb_params)
cbm.fit(train_pool,eval_set=eval_pool,verbose=100)

0:	learn: 17.0061944	test: 21.0394955	best: 21.0394955 (0)	total: 131ms	remaining: 6m 33s
100:	learn: 10.0345758	test: 13.1468855	best: 13.1468855 (100)	total: 4.85s	remaining: 2m 19s
200:	learn: 9.9538566	test: 12.9681367	best: 12.9681367 (200)	total: 9.47s	remaining: 2m 11s
300:	learn: 9.6201601	test: 12.3575980	best: 12.3573586 (295)	total: 13.8s	remaining: 2m 3s
400:	learn: 9.4676141	test: 12.1617067	best: 12.1612112 (397)	total: 18s	remaining: 1m 56s
500:	learn: 9.4469755	test: 12.1636368	best: 12.1612112 (397)	total: 22.4s	remaining: 1m 51s
600:	learn: 9.2851902	test: 11.9650514	best: 11.9648658 (592)	total: 26.6s	remaining: 1m 46s
700:	learn: 9.1127857	test: 11.7882947	best: 11.7880527 (693)	total: 30.8s	remaining: 1m 40s
800:	learn: 8.9887449	test: 11.7053834	best: 11.7050442 (796)	total: 34.8s	remaining: 1m 35s
900:	learn: 8.9011468	test: 11.6214596	best: 11.6214596 (900)	total: 39s	remaining: 1m 30s
1000:	learn: 8.8462843	test: 11.5846417	best: 11.5784135 (982)	total: 43s	rem

<catboost.core.CatBoostRegressor at 0x787d09265bd0>

In [16]:
# LightGBM baseline on the same feature columns as CatBoost. No categorical
# features are declared here, and this model is only scored below; it is not
# part of the submission blend.
lgm = LGBMRegressor(n_estimators=70)
lgm.fit(train_targets.drop(drop_cols+[label_col],axis=1), train_targets[label_col])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13012
[LightGBM] [Info] Number of data points in the train set: 108990, number of used features: 53
[LightGBM] [Info] Start training from score 17.336921


In [17]:
# Validation MAE of the LightGBM baseline, with predictions rounded to integers.
mae(val_targets[label_col], lgm.predict(val_targets.drop(drop_cols+[label_col],axis=1)).round())

12.508003779187474

# Train model for each group

Group the data by company ID and product ID, then train a separate model for each group.

In [18]:
def get_models(data):
    """Fit one LightGBM regressor per (Company_ID, Product_ID) group.

    Each group is ordered by ``all_date``; the first ``int(len * 0.99)`` rows
    are used for fitting and the remaining rows for scoring. Groups with a
    single row are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Company_ID``, ``Product_ID``, ``all_date``, ``Target`` and
        feature columns; every column except ``Target`` is used as a feature.

    Returns
    -------
    tuple
        ``(models, scores, is_const)``: ``models`` maps ``"<company>_<product>"``
        to the fitted regressor, ``scores`` lists each group's hold-out MAE in
        iteration order, and ``is_const`` maps the same keys to whether the
        group's ``Target`` takes a single value.
    """
    models, scores, is_const = {}, [], {}

    for (company_id, product_id), series in tqdm(data.groupby(['Company_ID', 'Product_ID'])):
        if len(series) == 1:
            continue

        series = series.sort_values(by='all_date').reset_index(drop=True)
        cut = int(len(series) * 0.99)
        fit_part, holdout = series.iloc[:cut], series.iloc[cut:]

        # Alternatives tried earlier: CatBoostRegressor, XGBRegressor,
        # RandomForestRegressor.
        model = LGBMRegressor(n_estimators=100, verbose=-1)
        model.fit(fit_part.drop(columns='Target'), fit_part.Target)

        holdout_pred = model.predict(holdout.drop(columns='Target'))
        scores.append(mae(holdout.Target, holdout_pred))

        key = f'{int(company_id)}_{int(product_id)}'
        models[key] = model
        is_const[key] = len(set(series.Target)) == 1

    return models, scores, is_const

In [19]:
# Fit the per-(company, product) models on the train and validation windows
# stacked together.
# NOTE(review): both frames had all_date rebased to start at 0 earlier, so the
# two windows share the same all_date values when get_models sorts by it.
models, scores, is_const = get_models(pd.concat([train_targets, val_targets]))

  0%|          | 0/37045 [00:00<?, ?it/s]

In [20]:
# Average of the per-group hold-out MAEs returned by get_models.
np.mean(scores)

10.030022945066811

In [21]:
# Spot check: run the model for company 0 / product 1 over the whole test
# feature frame, with columns in the order used for training.
models['0_1'].predict(all_targets[train_targets.drop(columns='Target').columns])

array([5.6, 5.6, 5.6, ..., 5.6, 5.6, 5.6])

# Submission

In [22]:
# CatBoost predictions for every row of the test pool.
test_preds = cbm.predict(test_pool)

In [23]:
def predict(test):
    """Predict every row of ``test`` with its per-(company, product) model.

    Rows are grouped by (Company_ID, Product_ID) and each group is scored with
    one call to the matching entry of the global ``models`` dict, keyed
    ``"<company>_<product>"`` with integer ids (the format ``get_models``
    writes). Ids are cast to ``int`` before the lookup, so float-typed id
    columns still find their model. Rows whose pair has no model keep a
    prediction of 0. Errors raised by a model's ``predict`` propagate instead
    of being turned into zeros.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature frame containing ``Company_ID`` and ``Product_ID``; all of its
        columns are passed to the models.

    Returns
    -------
    list of float
        One prediction per row, in the row order of ``test``.
    """
    test = test.reset_index(drop=True)
    preds = np.zeros(len(test), dtype=float)

    grouped = test.groupby(['Company_ID', 'Product_ID'], sort=False)
    for (company, product), rows in tqdm(grouped):
        model = models.get(f"{int(company)}_{int(product)}")
        if model is None:
            # No model was fitted for this pair: fall back to 0.
            continue
        # The index is positional after reset_index, so it addresses preds.
        preds[rows.index.to_numpy()] = model.predict(rows)

    return preds.tolist()

In [24]:
# Per-group predictions for every test row, with columns in the order the
# per-group models were trained on.
preds_models = predict(all_targets[train_targets.drop(columns='Target').columns])

  0%|          | 0/113856 [00:00<?, ?it/s]

In [25]:
# Reload a clean submission template before filling in predictions.
sample_sub = pd.read_csv('/kaggle/input/ioai-contest-1/sample_submission.csv')

In [26]:
# Blend: 30% per-group models + 70% CatBoost.
# NOTE(review): values are assigned by position, so this assumes all_targets
# (re-sorted in zero_fill and inner-merged in get_fast_lags) still has exactly
# the rows of sample_sub in the same order; confirm.
sample_sub['Target'] = np.array(preds_models) * 0.3 + test_preds * 0.7

In [27]:
# Clamp negative predictions to zero (rounding is left disabled).
sample_sub['Target'] = sample_sub['Target'].apply(lambda x: max(0,x)) #.round()

In [28]:
# Sanity check: average predicted Target over the submission.
sample_sub['Target'].mean()

15.42732898570691

In [29]:
# Write the submission file without the DataFrame index.
sample_sub.to_csv('/kaggle/working/submission.csv',index=False)

In [30]:
# Final look at the submission frame.
sample_sub

Unnamed: 0,Id,Target
0,0_1_2024-01,4.332337
1,0_2_2024-01,0.069450
2,0_3_2024-01,0.165609
3,0_5_2024-01,0.849189
4,0_6_2024-01,1.542279
...,...,...
113851,3_14664_2024-03,0.000005
113852,3_14665_2024-03,196.308722
113853,3_14666_2024-03,0.000000
113854,3_14667_2024-03,0.003696
