In [None]:
%%capture
!pip install statsmodels catboost

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.contrib import tzip
from tqdm.auto import tqdm
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from catboost import Pool, cv, CatBoostRegressor
import warnings
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()
# Silence statsmodels ConvergenceWarning messages.
warnings.simplefilter('ignore', ConvergenceWarning)
# Silence pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Raw training rows; later cells use the Date, Company_ID, Product_ID and Target columns.
train_data = pd.read_csv('/kaggle/input/ioai-contest-1/train.csv')
# Submission template; its 'Id' values are parsed as '<Company_ID>_<Product_ID>_<YYYY-MM>'.
sample_sub = pd.read_csv('/kaggle/input/ioai-contest-1/sample_submission.csv')

In [4]:
# Split each 'YYYY-MM...' date string once, then derive zero-based indices with
# vectorised string ops instead of two per-row Python lambdas.
_date_tokens = train_data['Date'].str.split('-')
# Years since 2019 (2019 -> 0) and zero-based month (January -> 0).
train_data['year'] = _date_tokens.str[0].astype('int64') - 2019
train_data['month'] = _date_tokens.str[1].astype('int64') - 1
# Running month counter: 0 is 2019-01, 59 is 2023-12, 60 is 2024-01.
train_data['all_date'] = train_data['year'] * 12 + train_data['month']

In [5]:
def prepare_data(sample_submit):
    """Parse submission Ids into the key/date columns used by the rest of the notebook.

    Each value of ``sample_submit['Id']`` is expected to look like
    ``'<Company_ID>_<Product_ID>_<YYYY-MM>'``.

    Parameters
    ----------
    sample_submit : pandas.DataFrame
        Frame with a string column ``'Id'``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``sample_submit`` with columns ``Company_ID`` and
        ``Product_ID`` (int64), ``Date`` (the raw ``YYYY-MM`` string), ``year``
        (years since 2019), ``month`` (zero-based) and ``all_date``
        (``year * 12 + month``).
    """
    # Tokenise once with vectorised string ops instead of one lambda per column.
    id_tokens = sample_submit['Id'].str.split('_')
    dates = id_tokens.str[2]
    date_tokens = dates.str.split('-')

    year = date_tokens.str[0].astype('int64') - 2019
    month = date_tokens.str[1].astype('int64') - 1

    return pd.DataFrame({
        'Company_ID': id_tokens.str[0].astype('int64'),
        'Product_ID': id_tokens.str[1].astype('int64'),
        'Date': dates,
        'year': year,
        'month': month,
        'all_date': year * 12 + month,
    })

# Hold-out design on the running month index (`all_date`):
#   54..56 -> 2023-07..2023-09 are the training targets,
#   57..59 -> 2023-10..2023-12 are the validation targets.
valid_days = [57,58,59]
train_days = [54,55,56]

# One row per (company, product, month) holding the summed Target.
_target_keys = ['Company_ID','Product_ID','all_date']
_is_valid_month = train_data['all_date'].isin(valid_days)
_is_train_month = train_data['all_date'].isin(train_days)

val_targets = train_data.loc[_is_valid_month].groupby(_target_keys)['Target'].sum().reset_index()
train_targets = train_data.loc[_is_train_month].groupby(_target_keys)['Target'].sum().reset_index()
# Rows to predict, parsed from the submission template (no Target column here).
all_targets = prepare_data(sample_sub)

In [6]:
# Each history window holds dey_to_lags + 1 consecutive months (21 with the value below).
dey_to_lags = 20
# Months immediately preceding the first target month of each split.
an_train_days = list(range(min(train_days) - dey_to_lags - 1,min(train_days)))
an_val_days = list(range(min(valid_days) - dey_to_lags - 1,min(valid_days) ))
# 60 is the all_date index of 2024-01 ((2024 - 2019) * 12 + 0), so this window ends at 2023-12.
# NOTE(review): presumably 2024-01 is the first month in the submission - confirm against sample_sub.
an_all_days = list(range(60 - dey_to_lags - 1,60))

# Monthly Target totals per (company, product) restricted to each history window.
train_feature_data = train_data[train_data['all_date'].isin(an_train_days)].groupby(['Company_ID','Product_ID','all_date'])['Target'].agg('sum').reset_index()
val_feature_data = train_data[train_data['all_date'].isin(an_val_days)].groupby(['Company_ID','Product_ID','all_date'])['Target'].agg('sum').reset_index()
all_feature_data = train_data[train_data['all_date'].isin(an_all_days)].groupby(['Company_ID','Product_ID','all_date'])['Target'].agg('sum').reset_index()

In [7]:
def zero_fill(data,id_maper):
    """Add explicit zero-Target rows for products missing from a (company, month) group.

    For every (Company_ID, all_date) pair that occurs in ``data``, each product
    listed in ``id_maper[Company_ID]`` that has no row for that pair gets a new
    row with ``Target = 0``. Pairs that do not occur in ``data`` at all are not
    created.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``Company_ID``, ``Product_ID`` and ``all_date``.
    id_maper : dict
        Maps a Company_ID to an iterable of the Product_IDs expected for it.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the filler rows, sorted by
        ``['all_date', 'Company_ID', 'Product_ID']``. The index is renumbered
        before sorting, so it is not monotonic afterwards.

    Raises
    ------
    KeyError
        If a Company_ID present in ``data`` is missing from ``id_maper``.
    """
    sort_keys = ['all_date','Company_ID','Product_ID']
    # Sets give constant-time membership checks; the previous list lookup made
    # the fill quadratic in the number of products per company.
    seen_by_group = data.groupby(['Company_ID','all_date'])['Product_ID'].apply(set)

    fill_rows = []
    for (cmp_id, date_id), seen in tzip(seen_by_group.index, seen_by_group):
        fill_rows.extend(
            {
                'Company_ID': cmp_id,
                'Product_ID': i,
                'all_date': date_id,
                'Target': 0
            }
            for i in id_maper[cmp_id]
            if i not in seen
        )

    if not fill_rows:
        # Nothing to add: skip concatenating an empty, column-less frame, which
        # can disturb dtypes, but keep the renumber-then-sort contract.
        return data.reset_index(drop=True).sort_values(by=sort_keys)

    fill_df = pd.DataFrame(fill_rows)
    return pd.concat([data,fill_df],axis=0,ignore_index=True).sort_values(by=sort_keys)

def _products_by_company(frame):
    """Map each Company_ID in `frame` to a list of its distinct Product_IDs."""
    return frame.groupby('Company_ID')['Product_ID'].apply(lambda ids: list(set(ids))).to_dict()

# Product catalogues per company. The train/validation catalogues only use rows
# dated before the first target month of that split; the final one uses everything.
_before_train = train_data['all_date'] < min(train_days)
_before_valid = train_data['all_date'] < min(valid_days)
id_maper_train = _products_by_company(train_data[_before_train])
id_maper_val = _products_by_company(train_data[_before_valid])
id_maper_all = _products_by_company(train_data)

# Make missing (company, month, product) combinations explicit zeros in the targets ...
val_targets = zero_fill(val_targets, id_maper_val)
train_targets = zero_fill(train_targets, id_maper_train)
all_targets = zero_fill(all_targets, id_maper_all)

# ... and in the history windows used to build lag features.
train_feature_data = zero_fill(train_feature_data, id_maper_train)
val_feature_data = zero_fill(val_feature_data, id_maper_val)
all_feature_data = zero_fill(all_feature_data, id_maper_all)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

In [8]:
def _rebase_and_index(frame):
    """Shift `all_date` so the earliest month is 0, then index by (company, product, month)."""
    rebased = frame.assign(all_date=frame['all_date'] - frame['all_date'].min())
    return rebased.set_index(['Company_ID','Product_ID','all_date'])

# After this step `all_date` inside each history window is a position (0 = oldest month).
train_feature_data = _rebase_and_index(train_feature_data)
val_feature_data = _rebase_and_index(val_feature_data)
all_feature_data = _rebase_and_index(all_feature_data)

In [9]:
def get_lags(targets_data, feature_data,lags_range=[0,1,2,3,4,5,6,7,8,9,10],fil_val=-100):
    """Look up lag features row by row (slow reference version of ``get_fast_lags``).

    Parameters
    ----------
    targets_data : pandas.DataFrame
        Frame with ``Company_ID`` and ``Product_ID`` columns; one output row is
        produced per input row, in the same order.
    feature_data : pandas.DataFrame
        Frame with a ``Target`` column, indexed by
        ``(Company_ID, Product_ID, all_date)``.
    lags_range : iterable of int
        ``all_date`` positions to extract; becomes columns ``lag_<i>``.
    fil_val : scalar
        Value used when a position, or the whole (company, product) pair, is
        missing from ``feature_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``lag_<i>`` for each ``i`` in ``lags_range``, with a fresh
        0..n-1 index (not the index of ``targets_data``).
    """
    lags_range = list(lags_range)
    lags_df = []
    for cmp_id, product_id in tzip(targets_data['Company_ID'],targets_data['Product_ID']):
        try:
            row = feature_data.loc[(cmp_id,product_id)].to_dict()['Target']
        except KeyError:
            # Only an unknown (company, product) pair is an expected miss; any
            # other error should surface instead of being silently filled.
            row = {}
        lags_df.append({i: row.get(i, fil_val) for i in lags_range})
    # Passing `columns` keeps the expected columns even when there are no rows.
    lags_df = pd.DataFrame(lags_df, columns=lags_range)
    lags_df.columns = [f'lag_{i}' for i in lags_range]
    return lags_df

def get_fast_lags(targets_data, feature_data,lags_range=[0,1,2,3,4,5,6,7,8,9,10],fil_val=-100,how='inner'):
    """Attach lag features to ``targets_data`` with a single pivot and merge.

    Parameters
    ----------
    targets_data : pandas.DataFrame
        Frame with ``Company_ID`` and ``Product_ID`` columns.
    feature_data : pandas.DataFrame
        Frame with a ``Target`` column, indexed by
        ``(Company_ID, Product_ID, all_date)``.
    lags_range : iterable of int
        ``all_date`` positions to expose as columns ``lag_<i>``.
    fil_val : scalar
        Value used for any lag that has no observation.
    how : str, default 'inner'
        Join type passed to ``DataFrame.merge``. The default drops target rows
        whose (company, product) pair has no history, as before. With
        ``'left'`` those rows are kept and their lags are set to ``fil_val``,
        which matches what ``get_lags`` returns for unknown pairs.

    Returns
    -------
    pandas.DataFrame
        ``targets_data`` columns followed by one ``lag_<i>`` column per entry
        of ``lags_range``.
    """
    lags_range = list(lags_range)
    key_cols = ['Company_ID','Product_ID']
    lag_table = pd.pivot_table(
        feature_data.reset_index(),
        index=key_cols,
        columns='all_date',
        values='Target',
        aggfunc='first',
        fill_value=fil_val
    )
    # Select lags by their actual all_date value rather than relabelling by
    # position: a month absent from the pivot becomes a fil_val column instead
    # of a length-mismatch error or shifted labels.
    lag_table = lag_table.reindex(columns=lags_range, fill_value=fil_val)
    lag_table.columns = [f'lag_{i}' for i in lags_range]
    lag_cols = list(lag_table.columns)

    merged = targets_data.merge(lag_table.reset_index(), on=key_cols, how=how)
    if how != 'inner':
        # Rows kept without a match carry NaN lags; give them the sentinel too.
        merged[lag_cols] = merged[lag_cols].fillna(fil_val)
    return merged

In [10]:
# One lag column per month of the history window: lag_0 (oldest) .. lag_<dey_to_lags> (most recent).
lags_range = list(range(dey_to_lags+1))
# get_fast_lags merges with the default inner join, so targets without any history row are dropped here.
train_targets = get_fast_lags(train_targets,train_feature_data,lags_range=lags_range)
val_targets = get_fast_lags(val_targets,val_feature_data,lags_range=lags_range)
all_targets = get_fast_lags(all_targets,all_feature_data,lags_range=lags_range)

In [11]:
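# NOTE: the row-by-row path below is superseded by get_fast_lags (used in the previous cell); kept for reference only.
#lags_range = list(range(dey_to_lags+1))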
#train_lags = get_lags(train_targets,train_feature_data,lags_range=lags_range)
#val_lags = get_lags(val_targets,val_feature_data,lags_range=lags_range)
#all_lags = get_lags(all_targets,all_feature_data,lags_range=lags_range)

#train_targets = pd.concat([train_targets,train_lags],axis=1)
#val_targets = pd.concat([val_targets,val_lags],axis=1)
#all_targets = pd.concat([all_targets,all_lags],axis=1)

In [12]:
def get_arima_features(row):
    """Fit an ARIMA(1, 1, 1) on a row's lag columns and return a 3-step forecast.

    ``row`` must support ``row['lag_<i>']`` for every ``i`` in the module-level
    ``lags_range``; the values are used in that order as the series.
    Returns the three forecast values as a list.
    """
    history = []
    for lag in lags_range:
        history.append(row[f'lag_{lag}'])
    fitted = ARIMA(history, order=(1, 1, 1)).fit()
    forecast = fitted.forecast(steps=3)
    return forecast.tolist()

In [13]:
# Express each target month as an offset from the first month of its own split
# (0 = first target month), so the value is comparable across train/validation/test.
train_targets = train_targets.assign(all_date=lambda frame: frame['all_date'] - frame['all_date'].min())
val_targets = val_targets.assign(all_date=lambda frame: frame['all_date'] - frame['all_date'].min())
all_targets = all_targets.assign(all_date=lambda frame: frame['all_date'] - frame['all_date'].min())

In [14]:
# CatBoost hyper-parameters; MAE is used both as the training loss and the reported metric.
cb_params = {
    'iterations': 3000,
    'learning_rate': 0.05,
    'loss_function': 'MAE',
    'max_depth': 5,
    'eval_metric': 'MAE',
    # Keep every trained iteration instead of truncating at the best eval-set iteration.
    'use_best_model':False,
    'task_type': 'CPU',
    'random_seed': 56,
}

# Columns passed to CatBoost as categorical features.
cat_cols = ['Company_ID','all_date']
label_col = 'Target'
# Columns removed from the feature matrix before building the pools.
drop_cols = [
    'Product_ID'
]
text_features = None

In [15]:
# Derive the feature list once from the training frame and use it for every
# pool. This guarantees identical columns in identical order and ignores any
# extra columns a frame may carry (e.g. Date/year/month, or a Target column
# introduced into all_targets by zero_fill). A missing feature raises KeyError.
_non_feature_cols = set(drop_cols + [label_col])
feature_cols = [col for col in train_targets.columns if col not in _non_feature_cols]

train_pool = Pool(
    data = train_targets[feature_cols],
    label = train_targets[label_col],
    cat_features=cat_cols,
    text_features=text_features
)

eval_pool = Pool(
    data = val_targets[feature_cols],
    label = val_targets[label_col],
    cat_features=cat_cols,
    text_features=text_features
)

# No label: these are the rows to predict for the submission.
test_pool = Pool(
    data = all_targets[feature_cols],
    cat_features=cat_cols,
    text_features=text_features
)

In [16]:
# Fit on the training pool; the eval pool is only monitored (use_best_model is False in cb_params).
cbm = CatBoostRegressor(**cb_params)
# verbose=100 logs the metrics every 100 iterations.
cbm.fit(train_pool,eval_set=eval_pool,verbose=100)

0:	learn: 17.0310201	test: 21.0167443	best: 21.0167443 (0)	total: 150ms	remaining: 7m 30s
100:	learn: 9.9918424	test: 13.1575528	best: 13.1575528 (100)	total: 6.16s	remaining: 2m 56s
200:	learn: 9.8168391	test: 13.0159398	best: 13.0158690 (199)	total: 11.5s	remaining: 2m 39s
300:	learn: 9.6267705	test: 12.7725658	best: 12.7723999 (287)	total: 16.7s	remaining: 2m 29s
400:	learn: 9.6195028	test: 12.7649737	best: 12.7614176 (308)	total: 21.3s	remaining: 2m 17s
500:	learn: 9.6162614	test: 12.7702394	best: 12.7614176 (308)	total: 25.8s	remaining: 2m 8s
600:	learn: 9.6120616	test: 12.7707831	best: 12.7614176 (308)	total: 30.8s	remaining: 2m 2s
700:	learn: 9.5656028	test: 12.7547067	best: 12.7544456 (690)	total: 35.5s	remaining: 1m 56s
800:	learn: 9.4418488	test: 12.6525209	best: 12.6525209 (800)	total: 40.2s	remaining: 1m 50s
900:	learn: 9.3017931	test: 12.4319966	best: 12.4309291 (899)	total: 46.1s	remaining: 1m 47s
1000:	learn: 9.2330691	test: 12.2830306	best: 12.2819621 (963)	total: 50.8s

<catboost.core.CatBoostRegressor at 0x7bb67edca500>

In [17]:
# Naive baseline: predict the most recent history month (the last lag column)
# for every validation month. The column name follows dey_to_lags so the
# baseline stays correct if the window length changes.
(val_targets[f'lag_{dey_to_lags}'] - val_targets['Target']).abs().mean()

15.176380078283168

In [18]:
# Feature importances of the fitted model, returned as a table.
cbm.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,lag_13,11.993342
1,lag_15,11.014269
2,Company_ID,10.035007
3,lag_5,9.305289
4,lag_10,6.717957
5,lag_3,5.885249
6,lag_20,5.432552
7,lag_14,4.234239
8,lag_9,4.100942
9,lag_17,3.812756


In [19]:
# Predictions for the submission rows, in the row order of the frame test_pool was built from (all_targets).
test_preds = cbm.predict(test_pool)

In [20]:
# Reload the submission template so predictions are written into a fresh copy.
sample_sub = pd.read_csv('/kaggle/input/ioai-contest-1/sample_submission.csv')

In [21]:
# Align predictions to submission rows by Id instead of by position:
# all_targets was re-sorted (zero_fill) and inner-joined (get_fast_lags)
# upstream, so its row order and length are not guaranteed to match sample_sub.
_pred_ids = (
    all_targets['Company_ID'].astype(str)
    + '_' + all_targets['Product_ID'].astype(str)
    + '_' + all_targets['Date'].astype(str)
)
_pred_by_id = pd.Series(test_preds, index=_pred_ids.to_numpy())
sample_sub['Target'] = sample_sub['Id'].map(_pred_by_id)
# Fail loudly if any submission row ended up without a prediction.
assert sample_sub['Target'].notna().all(), 'some submission Ids received no prediction'

In [23]:
# Negative predictions are floored at zero (vectorised equivalent of max(0, x) per row).
sample_sub['Target'] = sample_sub['Target'].clip(lower=0)

In [24]:
# Sanity check: average predicted Target over all submission rows.
sample_sub['Target'].mean()

12.415489610424368

In [26]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv('catboost_modelingV7.csv',index=False)

In [25]:
# Display the submission frame for a visual check.
sample_sub

Unnamed: 0,Id,Target
0,0_1_2024-01,1.777984
1,0_2_2024-01,0.052679
2,0_3_2024-01,0.003176
3,0_5_2024-01,0.699381
4,0_6_2024-01,1.004644
...,...,...
113851,3_14664_2024-03,0.000004
113852,3_14665_2024-03,169.199698
113853,3_14666_2024-03,0.000000
113854,3_14667_2024-03,0.004533


In [None]:
import pandas as pd
import numpy as np

# Baseline submission: each (company, product) is predicted as its summed
# Target over the rows dated on or after 2023-12-01; pairs without such rows get 1.0.
train = pd.read_csv('/kaggle/input/ioai-contest-1/train.csv')
sample_submission = pd.read_csv('/kaggle/input/ioai-contest-1/sample_submission.csv')

# Ids look like '<Company_ID>_<Product_ID>_<YYYY-MM>'; split once for both keys.
_id_parts = sample_submission['Id'].str.split('_', expand=True)
sample_submission['Company_ID'] = _id_parts[0].astype('int')
sample_submission['Product_ID'] = _id_parts[1].astype('int')

# NOTE(review): the string comparison assumes train['Date'] is formatted 'YYYY-MM-DD' - confirm.
predict = train[train['Date'] >= '2023-12-01'].groupby(['Company_ID', 'Product_ID'])['Target'].sum().reset_index()

# Merge only the key columns so the template's own Target column cannot collide
# with the prediction (the implicit Target_x / Target_y suffixes are avoided).
submission = sample_submission[['Id', 'Company_ID', 'Product_ID']].merge(
    predict, on = ['Company_ID', 'Product_ID'], how = 'left'
)
submission['Target'] = submission['Target'].fillna(1.0)

submission = submission[['Id', 'Target']]
submission.to_csv('submission_ioai_fillna_1.csv', index = False)