In [58]:
import pandas as pd
import holidays
import numpy as np
import requests
from bs4 import BeautifulSoup
from catboost import *
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm.auto import tqdm
import pickle

In [59]:
terms = pd.read_excel('terminal_data_hackathon v4.xlsx')
terms

Unnamed: 0,TID,longitude,latitude
0,692835,37.646257,55.742062
1,698656,37.666136,55.731231
2,686168,37.675027,55.727031
3,679671,37.669706,55.706824
4,682180,37.596399,55.711151
...,...,...,...
1625,607749,37.138157,56.757843
1626,699046,35.647877,56.226824
1627,637512,35.515147,56.032402
1628,680602,36.543716,55.380772


In [60]:
data = pd.read_excel('terminal_data_hackathon v4.xlsx', 'Incomes')
data.head()

Unnamed: 0,TID,остаток на 31.08.2022 (входящий),2022-09-01 00:00:00,2022-09-02 00:00:00,2022-09-03 00:00:00,2022-09-04 00:00:00,2022-09-05 00:00:00,2022-09-06 00:00:00,2022-09-07 00:00:00,2022-09-08 00:00:00,...,2022-11-21 00:00:00,2022-11-22 00:00:00,2022-11-23 00:00:00,2022-11-24 00:00:00,2022-11-25 00:00:00,2022-11-26 00:00:00,2022-11-27 00:00:00,2022-11-28 00:00:00,2022-11-29 00:00:00,2022-11-30 00:00:00
0,406136,160000,90000,105000,99000,107000,110000,60000,75000,89000,...,91000,78000,0,165000,0,189000,106000,94000,75000,74000
1,406139,387000,103000,206000,168000,124000,78000,165000,164000,174000,...,164000,153000,151000,157000,206000,182000,123000,138000,112000,179000
2,406145,287000,143000,136000,124000,117000,123000,140000,139000,138000,...,119000,100000,179000,169000,118000,118000,114000,128000,121000,124000
3,406148,355000,50000,73000,53000,65000,75000,100000,53000,52000,...,48000,55000,65000,85000,95000,68000,62000,0,118000,70000
4,406180,597000,96000,82000,71000,72000,86000,55000,55000,75000,...,82000,56000,70000,59000,105000,70000,77000,87000,59000,55000


In [61]:
df_unpivot = pd.melt(data, id_vars='TID', value_vars=data.columns[2:])
data = df_unpivot.sort_values(by=['TID', 'variable'])
data = data.rename(columns={'TID': 'tid', 'variable': 'date', 'value': 'income'})
data['date'] = pd.to_datetime(data['date'])
data.head()

Unnamed: 0,tid,date,income
0,406136,2022-09-01,90000
1630,406136,2022-09-02,105000
3260,406136,2022-09-03,99000
4890,406136,2022-09-04,107000
6520,406136,2022-09-05,110000


In [62]:
data['train_or_test'] = np.where(data['date'] >= '2022-11-01', 'val', 'train')
data['train_or_test'].value_counts()

train    99430
val      48900
Name: train_or_test, dtype: int64

In [63]:
ru_holidays = holidays.RU()
data['is_holiday'] = data['date'].apply(lambda x: x in ru_holidays)
data['is_holiday'].sum()

1630

In [64]:
data['dayofmonth'] = data.date.dt.day
# df['dayofyear'] = df.date.dt.dayofyear
data['dayofweek'] = data.date.dt.dayofweek
data['month'] = data.date.dt.month
# data['year'] = data.date.dt.year
# df['weekofyear'] = df.date.dt.weekofyear
data['is_month_start'] = (data.date.dt.is_month_start).astype(int)
data['is_month_end'] = (data.date.dt.is_month_end).astype(int)
data.head()

Unnamed: 0,tid,date,income,train_or_test,is_holiday,dayofmonth,dayofweek,month,is_month_start,is_month_end
0,406136,2022-09-01,90000,train,False,1,3,9,1,0
1630,406136,2022-09-02,105000,train,False,2,4,9,0,0
3260,406136,2022-09-03,99000,train,False,3,5,9,0,0
4890,406136,2022-09-04,107000,train,False,4,6,9,0,0
6520,406136,2022-09-05,110000,train,False,5,0,9,0,0


In [65]:
# Creating sales lag features
def create_sales_lag_feats(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
                gpby[target_col].shift(i).values + np.random.normal(scale=1, size=(len(df),)) * 0
    return df

# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values +\
            np.random.normal(scale=1, size=(len(df),)) * 0
    return df

# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                            shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).median().values +\
            np.random.normal(scale=1, size=(len(df),)) * 0
    return df

# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df

In [85]:
# Converting sales of validation period to nan so as to resemble test period
train = data.copy()
y_val = train.loc[train.train_or_test=='val', 'income'].values.reshape((-1))
y_train = train.loc[train.train_or_test=='train', 'income'].values.reshape((-1))
train.loc[train.train_or_test=='val', 'income'] = np.nan

# # Creating sales lag, rolling mean, rolling median, ohe features of the above train set
train = create_sales_lag_feats(train, gpby_cols=['tid'], target_col='income', 
                               lags=[1, 7, 14, 28])

train = create_sales_rmean_feats(train, gpby_cols=['tid'], 
                                 target_col='income', windows=[1, 3, 7, 14, 28], 
                                 min_periods=1, win_type='triang')

train = create_sales_rmed_feats(train, gpby_cols=['tid'], 
                                 target_col='income', windows=[2, 3, 7, 14, 28], 
                                 min_periods=2, win_type=None)

train = create_sales_ewm_feats(train, gpby_cols=['tid'], 
                               target_col='income', 
                               alpha=[0.9, 0.7, 0.6], 
                               shift=[3, 7, 14, 28])
train['target'] = (train['income']==0)
train = create_sales_lag_feats(train, gpby_cols=['tid'], target_col='target', 
                               lags=[1, 7, 14, 28])

train = create_sales_rmean_feats(train, gpby_cols=['tid'], 
                                 target_col='target', windows=[1, 3, 7, 14, 28], 
                                 min_periods=1, win_type='triang')

train = create_sales_rmed_feats(train, gpby_cols=['tid'], 
                                 target_col='target', windows=[2, 3, 7, 14, 28], 
                                 min_periods=2, win_type=None)

train = create_sales_ewm_feats(train, gpby_cols=['tid'], 
                               target_col='target', 
                               alpha=[0.9, 0.7, 0.6], 
                               shift=[3, 7, 14, 28])


tid_mean = train[train.train_or_test=='train'].groupby('tid')['income'].mean().reset_index().rename(columns={'income': 'tid_mean_income'})
train = train.merge(tid_mean, on='tid', how='left')

# One-Hot Encoding 
# train = one_hot_encoder(train, ohe_cols=['store', 'item', 'month']) 
#,'dayofmonth','weekofyear'

# Final train and val datasets

In [86]:
with open("tid_mean.pkl", 'wb') as f:
    pickle.dump(tid_mean, f)

In [87]:
url = 'http://weatherarchive.ru/Temperature/Moscow/{month}-{year}'
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']

years = [2022]

In [88]:
url.format(month=months[0], year=2022)

'http://weatherarchive.ru/Temperature/Moscow/January-2022'

In [89]:
def parse_table(table):
    res = {'temp': [],
           'wet': [],
           'p': [],
           'wind': []}
    
    tags = table.findAll('td')
    k = 0
    for tag in tags:
        if tag.find('a') is not None:
            continue
            
        if k == 0:
            k += 1
            res['temp'].append(float(tag.text.replace('°C', '').replace('+','').replace('−','-')))
        elif k == 1:
            k += 1
            res['wet'].append(float(tag.text.replace('%','')))
        elif k == 2:
            k += 1
            res['p'].append(int(tag.text))
        else:
            k = 0
            res['wind'].append(int(tag.text.replace(' м/с', '')))
    return res

def parse_url(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    tables = soup.findAll('table', class_='smart')
    for table in tables:
        if 'Среднесуточная' in str(table):
            return parse_table(table)

In [90]:
pd.DataFrame(parse_url(url.format(month=months[0], year=years[0]))).head()

Unnamed: 0,temp,wet,p,wind
0,-3.9,92.25,737,3
1,-7.8,82.25,744,4
2,-9.44,87.25,739,3
3,-9.28,87.75,739,2
4,-7.4,90.13,734,4


In [91]:
stats = {}
for year in years:
    stats[year] = {}
    for month in tqdm(months):
        stats[year][month] = parse_url(url.format(month=month, year=year))

weather = []
for i, (month, v) in enumerate(stats[2022].items()):
    i = i + 1
    for j, (temp, wet, p, wind) in enumerate(zip(v['temp'], v['wet'], v['p'], v['wind'])):
        j = j + 1
        si = '0' + str(i) if i < 10 else str(i)
        sj = '0' + str(j) if j < 10 else str(j)
        
        weather.append({'date': '2022-{}-{}'.format(si, sj),
                        'temp': temp,
                        'wet': wet,
                        'p': p,
                        'wind': wind})
weather = pd.DataFrame(weather)
weather['date'] = pd.to_datetime(weather['date'])
weather.head()

  0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,date,temp,wet,p,wind
0,2022-01-01,-3.9,92.25,737,3
1,2022-01-02,-7.8,82.25,744,4
2,2022-01-03,-9.44,87.25,739,3
3,2022-01-04,-9.28,87.75,739,2
4,2022-01-05,-7.4,90.13,734,4


In [92]:
train = train.merge(weather, on='date', how='left')
train.head()

Unnamed: 0,tid,date,income,train_or_test,is_holiday,dayofmonth,dayofweek,month,is_month_start,is_month_end,...,target_lag_28_ewm_0.7,target_lag_3_ewm_0.6,target_lag_7_ewm_0.6,target_lag_14_ewm_0.6,target_lag_28_ewm_0.6,tid_mean_income,temp,wet,p,wind
0,406136,2022-09-01,90000.0,train,False,1,3,9,1,0,...,,,,,,93770.491803,9.58,68.13,744,4
1,406136,2022-09-02,105000.0,train,False,2,4,9,0,0,...,,,,,,93770.491803,7.59,71.75,748,4
2,406136,2022-09-03,99000.0,train,False,3,5,9,0,0,...,,,,,,93770.491803,7.19,67.5,753,3
3,406136,2022-09-04,107000.0,train,False,4,6,9,0,0,...,,0.0,,,,93770.491803,7.91,65.5,753,2
4,406136,2022-09-05,110000.0,train,False,5,0,9,0,0,...,,0.0,,,,93770.491803,6.41,82.38,752,3


In [93]:
X_val = train.loc[train.train_or_test=='val', :].drop(columns=['train_or_test', 'income', 'target'])
X_train = train.loc[train.train_or_test=='train', :].drop(columns=['train_or_test', 'income', 'target'])
print('Train shape:{}, Val shape:{}'.format(X_train.shape, X_val.shape))

Train shape:(99430, 65), Val shape:(48900, 65)


In [94]:
def smape(preds, target):
    '''
    Function to calculate SMAPE
    '''
    n = len(preds)
    masked_arr = ~((preds==0)&(target==0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds-target)
    denom = np.abs(preds)+np.abs(target)
    smape_val = (200*np.sum(num/denom))/n
    return smape_val

In [95]:
cat_cols = ['tid']

params_cat = {'n_estimators' : 2000,
          'learning_rate': .02,
          'depth' : 6,
          'use_best_model': True,
          'cat_features' : cat_cols,
          # 'text_features': text_cols,
          # 'train_dir' : '/home/jovyan/work/catboost',
          # 'border_count' : 64,
          # 'l2_leaf_reg' : 1,
          # 'bagging_temperature' : 2,
          # 'rsm' : .1,
          # 'loss_function': 'MultiClass',
          # 'auto_class_weights' : 'Balanced', #try not balanced
          'random_state': 42,
          'eval_metric' : 'MAE'
         }

cat_model = CatBoostRegressor(**params_cat)

In [98]:
cat_model.fit(X_train, y_train, verbose=100, eval_set=(X_val, y_val), early_stopping_rounds=200, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 29190.8999941	test: 29452.5806813	best: 29452.5806813 (0)	total: 32.8ms	remaining: 32.8s
100:	learn: 12260.3142881	test: 14144.0951022	best: 14144.0951022 (100)	total: 3.56s	remaining: 31.7s
200:	learn: 11564.3852471	test: 14840.6180291	best: 14144.0951022 (100)	total: 7.05s	remaining: 28s
300:	learn: 11399.4969471	test: 15057.8464329	best: 14144.0951022 (100)	total: 10.4s	remaining: 24.2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 14144.0951
bestIteration = 100

Shrink model to first 101 iterations.


<catboost.core.CatBoostRegressor at 0x26d68c35400>

In [97]:
pred = cat_model.predict(X_val)
mean_absolute_error(y_val, pred), mean_squared_error(y_val, pred), smape(pred, y_val)

(14144.095103191252, 773301192.3743773, 33.951254058779774)

In [37]:
# mean prediction
tid_mean = train[train.train_or_test=='train'].groupby('tid')['income'].mean().reset_index()
mean_pred = X_val.merge(tid_mean, on='tid', how='left')['income']
mean_absolute_error(y_val, mean_pred), mean_squared_error(y_val, mean_pred), smape(mean_pred, y_val)

(13120.229307050186, 686655385.1269174, 31.177421272732385)

In [38]:
# cross val
dates = X_val['date'].unique()
preds = np.zeros(len(X_val))
for i in tqdm(range(len(dates))):
    train_mask = X_val['date'] < dates[i]
    pred_mask = X_val['date'] == dates[i]
    cat_model = CatBoostRegressor(**params_cat)
    cat_model.fit(pd.concat([X_train, X_val[train_mask]]),
                  np.concatenate([y_train, y_val[train_mask]]),
                  verbose=False,
                  eval_set=(X_val[~train_mask], y_val[~train_mask]), early_stopping_rounds=200)
    preds[pred_mask] = cat_model.predict(X_val[pred_mask])

  0%|          | 0/30 [00:00<?, ?it/s]

In [39]:
mean_absolute_error(y_val, preds), mean_squared_error(y_val, preds), smape(preds, y_val)

(13374.486875604181, 689079132.4015058, 32.33597192526486)

### Zero preds

In [149]:
terms = pd.read_excel('terminal_data_hackathon v4.xlsx')
data = pd.read_excel('terminal_data_hackathon v4.xlsx', 'Incomes')
df_unpivot = pd.melt(data, id_vars='TID', value_vars=data.columns[2:])
data = df_unpivot.sort_values(by=['TID', 'variable'])
data = data.rename(columns={'TID': 'tid', 'variable': 'date', 'value': 'income'})
data['date'] = pd.to_datetime(data['date'])
data['train_or_test'] = np.where(data['date'] >= '2022-11-01', 'val', 'train')
ru_holidays = holidays.RU()
data['is_holiday'] = data['date'].apply(lambda x: x in ru_holidays)
data['dayofmonth'] = data.date.dt.day
data['dayofweek'] = data.date.dt.dayofweek
data['month'] = data.date.dt.month
data['is_month_start'] = (data.date.dt.is_month_start).astype(int)
data['is_month_end'] = (data.date.dt.is_month_end).astype(int)
data['target'] = (data['income']==0)
url = 'http://weatherarchive.ru/Temperature/Moscow/{month}-{year}'
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']

years = [2022]

stats = {}
for year in years:
    stats[year] = {}
    for month in tqdm(months):
        stats[year][month] = parse_url(url.format(month=month, year=year))

weather = []
for i, (month, v) in enumerate(stats[2022].items()):
    i = i + 1
    for j, (temp, wet, p, wind) in enumerate(zip(v['temp'], v['wet'], v['p'], v['wind'])):
        j = j + 1
        si = '0' + str(i) if i < 10 else str(i)
        sj = '0' + str(j) if j < 10 else str(j)
        
        weather.append({'date': '2022-{}-{}'.format(si, sj),
                        'temp': temp,
                        'wet': wet,
                        'p': p,
                        'wind': wind})
        
train = data.copy()
y_val = train.loc[train.train_or_test=='val', 'target'].values.reshape((-1))
y_val_incomes = train.loc[train.train_or_test=='val', 'income'].values.reshape((-1))
y_train = train.loc[train.train_or_test=='train', 'target'].values.reshape((-1))
train.loc[train.train_or_test=='val', 'income'] = np.nan

train = create_sales_lag_feats(train, gpby_cols=['tid'], target_col='target', 
                               lags=[1, 7, 14, 28])

train = create_sales_rmean_feats(train, gpby_cols=['tid'], 
                                 target_col='target', windows=[1, 3, 7, 14, 28], 
                                 min_periods=1, win_type='triang')

train = create_sales_rmed_feats(train, gpby_cols=['tid'], 
                                 target_col='target', windows=[2, 3, 7, 14, 28], 
                                 min_periods=2, win_type=None)

train = create_sales_ewm_feats(train, gpby_cols=['tid'], 
                               target_col='target', 
                               alpha=[0.9, 0.7, 0.6], 
                               shift=[3, 7, 14, 28])

train = create_sales_lag_feats(train, gpby_cols=['tid'], target_col='income', 
                               lags=[1, 7, 14, 28])

train = create_sales_rmean_feats(train, gpby_cols=['tid'], 
                                 target_col='income', windows=[1, 3, 7, 14, 28], 
                                 min_periods=1, win_type='triang')

train = create_sales_rmed_feats(train, gpby_cols=['tid'], 
                                 target_col='income', windows=[2, 3, 7, 14, 28], 
                                 min_periods=2, win_type=None)

train = create_sales_ewm_feats(train, gpby_cols=['tid'], 
                               target_col='income', 
                               alpha=[0.9, 0.7, 0.6], 
                               shift=[3, 7, 14, 28])


weather = pd.DataFrame(weather)
weather['date'] = pd.to_datetime(weather['date'])
train = train.merge(weather, on='date', how='left')
X_val = train.loc[train.train_or_test=='val', :].drop(columns=['train_or_test', 'income', 'target'])
X_train = train.loc[train.train_or_test=='train', :].drop(columns=['train_or_test', 'income', 'target'])
X_train.head()

  0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,tid,date,is_holiday,dayofmonth,dayofweek,month,is_month_start,is_month_end,target_lag_1,target_lag_7,...,income_lag_14_ewm_0.7,income_lag_28_ewm_0.7,income_lag_3_ewm_0.6,income_lag_7_ewm_0.6,income_lag_14_ewm_0.6,income_lag_28_ewm_0.6,temp,wet,p,wind
0,406136,2022-09-01,False,1,3,9,1,0,,,...,,,,,,,9.58,68.13,744,4
1,406136,2022-09-02,False,2,4,9,0,0,0.0,,...,,,,,,,7.59,71.75,748,4
2,406136,2022-09-03,False,3,5,9,0,0,0.0,,...,,,,,,,7.19,67.5,753,3
3,406136,2022-09-04,False,4,6,9,0,0,0.0,,...,,,90000.0,,,,7.91,65.5,753,2
4,406136,2022-09-05,False,5,0,9,0,0,0.0,,...,,,100714.285714,,,,6.41,82.38,752,3


In [150]:
cat_cols = ['tid']

params_cat = {'n_estimators' : 1000,
          'learning_rate': .03,
          'depth' : 4,
          'use_best_model': True,
          'cat_features' : cat_cols,
          # 'text_features': text_cols,
          # 'train_dir' : '/home/jovyan/work/catboost',
          # 'border_count' : 64,
          # 'l2_leaf_reg' : 1,
          # 'bagging_temperature' : 2,
          # 'rsm' : .1,
          # 'loss_function': 'MultiClass',
        #'auto_class_weights' : 'Balanced', #try not balanced
          'random_state': 42,
          'eval_metric' : 'AUC'
         }

cat_model = CatBoostClassifier(**params_cat)
cat_model.fit(X_train, y_train, verbose=100, eval_set=(X_val, y_val), early_stopping_rounds=200, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8392137	best: 0.8392137 (0)	total: 87.5ms	remaining: 1m 27s
100:	test: 0.8909328	best: 0.8910930 (93)	total: 9.18s	remaining: 1m 21s
200:	test: 0.8913483	best: 0.8921291 (182)	total: 18.1s	remaining: 1m 12s
300:	test: 0.8924956	best: 0.8929350 (254)	total: 27s	remaining: 1m 2s
400:	test: 0.8905244	best: 0.8929350 (254)	total: 37.4s	remaining: 55.9s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8929349981
bestIteration = 254

Shrink model to first 255 iterations.


<catboost.core.CatBoostClassifier at 0x26d02504c10>

In [151]:
# find best threshold for predictions

tid_mean = train[train.train_or_test=='train'][train.income>0].groupby('tid')['income'].mean().reset_index()

with open('tid_mean.pkl', 'wb') as f:
    pickle.dump(tid_mean, f)
mean_pred = X_val.merge(tid_mean, on='tid', how='left')['income']
bst = 40
bst_thr = 0
for thr in tqdm(range(1000)):
    cur = thr/1000
    y_val_preds = cat_model.predict_proba(X_val)[:, 1]>cur
    mean_pred = X_val.merge(tid_mean, on='tid', how='left')['income']
    mean_pred[y_val_preds] = 0
    if smape(mean_pred, y_val_incomes) < bst:
        bst = smape(mean_pred, y_val_incomes)
        bst_thr = cur
mean_pred = X_val.merge(tid_mean, on='tid', how='left')['income']
y_val_preds = cat_model.predict_proba(X_val)[:, 1]>bst_thr
mean_pred[y_val_preds] = 0
print(bst_thr)
mean_absolute_error(y_val_incomes, mean_pred), mean_squared_error(y_val_incomes, mean_pred), smape(mean_pred, y_val_incomes)

  tid_mean = train[train.train_or_test=='train'][train.income>0].groupby('tid')['income'].mean().reset_index()


  0%|          | 0/1000 [00:00<?, ?it/s]

0.381


(12570.518976866993, 655470418.5027754, 27.845278479156192)

In [152]:
from sklearn.metrics import *
print("ACCURACY", accuracy_score(y_val, cat_model.predict(X_val)=='True'))
print("PRECISION", precision_score(y_val, cat_model.predict(X_val)=='True'))
print("RECALL", recall_score(y_val, cat_model.predict(X_val)=='True'))

ACCURACY 0.9526584867075665
PRECISION 0.7474654377880184
RECALL 0.28436185133239833


In [153]:
from catboost import Pool

fi = cat_model.get_feature_importance(Pool(X_val, y_val, cat_features=cat_cols),
                                      prettified=True)
msk = [i for i in range(len(fi)) if fi['Feature Id'].iloc[i] in ['temp', 'wet', 'p', 'wind']]
fi.iloc[msk]

Unnamed: 0,Feature Id,Importances
42,temp,0.231519
53,wet,0.018726
58,p,0.001574
63,wind,0.0


In [154]:
with open('catboost_zero.pkl', 'wb') as f:
    pickle.dump(cat_model, f)

In [None]:
# check inference script
!python predict.py

In [None]:
pd.read_csv('res.csv')