In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **IMPORTS**

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np

# **DATA**

In [3]:
sample_submission = pd.read_csv('/content/drive/MyDrive/SberMarketdata/sample_submission.csv') # submission
sample_submission.head(-1)

Unnamed: 0,id,target
0,0;133,0
1,0;5,1
2,0;10,0
3,0;396,1
4,0;14,0
...,...,...
790443,19998;57,0
790444,19998;26,0
790445,19998;31,0
790446,19998;29,1


In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/SberMarketdata/train.csv') # train_data
print(train_data.shape)
train_data.head(-1)


(3123064, 3)


Unnamed: 0,user_id,order_completed_at,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157
...,...,...,...
3123058,12702,2020-09-03 23:45:45,445
3123059,12702,2020-09-03 23:45:45,441
3123060,12702,2020-09-03 23:45:45,92
3123061,12702,2020-09-03 23:45:45,431


**Make data ready for model**

In [5]:
# make sparse matrix for categories
%%time
cart_matrix = pd.get_dummies(train_data, columns = ['cart'], prefix='', prefix_sep='', dtype='bool')
cart_matrix = cart_matrix.groupby(['user_id', 'order_completed_at']).any().reset_index()
# %%time
# # Creating a sparse matrix by category using pd.pivot_table
# cart_matrix = pd.pivot_table(train_data, index=['user_id', 'order_completed_at'], columns='cart', aggfunc='size', fill_value=0)
# cart_matrix = cart_matrix.reset_index()
# cart_matrix.head(3)

CPU times: user 57.8 s, sys: 4.75 s, total: 1min 2s
Wall time: 1min 3s


In [6]:
# Per-User Order Count: Adds a new order_number column that specifies the order number for each user.
cart_matrix['ordered'] = cart_matrix.groupby(['user_id']).cumcount()
cart_matrix.head(3)

Unnamed: 0,user_id,order_completed_at,0,1,2,3,4,5,6,7,...,872,873,874,875,876,877,878,879,880,ordered
0,0,2020-07-19 09:59:17,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,0,2020-08-24 08:55:32,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1
2,0,2020-09-02 07:38:25,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2


In [7]:
#delete order_completed_at
cart_matrix = cart_matrix.drop('order_completed_at', axis=1)

In [8]:
last_order = cart_matrix.groupby(['user_id'])['ordered'].transform(max) == cart_matrix['ordered']
train = cart_matrix[~last_order].groupby('user_id').sum().reset_index() #A training set, train, is created, in which rows corresponding to the last orders of users are removed (~last_order inverts the Boolean mask, selecting rows that are not the last orders)
valid = cart_matrix[last_order].reset_index(drop=True) #A validating set, which including only rows corresponding to users' most recent orders.
last_order.head(3)

0    False
1    False
2     True
Name: ordered, dtype: bool

In [9]:
train_melt = pd.melt(train, id_vars=['user_id'], var_name='category', value_name='ordered')
valid_melt = pd.melt(valid, id_vars=['user_id'], var_name='category', value_name='target')

  train_melt = pd.melt(train, id_vars=['user_id'], var_name='category', value_name='ordered')


In [10]:
Train = train_melt.copy()
Train.head(3) #trainig set

Unnamed: 0,user_id,category,ordered
0,0,0,0
1,1,0,0
2,2,0,1


In [11]:
valid_melt.head(3) # validating set

Unnamed: 0,user_id,category,target
0,0,0,False
1,1,0,False
2,2,0,False


In [12]:
# user_id / category as in submission file
Train['id'] = Train['user_id'].astype(str) + ';' + Train['category']

# target variable (the last known purchase)
Train['target'] = valid_melt['target'].astype(int)

In [13]:
Train.head(3)

Unnamed: 0,user_id,category,ordered,id,target
0,0,0,0,0;0,0
1,1,0,0,1;0,0
2,2,0,1,2;0,0


In [14]:
order_number = valid[['user_id', 'ordered']].set_index('user_id')['ordered'].squeeze()
Train['total_orders'] = Train['user_id'].map(order_number)
Train.head(3)

Unnamed: 0,user_id,category,ordered,id,target,total_orders
0,0,0,0,0;0,0,2
1,1,0,0,1;0,0,8
2,2,0,1,2;0,0,14


In [15]:
Train['rating'] = Train['ordered']/Train['total_orders']
Train.head(3)

Unnamed: 0,user_id,category,ordered,id,target,total_orders,rating
0,0,0,0,0;0,0,2,0.0
1,1,0,0,1;0,0,8,0.0
2,2,0,1,2;0,0,14,0.071429


In [16]:
#remove those users/categories who are not represented in the submission file
Train = Train[Train.id.isin(sample_submission.id.unique())].reset_index(drop=True)
#Check
print((Train.sort_values('id')['id'].values == Train.sort_values('id')['id'].values).all())

True


In [17]:
#purchase counter by all user (for represetned users)
total_ordered = Train.groupby('category')['ordered'].sum()
Train['total_ordered'] = Train['category'].map(total_ordered)
print(Train.head(3))

   user_id category  ordered   id  target  total_orders    rating  \
0        7        0        0  7;0       1            10  0.000000   
1        8        0        1  8;0       0             7  0.142857   
2        9        0        1  9;0       0            45  0.022222   

   total_ordered  
0          12922  
1          12922  
2          12922  


In [18]:
print(f'Train.shape: {Train.shape} \n {Train.head(3)} \n')
print(f'valid_melt.shape: {valid_melt.shape} \n {valid_melt.head(3)} \n')

Train.shape: (790449, 8) 
    user_id category  ordered   id  target  total_orders    rating  \
0        7        0        0  7;0       1            10  0.000000   
1        8        0        1  8;0       0             7  0.142857   
2        9        0        1  9;0       0            45  0.022222   

   total_ordered  
0          12922  
1          12922  
2          12922   

valid_melt.shape: (17640000, 3) 
    user_id category target
0        0        0  False
1        1        0  False
2        2        0  False 



# **Train and Valid sets**

In [19]:
# X = Train.drop(['target', 'user_id', 'category'], axis=1)
# X = pd.get_dummies(X, columns=['id'])
# y = Train.target
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=None)

Train_set, Valid_set = train_test_split(Train, test_size=0.2, random_state=42, stratify=None) # split Train for train_set and valid_set


# **AUTOML**

we have big dataset and task of binary classification. I think is better decision use automl

In [20]:
%pip install -U lightautoml



In [21]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [22]:
Valid_set

Unnamed: 0,user_id,category,ordered,id,target,total_orders,rating,total_ordered
349792,9954,160,1,9954;160,0,9,0.111111,2804
239817,851,84,1,851;84,0,6,0.166667,48247
49003,17736,15,1,17736;15,0,8,0.125000,18226
535727,13596,392,1,13596;392,0,7,0.142857,11228
776699,10856,812,1,10856;812,0,6,0.166667,7726
...,...,...,...,...,...,...,...,...
335425,15758,149,1,15758;149,0,8,0.125000,5078
415053,10168,228,1,10168;228,0,5,0.200000,457
295712,5047,100,1,5047;100,0,8,0.125000,16889
741069,477,798,2,477;798,0,7,0.285714,15579


In [23]:
%%time
def f1 (real, pred, **kwargs):
    return f1_score(real, (pred > 0.5).astype(int), **kwargs)

roles = {'target': 'target', 'drop': ['user_id', 'category', 'id']}
task = Task('binary', metric = f1)

automl = TabularAutoML(task = task,
                       timeout = 300,
                       cpu_limit = 4,
                       reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 17},
                       general_params = {'use_algos': [['linear_l2']]},
                      )

train_pred = automl.fit_predict(Train_set, roles = roles)
print('Score', "%.5f" % f1(Train_set.target, train_pred.data))

valid_pred = automl.predict(Valid_set)
print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: binary

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 300.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (632359, 8)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 286.63 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [], 'embed_sizes': (), 'data_size': 6}
INFO2:lightautoml.ml

Score 0.57575
Score on out of folds validation 0.57544
CPU times: user 34.6 s, sys: 679 ms, total: 35.3 s
Wall time: 46.3 s


Let's fit model

In [24]:
# X_train = train_set.drop(['target', 'id', 'total_ordered'], axis=1)
# y_train = train_set['target']
# print(f'X_train: \n {X_train.head(3)} \n')
# print(f'y_train: \n {y_train.head(3)} \n')
# print(f'X_train.shape: {X_train.shape} \n train_set.shape: {train_set.shape}')

In [24]:
best_score = 0
for i in np.arange(0.01, 1.0, 0.01):
    score = f1 = f1_score(Valid_set.target, (valid_pred.data > i).astype(int))
    if score > best_score:
        best_score = score
        proba_split = i

print('At i =', "%.2f" % proba_split,'score is : ' "%.5f" % best_score)

At i = 0.29 score is : 0.62031


In [25]:
Test = Train.copy() #copy Train in Test

#increment counter
Test['total_orders'] += 1

#add last purchase
Test['ordered'] = Test['ordered'] + Test['target']

#recalculate including last order
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)

#recalculate including last order
Test['rating'] = Test['ordered'] / Test['total_orders']

Test = Test.drop('target', axis=1)
Test.head(3)

Unnamed: 0,user_id,category,ordered,id,total_orders,rating,total_ordered
0,7,0,1,7;0,11,0.090909,14190
1,8,0,1,8;0,8,0.125,14190
2,9,0,1,9;0,46,0.021739,14190


In [26]:
# Test = Test.drop(['category', 'id', 'user_id'], axis=1)
y_submission = automl.predict(Test)

In [27]:
y_submission

array([[0.07309611],
       [0.08588614],
       [0.02961768],
       ...,
       [0.11730985],
       [0.09586677],
       [0.08511665]], dtype=float32)

In [31]:
th = 0.5
train_mean = Train.target.mean()
test_mean = (y_submission.data > th).astype(int).mean()

while test_mean < train_mean:
    th -= 0.005
    test_mean = (y_submission.data > th).astype(int).mean()

print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)


Threshold: 0.2450
Train mean: 0.23596
New Test mean: 0.23608


In [32]:
Test['target'] = (y_submission.data > th).astype(int)
submit = pd.merge(sample_submission['id'], Test[['id', 'target']], on='id')
submit.to_csv('submission.csv', index=False)

# **LightGBM**

In [21]:
y_Train = Train_set.target
X_Train = Train_set.drop(['target'], axis=1)
y_Valid = Valid_set.target
X_Valid = Valid_set.drop(['target'], axis=1)
print(y_Train.head(3), '\n')
print(X_Train.head(3), '\n')
print(y_Valid.head(3), '\n')
print(X_Valid.head(3), '\n')

188985    1
282456    0
714566    0
Name: target, dtype: int64 

        user_id category  ordered        id  total_orders    rating  \
188985    11873       57       10  11873;57            16  0.625000   
282456     4940       92        1   4940;92            22  0.045455   
714566     3896      443        1  3896;443            25  0.040000   

        total_ordered  
188985          77000  
282456          17148  
714566          10430   

349792    0
239817    0
49003     0
Name: target, dtype: int64 

        user_id category  ordered        id  total_orders    rating  \
349792     9954      160        1  9954;160             9  0.111111   
239817      851       84        1    851;84             6  0.166667   
49003     17736       15        1  17736;15             8  0.125000   

        total_ordered  
349792           2804  
239817          48247  
49003           18226   



In [None]:
import lightgbm as lgb

Train_set = pd.get_dummies(Train_set, columns=['category', 'id'])

X = Train_set.drop('target', axis=1)
y = Train_set['target']

# params
params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 20
}

# create lightgbm
d_train = lgb.Dataset(X, label=y)
d_test = lgb.Dataset(X, label=y, reference=d_train)

# train the model
gbm = lgb.train(params, d_train, num_boost_round=100, valid_sets=[d_test], early_stopping_rounds=10)

# Score
y_pred = gbm.predict(X)
score = f1_score(y_true=y, y_pred=y_pred)
print(score)
# score = roc_auc_score(y_test, y_pred)
# print('AUC score:', score)

# # Сохранение модели
# gbm.save_model('model.txt')

# # Загрузка сохраненной модели
# bst = lgb.Booster(model_file='model.txt')

# # Предсказание на новых данных
# new_data = pd.read_csv('new_data.csv')
# new_pred = bst.predict(new_data)


In [None]:
Test = Train.copy()

#increment counter
Test['total_orders'] += 1

#add last purchase
Test['ordered'] = Test['ordered'] + Test['target']

#recalculate including last order
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)

#recalculate including last order
Test['rating'] = Test['ordered'] / Test['total_orders']

Test = Test.drop(['target'], axis=1)
Test.head(3)

In [None]:
y_submission = gbm.predict(Test)
y_submission