In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **IMPORTS**

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np

# **DATA**

In [3]:
sample_submission = pd.read_csv('/content/drive/MyDrive/SberMarketdata/sample_submission.csv') # submission
sample_submission.head(-1)

Unnamed: 0,id,target
0,0;133,0
1,0;5,1
2,0;10,0
3,0;396,1
4,0;14,0
...,...,...
790443,19998;57,0
790444,19998;26,0
790445,19998;31,0
790446,19998;29,1


In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/SberMarketdata/train.csv') # train_data
print(train_data.shape)
train_data.head(-1)

(3123064, 3)


Unnamed: 0,user_id,order_completed_at,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157
...,...,...,...
3123058,12702,2020-09-03 23:45:45,445
3123059,12702,2020-09-03 23:45:45,441
3123060,12702,2020-09-03 23:45:45,92
3123061,12702,2020-09-03 23:45:45,431


**Make data ready for model**

In [5]:
# make sparse matrix for categories
%%time
cart_matrix = pd.get_dummies(train_data, columns = ['cart'], prefix='', prefix_sep='', dtype='bool')
cart_matrix = cart_matrix.groupby(['user_id', 'order_completed_at']).any().reset_index()
# %%time
# # Creating a sparse matrix by category using pd.pivot_table
# cart_matrix = pd.pivot_table(train_data, index=['user_id', 'order_completed_at'], columns='cart', aggfunc='size', fill_value=0)
# cart_matrix = cart_matrix.reset_index()
# cart_matrix.head(3)

CPU times: user 56.3 s, sys: 3.07 s, total: 59.4 s
Wall time: 1min 4s


In [6]:
# Per-User Order Count: Adds a new order_number column that specifies the order number for each user.
cart_matrix['ordered'] = cart_matrix.groupby(['user_id']).cumcount()
cart_matrix.describe()

Unnamed: 0,user_id,ordered
count,209406.0,209406.0
mean,7649.46078,11.50393
std,5427.748315,16.004626
min,0.0,0.0
25%,3161.0,2.0
50%,6576.0,6.0
75%,11808.0,14.0
max,19999.0,212.0


In [7]:
#delete order_completed_at
cart_matrix = cart_matrix.drop('order_completed_at', axis=1)

In [8]:
last_order = cart_matrix.groupby(['user_id'])['ordered'].transform(max) == cart_matrix['ordered']
train = cart_matrix[~last_order].groupby('user_id').sum().reset_index() #A training set, train, is created, in which rows corresponding to the last orders of users are removed (~last_order inverts the Boolean mask, selecting rows that are not the last orders)
valid = cart_matrix[last_order].reset_index(drop=True) #A validating set, which including only rows corresponding to users' most recent orders.
last_order.head(3)

0    False
1    False
2     True
Name: ordered, dtype: bool

In [9]:
train_melt = pd.melt(train, id_vars=['user_id'], var_name='category', value_name='ordered')
valid_melt = pd.melt(valid, id_vars=['user_id'], var_name='category', value_name='target')

  train_melt = pd.melt(train, id_vars=['user_id'], var_name='category', value_name='ordered')


In [10]:
Train = train_melt.copy()
Train.head(3) #trainig set

Unnamed: 0,user_id,category,ordered
0,0,0,0
1,1,0,0
2,2,0,1


In [11]:
valid_melt.head(3) # validating set

Unnamed: 0,user_id,category,target
0,0,0,False
1,1,0,False
2,2,0,False


In [12]:
# user_id / category as in submission file
Train['id'] = Train['user_id'].astype(str) + ';' + Train['category']

# target variable (the last known purchase)
Train['target'] = valid_melt['target'].astype(int)

In [13]:
Train.head(3)

Unnamed: 0,user_id,category,ordered,id,target
0,0,0,0,0;0,0
1,1,0,0,1;0,0
2,2,0,1,2;0,0


In [14]:
order_number = valid[['user_id', 'ordered']].set_index('user_id')['ordered'].squeeze()
Train['total_orders'] = Train['user_id'].map(order_number)
Train.head(3)
Train['rating'] = Train['ordered']/Train['total_orders']
Train.head(3)

Unnamed: 0,user_id,category,ordered,id,target,total_orders,rating
0,0,0,0,0;0,0,2,0.0
1,1,0,0,1;0,0,8,0.0
2,2,0,1,2;0,0,14,0.071429


In [15]:
#remove those users/categories who are not represented in the submission file
Train = Train[Train.id.isin(sample_submission.id.unique())].reset_index(drop=True)
#Check
print((Train.sort_values('id')['id'].values == Train.sort_values('id')['id'].values).all())

True


In [16]:
#purchase counter by all user (for represetned users)
total_ordered = Train.groupby('category')['ordered'].sum()
Train['total_ordered'] = Train['category'].map(total_ordered)
print(Train.head(3))

   user_id category  ordered   id  target  total_orders    rating  \
0        7        0        0  7;0       1            10  0.000000   
1        8        0        1  8;0       0             7  0.142857   
2        9        0        1  9;0       0            45  0.022222   

   total_ordered  
0          12922  
1          12922  
2          12922  


In [17]:
Train.dtypes

user_id            int64
category          object
ordered            int64
id                object
target             int64
total_orders       int64
rating           float64
total_ordered      int64
dtype: object

# **DL/NN**

In [18]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

*I will train the model on GPU*

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
%%time
Train['category'] = Train['category'].astype('category').cat.codes
#X = pd.get_dummies(Train[['user_id', 'category', 'total_orders', 'rating']])
X = Train[['user_id', 'category', 'total_orders', 'rating']]
y = Train['target']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# make data2tensor
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# create DataLoader and train/valid sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# create the model
class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.layer1 = nn.Linear(X_train.shape[1], 32)
    self.layer2 = nn.Linear(32, 16)
    self.layer3 = nn.Linear(16, 1)

  def forward(self, x):
    x = torch.relu(self.layer1(x))
    x = torch.relu(self.layer2(x))
    x = torch.sigmoid(self.layer3(x))
    return x

model_dl = Net().to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model_dl.parameters(), lr=0.001)

#train the model
for epoch in tqdm(range(10)):
  running_loss = 0.0
  for inputs, labels in train_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      optimizer.zero_grad()
      out = model_dl(inputs)
      loss = criterion(out, labels)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
  print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

print('Training successful finish')

#model evalution
model_dl.eval()
predictions = []
true_labels = []

with torch.no_grad():
  for inputs, labels in test_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      out = model_dl(inputs)
      predicted = (out > 0.5).float().cpu().numpy()
      predictions.extend(predicted.flatten())
      true_labels.extend(labels.cpu().numpy().flatten())

#f1_score
f1 = f1_score(true_labels, predictions)
print(f'F1: {f1}')

 10%|█         | 1/10 [00:40<06:04, 40.51s/it]

Epoch 1, Loss: 10.583328895983287


 20%|██        | 2/10 [01:19<05:17, 39.65s/it]

Epoch 2, Loss: 0.5388499779952657


 30%|███       | 3/10 [01:58<04:36, 39.49s/it]

Epoch 3, Loss: 0.5301125542131673


 40%|████      | 4/10 [02:38<03:56, 39.42s/it]

Epoch 4, Loss: 0.5268253632211791


 50%|█████     | 5/10 [03:18<03:17, 39.57s/it]

Epoch 5, Loss: 0.5298356364226006


 60%|██████    | 6/10 [03:57<02:37, 39.50s/it]

Epoch 6, Loss: 0.5297852286317005


 70%|███████   | 7/10 [04:36<01:58, 39.43s/it]

Epoch 7, Loss: 0.5283291821662394


 80%|████████  | 8/10 [05:15<01:18, 39.35s/it]

Epoch 8, Loss: 0.5273175920709378


 90%|█████████ | 9/10 [05:54<00:39, 39.26s/it]

Epoch 9, Loss: 0.5245253068723246


100%|██████████| 10/10 [06:36<00:00, 39.67s/it]

Epoch 10, Loss: 0.5201952677016894
Training successful finish





F1: 0.03633442181101954
CPU times: user 6min 24s, sys: 10.3 s, total: 6min 35s
Wall time: 6min 48s


In [21]:
Test = Train.copy() #copy Train in Test

#increment counter
Test['total_orders'] += 1

#add last purchase
Test['ordered'] = Test['ordered'] + Test['target']

#recalculate including last order
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)

#recalculate including last order
Test['rating'] = Test['ordered'] / Test['total_orders']

Test = Test.drop('target', axis=1)
print(Test.head(3), '\n', Test.dtypes, '\n', Test.shape, '\n')

Test['category'] = Test['category'].astype('category').cat.codes
id = Test['id']
Test = Test[['user_id', 'category', 'total_orders', 'rating']]
#Test = pd.get_dummies(Test[['user_id', 'category', 'total_orders', 'rating']])
print(Test.head(3), '\n', Test.dtypes, '\n', Test.shape, '\n')

   user_id  category  ordered   id  total_orders    rating  total_ordered
0        7         0        1  7;0            11  0.090909          14190
1        8         0        1  8;0             8  0.125000          14190
2        9         0        1  9;0            46  0.021739          14190 
 user_id            int64
category           int16
ordered            int64
id                object
total_orders       int64
rating           float64
total_ordered      int64
dtype: object 
 (790449, 7) 

   user_id  category  total_orders    rating
0        7         0            11  0.090909
1        8         0             8  0.125000
2        9         0            46  0.021739 
 user_id           int64
category          int16
total_orders      int64
rating          float64
dtype: object 
 (790449, 4) 



In [23]:
sub_tensor = torch.tensor(Test.values, dtype=torch.float32)
#model predict on Test data
model_dl.eval()
inputs = torch.tensor(sub_tensor, dtype=torch.float32)

with torch.no_grad():
    predictions_sub = model_dl(inputs.to(device))

predictions_sub = (predictions_sub > 0.5).float().cpu().numpy()

print(predictions_sub, len(predictions_sub))

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]] 790449


  inputs = torch.tensor(sub_tensor, dtype=torch.float32)


In [35]:
predictions_sub_flat = predictions_sub.flatten()
predictions_series = pd.Series(predictions_sub_flat)
df_sub = pd.concat([id, predictions_series], axis=1)
df_sub = df_sub.rename(columns={0: 'target'})
df_sub['target'] = df_sub['target'].astype(int)
df_sub

Unnamed: 0,id,target
0,7;0,0
1,8;0,0
2,9;0,0
3,12;0,0
4,13;0,0
...,...,...
790444,3238;880,0
790445,4816;880,0
790446,10280;880,0
790447,13281;880,0


In [36]:
submit = pd.merge(sample_submission['id'], df_sub[['id', 'target']], on='id')
submit.to_csv('submission_dl.csv', index=False)
print(submit.head(3))

      id  target
0  0;133       0
1    0;5       0
2   0;10       0


In [34]:
submit.target.sum()

5458.0

# **AUTOML**

In [18]:
Train_set, Valid_set = train_test_split(Train, test_size=0.2, random_state=42, stratify=None) # split Train for train_set and valid_set

we have big dataset and task of binary classification. I think is better decision use automl

In [41]:
%pip install -U lightautoml

Collecting lightautoml
  Using cached lightautoml-0.3.8.1-py3-none-any.whl (416 kB)
Collecting autowoe>=1.2 (from lightautoml)
  Using cached AutoWoE-1.3.2-py3-none-any.whl (215 kB)
Collecting catboost>=0.26.1 (from lightautoml)
  Using cached catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
Collecting cmaes (from lightautoml)
  Using cached cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting joblib<1.3.0 (from lightautoml)
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting json2html (from lightautoml)
  Using cached json2html-1.3.0.tar.gz (7.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lightgbm<=3.2.1,>=2.3 (from lightautoml)
  Using cached lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
Collecting optuna (from lightautoml)
  Using cached optuna-3.5.0-py3-none-any.whl (413 kB)
Collecting poetry-core<2.0.0,>=1.0.0 (from lightautoml)
  Using cached poetry_core-1.9.0-py3-none-any.whl (309 kB)
Collecting statsmodels<=0.14.0 (from lig

In [19]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [20]:
Valid_set

Unnamed: 0,user_id,category,ordered,id,target,total_orders,rating,total_ordered
349792,9954,160,1,9954;160,0,9,0.111111,2804
239817,851,84,1,851;84,0,6,0.166667,48247
49003,17736,15,1,17736;15,0,8,0.125000,18226
535727,13596,392,1,13596;392,0,7,0.142857,11228
776699,10856,812,1,10856;812,0,6,0.166667,7726
...,...,...,...,...,...,...,...,...
335425,15758,149,1,15758;149,0,8,0.125000,5078
415053,10168,228,1,10168;228,0,5,0.200000,457
295712,5047,100,1,5047;100,0,8,0.125000,16889
741069,477,798,2,477;798,0,7,0.285714,15579


In [30]:
%%time
def f1 (real, pred, **kwargs):
    return f1_score(real, (pred > 0.5).astype(int), **kwargs)

roles = {'target': 'target', 'drop': ['user_id', 'category', 'id']}
task = Task('binary', metric = f1)

automl = TabularAutoML(task=task,
                       timeout=300,
                       cpu_limit=4,
                       reader_params={'n_jobs': 4, 'cv': 5, 'random_state': 17},
                       general_params={'use_algos': ['linear_l2', 'xgboost', 'lightgbm', 'catboost']},
                       tuning_params={'n_trials': 100, 'max_iter': 100}
                      )

train_pred = automl.fit_predict(Train_set, roles = roles)
print('Score', "%.5f" % f1(Train_set.target, train_pred.data))

valid_pred = automl.predict(Valid_set)
print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: binary

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 300.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (632359, 8)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 291.61 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [], 'embed_sizes': (), 'data_size': 6}
INFO2:lightautoml.ml

Score 0.57575
Score on out of folds validation 0.57544
CPU times: user 22 s, sys: 400 ms, total: 22.4 s
Wall time: 24 s


Let's fit model

In [None]:
# X_train = train_set.drop(['target', 'id', 'total_ordered'], axis=1)
# y_train = train_set['target']
# print(f'X_train: \n {X_train.head(3)} \n')
# print(f'y_train: \n {y_train.head(3)} \n')
# print(f'X_train.shape: {X_train.shape} \n train_set.shape: {train_set.shape}')

In [31]:
best_score = 0
for i in np.arange(0.01, 1.0, 0.01):
    score = f1 = f1_score(Valid_set.target, (valid_pred.data > i).astype(int))
    if score > best_score:
        best_score = score
        proba_split = i

print('At i =', "%.2f" % proba_split,'score is : ' "%.5f" % best_score)

At i = 0.29 score is : 0.62031


In [32]:
Test = Train.copy() #copy Train in Test

#increment counter
Test['total_orders'] += 1

#add last purchase
Test['ordered'] = Test['ordered'] + Test['target']

#recalculate including last order
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)

#recalculate including last order
Test['rating'] = Test['ordered'] / Test['total_orders']

Test = Test.drop('target', axis=1)
Test.head(3)

Unnamed: 0,user_id,category,ordered,id,total_orders,rating,total_ordered
0,7,0,1,7;0,11,0.090909,14190
1,8,0,1,8;0,8,0.125,14190
2,9,0,1,9;0,46,0.021739,14190


In [33]:
# Test = Test.drop(['category', 'id', 'user_id'], axis=1)
y_submission = automl.predict(Test)

In [34]:
y_submission

array([[0.07309611],
       [0.08588614],
       [0.02961768],
       ...,
       [0.11730985],
       [0.09586677],
       [0.08511665]], dtype=float32)

In [35]:
th = 0.5
train_mean = Train.target.mean()
test_mean = (y_submission.data > th).astype(int).mean()

while test_mean < train_mean:
    th -= 0.005
    test_mean = (y_submission.data > th).astype(int).mean()

print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)


Threshold: 0.2450
Train mean: 0.23596
New Test mean: 0.23608


In [36]:
Test['target'] = (y_submission.data > th).astype(int)
submit = pd.merge(sample_submission['id'], Test[['id', 'target']], on='id')
submit.to_csv('submission_automl.csv', index=False)
print(submit.head(3))

      id  target
0  0;133       0
1    0;5       0
2   0;10       0
