In [27]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb


In [2]:
# Relative paths of every entry in the ``clickstream`` directory, ordered by name.
files = ['clickstream/' + shard_name for shard_name in sorted(os.listdir('clickstream'))]

In [3]:
def _split_chunk(chunk, k, hist):
    '''Split one chunk into ((X_train, y_train), (X_test, y_test)) using the ``num`` column.'''
    train_inds = (chunk.num >= k) & (chunk.num <= k + hist)
    test_inds = (chunk.num == k - 1)

    train_part = (chunk[train_inds], chunk.loc[train_inds, 'multi_class_target'])
    test_part = (chunk[test_inds], chunk.loc[test_inds, 'multi_class_target'])
    return train_part, test_part


def get_train_test(k=1, hist=10, path='data_for_model.csv', chunk_size=10 ** 6, n_chunks=6):
    '''
    Read the modelling table chunk by chunk and split each chunk into train and test rows.

    Rows are selected by the ``num`` column (presumably the position of a session
    counted from the end of a client's history -- confirm against the code that
    builds the table):

    * train rows: ``k <= num <= k + hist`` (both bounds inclusive);
    * test rows:  ``num == k - 1``.

    The rows to be predicted for the final answer correspond to ``k = 1``;
    ``k = 2`` (or larger) can be used for validation.

    Parameters
    ----------
    k : int
        Offset that selects which rows form the test part (see above).
    hist : int
        Width of the ``num`` window that forms the train part.
    path : str
        CSV file to read. The first line must be a header that contains at least
        the ``num`` and ``multi_class_target`` columns.
    chunk_size : int
        Number of file lines handled per chunk.
    n_chunks : int
        Maximum number of chunks to read; lines beyond ``n_chunks * chunk_size``
        are ignored. Reading stops earlier if the file runs out of lines.

    Returns
    -------
    trains, tests : list of (DataFrame, Series)
        One ``(X, y)`` pair per chunk that was read. ``X`` keeps all columns of
        the table (including ``multi_class_target``); ``y`` is the
        ``multi_class_target`` column of the same rows.
    '''
    trains = []
    tests = []
    columns = None

    for i in range(n_chunks):
        if i == 0:
            # The first chunk owns the header line, so it holds one data row less:
            # header + (chunk_size - 1) rows == chunk_size file lines. This keeps
            # the ``skiprows`` offsets of the following chunks aligned.
            chunk = pd.read_csv(path, nrows=chunk_size - 1)
            columns = chunk.columns
        else:
            print(i)
            try:
                chunk = pd.read_csv(path, nrows=chunk_size, skiprows=i * chunk_size, header=None)
            except pd.errors.EmptyDataError:
                # Nothing left after skipping: the file is shorter than n_chunks chunks.
                break
            chunk.columns = columns

        train_part, test_part = _split_chunk(chunk, k, hist)
        trains.append(train_part)
        tests.append(test_part)

        # Release the raw chunk before the next one is loaded.
        del chunk

    return trains, tests

In [4]:
# Build the validation split: per the get_train_test docstring, k=2 is the
# hold-out setting, while k=1 selects the rows that finally have to be predicted.
trains, tests = get_train_test(k=2, hist=10)

1
2
3
4
5


In [5]:
# Stitch the per-chunk training pieces back into a single frame / target series.
X_train = pd.concat([features for features, _ in trains])
y_train = pd.concat([target for _, target in trains])

In [6]:
# Quick look at the assembled training frame: (rows, columns).
X_train.shape

(769204, 108)

In [7]:
# Same stitching for the hold-out pieces.
X_test = pd.concat([features for features, _ in tests])
y_test = pd.concat([target for _, target in tests])

In [8]:
# Load the clickstream shards and keep only the sessions that occur in train/test;
# the filtered shards are merged into the main frames in the following cells.
valid_sessions = pd.concat([X_train.session_id, X_test.session_id])

cs_data = []
for file in files:
    # Only the file name is taken from the listing; the shard itself is read as CSV from Datasets/.
    shard = pd.read_csv("Datasets/" + file.split('/')[1])
    cs_data.append(shard.loc[shard['session_id_y'].isin(valid_sessions)])
    del shard

    print(file)

clickstream/part-00000.parquet
clickstream/part-00001.parquet
clickstream/part-00002.parquet
clickstream/part-00003.parquet
clickstream/part-00004.parquet
clickstream/part-00005.parquet
clickstream/part-00006.parquet
clickstream/part-00007.parquet
clickstream/part-00008.parquet
clickstream/part-00009.parquet


In [9]:
# Attach the clickstream features to the training rows (left join on the session id).
X_train = X_train.merge(
    pd.concat(cs_data),
    how='left',
    left_on='session_id',
    right_on='session_id_y',
)

In [10]:
# Attach the clickstream features to the hold-out rows (left join on the session id).
X_test = X_test.merge(
    pd.concat(cs_data),
    how='left',
    left_on='session_id',
    right_on='session_id_y',
)

In [11]:
# Encode time zones: map every time zone to the number of training rows that carry it.

tz_counts = X_train['timezone'].value_counts()
tz_map = tz_counts.to_dict()

In [12]:
# Frequency-encode the time zone with the counts learned on the training rows.
# Each frame must be encoded from its OWN 'timezone' column; a time zone that
# never occurs in training maps to NaN.
X_train['tz_int'] = X_train['timezone'].map(tz_map)
X_test['tz_int'] = X_test['timezone'].map(tz_map)

In [13]:
# Columns removed before fitting: identifiers, timestamps, the target itself,
# helper columns and the ten ohe_0 .. ohe_9 columns.
cols_to_drop = [
    'session_id', 'client_pin', 'timestamp_x', 'multi_class_target',
    'mobile', 'prev_session', 'prev_session1', 'num', 'session_id_y',
    'timestamp_y', 'client', 'timezone', 'tz_int',
    *('ohe_' + str(i) for i in range(10)),
]

In [14]:
# Keep only the model features in both frames.
X_train = X_train.drop(cols_to_drop, axis=1)
X_test = X_test.drop(cols_to_drop, axis=1)

In [15]:
def eval_metric(y_true, y_pred):
    """Macro-averaged F1 as a custom LightGBM evaluation metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Per-class scores, either as a 2-D array of shape (n_samples, n_classes)
        or as a flat 1-D array grouped by class first and by row second
        (``y_pred[j * n_samples + i]`` is the score of row ``i`` for class ``j``).

    Returns
    -------
    tuple
        ``(name, value, is_higher_better)`` -- the triple LightGBM expects from
        a custom metric.
    """
    scores = np.asarray(y_pred)
    if scores.ndim == 2:
        # One row per sample, one column per class.
        labels = scores.argmax(axis=1)
    else:
        # Flat class-major layout: fold it into (n_classes, n_samples); the number
        # of classes follows from the number of samples instead of being fixed.
        labels = scores.reshape(-1, len(y_true)).argmax(axis=0)
    # F1 is a score, not a loss: larger values are better.
    return 'eval', f1_score(y_true, labels, average='macro'), True

In [16]:
# Class weights: inverse of (share of the class in y_train + 0.2).
class_share = y_train.value_counts() / len(y_train)
weights = (1 / (class_share + 0.2)).to_dict()

In [17]:
# Show the computed per-class weights.
weights

{5: 1.574673357287678,
 9: 2.796233029840144,
 3: 2.994866859159448,
 2: 3.984918312707741,
 8: 4.005753423516129,
 6: 4.071288461884533,
 7: 4.088991993229702,
 1: 4.194366316992548,
 0: 4.262443699684363,
 4: 4.7703520317922585}

In [18]:
# Hand-picked overrides for some of the computed class weights.
weights.update({
    3: 1.8,
    5: 1.6,
    9: 2.9,
    1: 3,
    7: 3,
    2: 4.5,
    8: 4.1,
})

In [19]:
cols_to_drop = []

# Model hyper-parameters; class_weight carries the per-class weights built above.
lgbm_params = dict(
    max_depth=6,
    num_leaves=50,
    n_estimators=200,
    learning_rate=0.07,
    random_state=50,
    subsample=0.7,
    subsample_freq=1,
    class_weight=weights,
)
clf = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training rows and report the custom metric on the hold-out rows after every iteration.
clf.fit(
    X_train.values,
    y_train,
    eval_set=(X_test.values, y_test),
    eval_metric=eval_metric,
)

[1]	valid_0's multi_logloss: 1.81098	valid_0's eval: 0.0601297
[2]	valid_0's multi_logloss: 1.76419	valid_0's eval: 0.0601297
[3]	valid_0's multi_logloss: 1.72464	valid_0's eval: 0.071903
[4]	valid_0's multi_logloss: 1.69059	valid_0's eval: 0.0944657
[5]	valid_0's multi_logloss: 1.66062	valid_0's eval: 0.139286
[6]	valid_0's multi_logloss: 1.63412	valid_0's eval: 0.191751
[7]	valid_0's multi_logloss: 1.61048	valid_0's eval: 0.232293
[8]	valid_0's multi_logloss: 1.58917	valid_0's eval: 0.266197
[9]	valid_0's multi_logloss: 1.5699	valid_0's eval: 0.295501
[10]	valid_0's multi_logloss: 1.5525	valid_0's eval: 0.318687
[11]	valid_0's multi_logloss: 1.53662	valid_0's eval: 0.336288
[12]	valid_0's multi_logloss: 1.52205	valid_0's eval: 0.350165
[13]	valid_0's multi_logloss: 1.50879	valid_0's eval: 0.359615
[14]	valid_0's multi_logloss: 1.49668	valid_0's eval: 0.368543
[15]	valid_0's multi_logloss: 1.48543	valid_0's eval: 0.377874
[16]	valid_0's multi_logloss: 1.47511	valid_0's eval: 0.383476


[132]	valid_0's multi_logloss: 1.33353	valid_0's eval: 0.42037
[133]	valid_0's multi_logloss: 1.33348	valid_0's eval: 0.42025
[134]	valid_0's multi_logloss: 1.33343	valid_0's eval: 0.420194
[135]	valid_0's multi_logloss: 1.33338	valid_0's eval: 0.42022
[136]	valid_0's multi_logloss: 1.33333	valid_0's eval: 0.420359
[137]	valid_0's multi_logloss: 1.33334	valid_0's eval: 0.4204
[138]	valid_0's multi_logloss: 1.3333	valid_0's eval: 0.420486
[139]	valid_0's multi_logloss: 1.33325	valid_0's eval: 0.420272
[140]	valid_0's multi_logloss: 1.33322	valid_0's eval: 0.420247
[141]	valid_0's multi_logloss: 1.33317	valid_0's eval: 0.420185
[142]	valid_0's multi_logloss: 1.33311	valid_0's eval: 0.420224
[143]	valid_0's multi_logloss: 1.33307	valid_0's eval: 0.419919
[144]	valid_0's multi_logloss: 1.33302	valid_0's eval: 0.420032
[145]	valid_0's multi_logloss: 1.33298	valid_0's eval: 0.420011
[146]	valid_0's multi_logloss: 1.33294	valid_0's eval: 0.420152
[147]	valid_0's multi_logloss: 1.33293	valid_0

LGBMClassifier(boosting_type='gbdt',
               class_weight={0: 4.262443699684363, 1: 3, 2: 4.5, 3: 1.8,
                             4: 4.7703520317922585, 5: 1.6,
                             6: 4.071288461884533, 7: 3, 8: 4.1, 9: 2.9},
               colsample_bytree=1.0, importance_type='split',
               learning_rate=0.07, max_depth=6, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
               n_jobs=-1, num_leaves=50, objective=None, random_state=59,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.7,
               subsample_for_bin=200000, subsample_freq=1)

In [20]:
# Predicted class labels for the training rows and for the hold-out rows.
train_pred, test_pred = [clf.predict(frame.values) for frame in (X_train, X_test)]


In [21]:
# Macro-averaged F1 on the training rows and on the hold-out rows.
train_f1 = f1_score(y_train, train_pred, average='macro')
test_f1 = f1_score(y_test, test_pred, average='macro')
print("train score: ", train_f1)
print('test score: ', test_f1)

train score:  0.4562190097556729
test score:  0.42135370599832367


In [22]:
# Re-run the split with k=1 to obtain the rows that have to be predicted for the
# submission. The training part is discarded; note that this rebinds `tests`,
# replacing the k=2 hold-out chunks loaded earlier.
_, tests = get_train_test(k=1, hist=10)

1
2
3
4
5


In [23]:
# Single frame with all rows selected by the k=1 split.
test = pd.concat([features for features, _ in tests])

In [24]:
# Prepare the submission file

In [25]:
# The submission must contain predictions for the k=1 rows held in `test`.
# `test_pred` was computed on the k=2 hold-out rows, so pairing it with the
# client_pin values of `test` would attach predictions to the wrong sessions.
# Build the features for `test` the same way as for the hold-out set and score them.
test_sessions = test['session_id']
test_cs = pd.concat([
    shard[shard['session_id_y'].isin(test_sessions)]
    for shard in (pd.read_csv("Datasets/" + file.split('/')[1]) for file in files)
])
test_merged = pd.merge(test, test_cs,
                       how='left', left_on='session_id', right_on='session_id_y')

# X_test holds exactly the feature columns (and their order) the model was fitted on.
final_pred = clf.predict(test_merged[X_test.columns].values)

# client_pin is taken from the merged frame so that it stays row-aligned with the predictions.
preds = pd.DataFrame({"preds": final_pred,
                      "client_pin": test_merged['client_pin'].values})

In [28]:
# Class names in label order: position i is the name decoded for numeric label i.
class_names = [
    'card2card_transfer', 'card_recharge', 'chat', 'credit_info',
    'invest', 'main_screen', 'mobile_recharge', 'own_transfer',
    'phone_money_transfer', 'statement',
]

# Set classes_ directly instead of fitting, so the encoder can be used for decoding only.
enc = LabelEncoder()
enc.classes_ = np.array(class_names)

In [29]:
# Replace the numeric labels with the class names.
preds['preds'] = enc.inverse_transform(preds['preds'])

In [30]:
# Sample submission file; it is joined with the predictions on client_pin below.
sample_submission = pd.read_csv('alfabattle2_abattle_sample_prediction.csv')
# Prediction-session timestamps; loaded here but not referenced by the later cells of this notebook.
pred_ts = pd.read_csv('alfabattle2_prediction_session_timestamp.csv')

In [31]:
# Keep the clients that appear both in the sample submission and in the predictions.
subm = sample_submission.merge(preds, on='client_pin', how='inner')

In [32]:
# Discard the placeholder column that came from the sample submission.
subm = subm.drop('prediction', axis=1)

In [33]:
# Positional rename; this requires exactly two remaining columns
# (presumably client_pin and preds -- verify if the sample file has extra columns).
subm.columns = ['client_pin', 'prediction']

In [34]:
# Write the submission without the index column.
subm.to_csv("subm.csv", index=False)