In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alpha-summer-challenge/df_transaction.pa
/kaggle/input/alpha-summer-challenge/train.pa


**Подготовка данных**

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the data: the labelled client table and the raw transaction log.
df_train, df_txn = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/kaggle/input/alpha-summer-challenge/train.pa",
        "/kaggle/input/alpha-summer-challenge/df_transaction.pa",
    )
)

In [4]:
# Preview the first five transaction rows.
df_txn.head(n=5)

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount
0,0,2024-07-18 16:04:00,8099,a011100358d0f73ea8f3e860ef5564e3ba9cb217b7b90c...,2900
1,0,2024-07-22 16:31:00,5411,f3855606fc7244ec2f37ea01a4b2b66933d0e965bf4aec...,455
2,0,2024-07-24 16:23:00,5541,786270fa33ad4ac2a3c0e52e888005aa7f98beadbf8986...,1003
3,0,2024-07-28 15:51:00,5691,54887ad4a8df7e260a3ac85e59128a947c50d4423f6330...,1480
4,0,2024-07-28 18:00:00,5331,21617559a372c7cca155208c87be6c84ce97b5f8775589...,88


In [5]:
# Build a new dataframe with per-client aggregates of the transaction amounts.
# Maps each output feature name to the aggregation applied to the `amount` column.
amount_aggregations = {
    'total_amount': 'sum',
    'mean_amount': 'mean',
    'std_amount': 'std',
    'max_amount': 'max',
    'min_amount': 'min',
    'txn_count': 'count',
}
agg = (
    df_txn
    .groupby(['client_num'])
    .agg(**{
        feature_name: pd.NamedAgg(column='amount', aggfunc=aggfunc)
        for feature_name, aggfunc in amount_aggregations.items()
    })
    .reset_index()
)

In [6]:
# Split the aggregated features into a training part and a test part.
# Clients listed in df_train go to the training part; every other client goes to the test part.
train_clients = set(df_train['client_num'].unique())

# Compute the membership mask once and reuse it for both halves.
is_train_client = agg['client_num'].isin(train_clients)
agg_train = agg.loc[is_train_client]
agg_test = agg.loc[~is_train_client]

In [7]:
# Attach the target column to the training aggregates (left join on client_num).
client_targets = df_train[['client_num', 'target']]
agg_train_with_target = pd.merge(
    agg_train,
    client_targets,
    how='left',
    on='client_num',
)

In [8]:
# Preview the first five rows of the training features with the target attached.
agg_train_with_target.head(n=5)

Unnamed: 0,client_num,total_amount,mean_amount,std_amount,max_amount,min_amount,txn_count,target
0,1,863878,3599.491667,11704.843812,100000,6,240,4
1,2,344108,1147.026667,2629.178018,24496,23,300,5
2,3,1621825,11032.823129,86498.559476,1000000,1,147,3
3,4,199796,1637.672131,4938.356295,50000,24,122,5
4,5,67359,391.622093,1183.405909,10000,22,172,2


**Обучение модели**

In [9]:
from sklearn.model_selection import train_test_split

# Separate the target variable from the feature columns (client_num is an id, not a feature).
y = agg_train_with_target['target']
X = agg_train_with_target.drop(columns=['client_num', 'target'])

# Stratified 80/20 split into a training part and a held-out validation part
# (the held-out part is stored in X_test / y_test).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [11]:
# Train the model: a random forest with class-balanced sample weighting.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [12]:
# Predict on the held-out part.
y_pred = clf.predict(X_test)

# Evaluate: per-class precision/recall/F1 first, then the confusion matrix.
holdout_report = classification_report(y_test, y_pred, digits=3)
holdout_confusion = confusion_matrix(y_test, y_pred)
print(holdout_report)
print(holdout_confusion)

              precision    recall  f1-score   support

           0      0.335     0.406     0.367      3018
           1      0.412     0.295     0.344      3604
           2      0.193     0.114     0.144      1975
           3      0.171     0.178     0.175      1597
           4      0.237     0.273     0.253      1607
           5      0.173     0.170     0.172      1101
           6      0.172     0.314     0.222      1098

    accuracy                          0.269     14000
   macro avg      0.242     0.250     0.239     14000
weighted avg      0.279     0.269     0.268     14000

[[1226  668  183  240  185  121  395]
 [1032 1062  355  400  281  140  334]
 [ 441  399  226  320  231  114  244]
 [ 273  197  155  284  360  132  196]
 [ 257  115  114  217  438  213  253]
 [ 215   82   63   90  223  187  241]
 [ 213   54   77  105  132  172  345]]


**WMAE (взвешенная средняя абсолютная ошибка)**

In [13]:
# Per-class weights: the position in the list is the class label (0..7).
class_weights = dict(enumerate([
    1.00,
    0.72,
    0.52,
    0.37,
    0.27,
    0.19,
    0.14,
    0.1,
]))

def wmae(y_true, y_pred, weights_map):
    """Weighted mean absolute error, weighted by the class of each true label.

    Each sample's absolute error ``|y_true - y_pred|`` is multiplied by the
    weight that ``weights_map`` assigns to its *true* label; the weighted sum
    is divided by the sum of the weights.

    Parameters
    ----------
    y_true : array-like
        True labels (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.
    weights_map : mapping
        Maps a label to its weight. Labels missing from the mapping get
        weight 0.0 and therefore do not contribute to the score.

    Returns
    -------
    float
        The weighted mean absolute error, or NaN when the total weight is zero
        (empty input, or no true label present in ``weights_map``).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have a different number of elements.
    """
    # Work on flat NumPy arrays so the comparison is strictly positional:
    # plain lists are accepted, and two pandas objects with different indexes
    # are not silently re-aligned by label during the subtraction.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {true_arr.shape[0]} and {pred_arr.shape[0]}"
        )

    weights = np.array(
        [weights_map.get(label, 0.0) for label in true_arr.tolist()],
        dtype=float,
    )
    total_weight = weights.sum()
    if total_weight == 0:
        # Nothing to average over; avoid the 0/0 division and its RuntimeWarning.
        return float('nan')

    abs_errors = np.abs(true_arr - pred_arr)
    return np.sum(weights * abs_errors) / total_weight

# Score the held-out predictions with the class-weighted MAE.
wmae_score = wmae(y_true=y_test, y_pred=y_pred, weights_map=class_weights)
print("WMAE:", wmae_score)

WMAE: 1.6746948329320082


In [14]:
# Features for the clients without a target: drop the id column, keep the aggregates.
X_test_2 = agg_test.drop('client_num', axis=1)
# Predict with the model trained on the labelled clients.
y_pred_2 = clf.predict(X_test_2)

In [15]:
# Write the predictions for the unlabelled clients to the Kaggle working directory.
#
# The previous f'{y_pred_2}' went through NumPy's default print options, which
# summarise arrays above the print threshold (1000 elements by default) as
# "[a b c ... x y z]", so most predictions never reached the file. Rendering
# with threshold equal to the array size turns summarisation off and keeps the
# same bracketed, space-separated layout for the full array.
# NOTE(review): the file holds predictions only, in agg_test row order, with no
# client_num column; confirm this matches the expected submission format.
pred_array = np.asarray(y_pred_2)
pred_text = np.array2string(pred_array, threshold=pred_array.size)
with open('/kaggle/working/pred.txt', 'w') as f:
    f.write(pred_text)