In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [82]:
transactions = pd.read_parquet('df_transaction.pa')

In [83]:
transactions.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount
0,0,2024-07-18 16:04:00,8099,a011100358d0f73ea8f3e860ef5564e3ba9cb217b7b90c...,2900
1,0,2024-07-22 16:31:00,5411,f3855606fc7244ec2f37ea01a4b2b66933d0e965bf4aec...,455
2,0,2024-07-24 16:23:00,5541,786270fa33ad4ac2a3c0e52e888005aa7f98beadbf8986...,1003
3,0,2024-07-28 15:51:00,5691,54887ad4a8df7e260a3ac85e59128a947c50d4423f6330...,1480
4,0,2024-07-28 18:00:00,5331,21617559a372c7cca155208c87be6c84ce97b5f8775589...,88


In [84]:
df_train = pd.read_parquet('train.pa')

In [85]:
df_train.head()

Unnamed: 0,client_num,target
0,94779,3
1,17279,0
2,5717,2
3,27471,1
4,72725,0


In [86]:
transactions.describe()

Unnamed: 0,client_num,date_time,amount
count,13508160.0,13508155,13508160.0
mean,38802.6,2024-08-16 14:05:54.827905,2341.709
min,0.0,2024-07-01 00:00:00,0.0
25%,15500.0,2024-07-24 15:19:00,123.0
50%,34181.0,2024-08-16 20:12:00,315.0
75%,58278.0,2024-09-08 14:21:00,820.0
max,109142.0,2024-10-01 00:00:00,8680000.0
std,27419.52,,22926.57


In [92]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13508155 entries, 0 to 13508154
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   client_num     int64         
 1   date_time      datetime64[us]
 2   mcc_code       object        
 3   merchant_name  object        
 4   amount         int64         
dtypes: datetime64[us](1), int64(2), object(2)
memory usage: 515.3+ MB


In [94]:
transactions = transactions.merge(df_train[['client_num', 'target']], on='client_num', how='left')

In [116]:
transactions['time_of_day'] = transactions['date_time'].dt.hour // 6  
transactions['day_of_week'] = transactions['date_time'].dt.dayofweek  # 0=понедельник, 6=воскресенье
transactions['is_weekend'] = transactions['day_of_week'].isin([5, 6]).astype(int) 
# Частотные признаки
transactions['transaction_count_last_7_days'] = transactions.groupby('client_num')['date_time'].transform(
    lambda x: x.diff().dt.days.le(7).cumsum()
)
# Финансовые признаки
# Финансовые признаки
transactions['avg_amount_last_7_days'] = transactions.groupby('client_num')['amount'].transform(
    lambda x: x.rolling(7, min_periods=1).mean()
)
transactions['total_amount_last_7_days'] = transactions.groupby('client_num')['amount'].transform(
    lambda x: x.rolling(7, min_periods=1).sum()
)

# Серийные признаки
transactions['time_since_last_transaction'] = transactions.groupby('client_num')['date_time'].transform(
    lambda x: x.diff().dt.total_seconds()
)
transactions['transaction_series'] = transactions.groupby('client_num')['date_time'].transform(
    lambda x: (x.diff().dt.total_seconds() < 3600).cumsum()
)


In [124]:
transactions['time_since_last_transaction'].fillna(0, inplace=True)

In [126]:
transactions.isna().sum()

client_num                             0
date_time                              0
mcc_code                               0
merchant_name                          0
amount                                 0
target                           4828430
time_of_day                            0
day_of_week                            0
is_weekend                             0
transaction_count_last_7_days          0
avg_amount_last_7_days                 0
total_amount_last_7_days               0
time_since_last_transaction            0
transaction_series                     0
dtype: int64

In [128]:
transactions.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount,target,time_of_day,day_of_week,is_weekend,transaction_count_last_7_days,avg_amount_last_7_days,total_amount_last_7_days,time_since_last_transaction,transaction_series
0,0,2024-07-18 16:04:00,8099,a011100358d0f73ea8f3e860ef5564e3ba9cb217b7b90c...,2900,,2,3,0,0,2900.0,2900.0,0.0,0
1,0,2024-07-22 16:31:00,5411,f3855606fc7244ec2f37ea01a4b2b66933d0e965bf4aec...,455,,2,0,0,1,1677.5,3355.0,347220.0,0
2,0,2024-07-24 16:23:00,5541,786270fa33ad4ac2a3c0e52e888005aa7f98beadbf8986...,1003,,2,2,0,2,1452.666667,4358.0,172320.0,0
3,0,2024-07-28 15:51:00,5691,54887ad4a8df7e260a3ac85e59128a947c50d4423f6330...,1480,,2,6,1,3,1459.5,5838.0,343680.0,0
4,0,2024-07-28 18:00:00,5331,21617559a372c7cca155208c87be6c84ce97b5f8775589...,88,,3,6,1,4,1185.2,5926.0,7740.0,0


In [132]:
transactions.to_parquet('transactions.pa', index=False)

In [9]:
df_final = transactions[transactions['target'].isna() == False]

In [10]:
df_final = df_final[df_final['client_num'] != 3467]

In [11]:
df_final.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount,target
132,1,2024-07-01 09:01:00,5541,84620a9333be55c5f41eb224fed1974200cc0983e33631...,4059,4.0
133,1,2024-07-01 16:57:00,5411,4829cd530ed1ec8bc8358740c2250f63ce5a42611cfa09...,444,4.0
134,1,2024-07-01 19:52:00,5200,c80805234182f4450be61e6f9605c1f16cc5c5be22a117...,5633,4.0
135,1,2024-07-01 20:11:00,5200,c80805234182f4450be61e6f9605c1f16cc5c5be22a117...,3009,4.0
136,1,2024-07-01 21:44:00,5411,f932e883f0c291aca1c74315b30ec799bf37b4eadfee14...,5481,4.0


In [12]:
df_final.isna().sum().sum()

0

In [13]:
df_final['date_time'] = pd.to_datetime(df_final['date_time'])

In [14]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8676361 entries, 132 to 13508154
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   client_num     int64         
 1   date_time      datetime64[us]
 2   mcc_code       object        
 3   merchant_name  object        
 4   amount         int64         
 5   target         float64       
dtypes: datetime64[us](1), float64(1), int64(2), object(2)
memory usage: 463.4+ MB


In [15]:
df_final['mcc_code'] = df_final['mcc_code'].astype(int)

In [16]:
df_final['merchant_name'].nunique()

554266

In [17]:
# очень много уникальных значений, поэтому label_enc
# сразу запишу идею, можно попробовать кодировать исходя из целевой переменной(на будущее)

In [18]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [19]:
df_final['merchant_name'] = label_encoder.fit_transform(df_final['merchant_name'])

In [20]:
df_final.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount,target
132,1,2024-07-01 09:01:00,5541,286426,4059,4.0
133,1,2024-07-01 16:57:00,5411,156007,444,4.0
134,1,2024-07-01 19:52:00,5200,432832,5633,4.0
135,1,2024-07-01 20:11:00,5200,432832,3009,4.0
136,1,2024-07-01 21:44:00,5411,539545,5481,4.0


In [21]:
df_final['timestamp'] = df_final['date_time'].astype('int64') // 10**7

In [22]:
df_final.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount,target,timestamp
132,1,2024-07-01 09:01:00,5541,286426,4059,4.0,171982446
133,1,2024-07-01 16:57:00,5411,156007,444,4.0,171985302
134,1,2024-07-01 19:52:00,5200,432832,5633,4.0,171986352
135,1,2024-07-01 20:11:00,5200,432832,3009,4.0,171986466
136,1,2024-07-01 21:44:00,5411,539545,5481,4.0,171987024


In [23]:
df_final.columns.values

array(['client_num', 'date_time', 'mcc_code', 'merchant_name', 'amount',
       'target', 'timestamp'], dtype=object)

In [24]:
# Теперь создадим последовательности для каждого клиента
def create_sequence(client_df):
    sequence = client_df[['timestamp', 'mcc_code', 'merchant_name', 'amount']].values
    return pd.Series({
        'transaction_sequence': sequence,
        'target': client_df['target'].iloc[0]  # Преобразуем target для клиента
    })

In [25]:
client_sequences = df_final.groupby('client_num').apply(create_sequence)

  client_sequences = df_final.groupby('client_num').apply(create_sequence)


In [26]:
client_sequences['transaction_sequence'].iloc[0].shape

(240, 4)

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

targets = client_sequences['target'].values
client_sequences = pad_sequences(client_sequences['transaction_sequence'].tolist(), padding='post', dtype='float32')

# Получаем целевые значения


In [28]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

X_train, X_test, y_train, y_test = train_test_split(client_sequences, targets, test_size=0.2, random_state=42)


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

# Перемещаем данные на устройство
X_train_device = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_device = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_device = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_device = torch.tensor(y_test, dtype=torch.long).to(device)

# Создаем даталоадеры
train_dataset = TensorDataset(X_train_device, y_train_device)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_device, y_test_device)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Используемое устройство: cuda


In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking

# Проверяем доступные устройства
print("Доступные устройства:", tf.config.list_physical_devices('GPU'))

with tf.device('/GPU:0'):
    model = Sequential()
    model.add(Masking(mask_value=0.0, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(np.unique(targets)), activation='softmax'))

    # Компилируем модель
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


Доступные устройства: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


  super().__init__(**kwargs)


In [40]:
import torch.nn.functional as F
# Определяем модель
class LinformerAttention(nn.Module):
    def __init__(self, input_dim, k):
        super(LinformerAttention, self).__init__()
        self.input_dim = input_dim
        self.k = k

        # Линейные проекции для query, ключей и значений
        self.proj_q = nn.Linear(input_dim, k)
        self.proj_k = nn.Linear(input_dim, k)
        self.proj_v = nn.Linear(input_dim, k)

    def forward(self, query, key, value):
        # Линейные проекции
        query_proj = self.proj_q(query)  # (batch_size, seq_len, k)
        key_proj = self.proj_k(key)  # (batch_size, seq_len, k)
        value_proj = self.proj_v(value)  # (batch_size, seq_len, k)

        # Вычисление самовнимания
        attention_scores = torch.matmul(query_proj, key_proj.transpose(-2, -1))  # (batch_size, seq_len, seq_len)
        attention_weights = F.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_weights, value_proj)  # (batch_size, seq_len, k)

        return output
class LSTMWithLinformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, k):
        super(LSTMWithLinformer, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, num_layers=2, dropout=0.2)
        self.linformer = LinformerAttention(hidden_size, k)

        # Полносвязный слой принимает вход с размерностью k
        self.fc = nn.Linear(k, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # Проход через LSTM
        lstm_output, _ = self.lstm(x)  # (batch_size, seq_len, hidden_size)

        # Проход через Linformer
        linformer_output = self.linformer(lstm_output, lstm_output, lstm_output)  # (batch_size, seq_len, k)

        # Берем последний выход LSTM
        output = self.dropout(linformer_output[:, -1, :])  # (batch_size, k)

        # Полносвязный слой
        output = self.fc(output)  # (batch_size, output_size)

        return output

# Создаем модель и перемещаем её на устройство
input_size = X_train.shape[2]  # Количество признаков
hidden_size = 128  # Размер скрытого слоя LSTM
output_size = len(np.unique(targets))  # Количество классов
k = 64  # Размер проекции в Linformer

model = LSTMWithLinformer(input_size, hidden_size, output_size, k).to(device)

# Определяем функцию потерь и оптимизатор
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Функция для обучения модели
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Обнуляем градиенты
        optimizer.zero_grad()

        # Прямой проход
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Обратный проход и оптимизация
        loss.backward()
        optimizer.step()

        # Считаем статистику
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_loss = running_loss / len(train_loader)
    train_acc = 100. * correct / total
    return train_loss, train_acc

# Функция для оценки модели
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Прямой проход
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Считаем статистику
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    test_loss = running_loss / len(test_loader)
    test_acc = 100. * correct / total
    return test_loss, test_acc

In [41]:
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
          f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

Epoch 1/10, Train Loss: 1.8022, Train Acc: 26.85%, Test Loss: 1.7956, Test Acc: 28.10%
Epoch 2/10, Train Loss: 1.7973, Train Acc: 27.16%, Test Loss: 1.7907, Test Acc: 27.74%
Epoch 3/10, Train Loss: 1.7967, Train Acc: 27.46%, Test Loss: 1.8002, Test Acc: 27.19%
Epoch 4/10, Train Loss: 1.7956, Train Acc: 27.35%, Test Loss: 1.7906, Test Acc: 27.86%
Epoch 5/10, Train Loss: 1.7953, Train Acc: 27.30%, Test Loss: 1.7911, Test Acc: 27.31%
Epoch 6/10, Train Loss: 1.7939, Train Acc: 27.37%, Test Loss: 1.7908, Test Acc: 28.26%
Epoch 7/10, Train Loss: 1.7931, Train Acc: 27.47%, Test Loss: 1.7919, Test Acc: 28.36%
Epoch 8/10, Train Loss: 1.7931, Train Acc: 27.55%, Test Loss: 1.7897, Test Acc: 28.09%
Epoch 9/10, Train Loss: 1.7930, Train Acc: 27.51%, Test Loss: 1.7894, Test Acc: 28.01%
Epoch 10/10, Train Loss: 1.7929, Train Acc: 27.52%, Test Loss: 1.7910, Test Acc: 27.96%
