In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Чтение данных
df_transactions = pd.read_parquet('df_transaction.pa')
df_target = pd.read_parquet('train.pa')

# Убедимся, что date_time в df_transactions имеет тип datetime
df_transactions['date_time'] = pd.to_datetime(df_transactions['date_time'])

# 1. Создадим DataFrame для признаков клиентов
df_client = pd.DataFrame()

# 2. Общая сумма транзакций по каждому клиенту
df_client['total_amount'] = df_transactions.groupby('client_num')['amount'].sum()

# 3. Средняя сумма транзакции
df_client['mean_transaction'] = df_transactions.groupby('client_num')['amount'].mean()

# 4. Максимальная сумма транзакции
df_client['max_transaction'] = df_transactions.groupby('client_num')['amount'].max()

# 5. Минимальная сумма транзакции
df_client['min_transaction'] = df_transactions.groupby('client_num')['amount'].min()

# 6. Количество транзакций для клиента
df_client['transaction_count'] = df_transactions.groupby('client_num')['amount'].count()

# 7. Средний час транзакции
df_transactions['hour'] = df_transactions['date_time'].dt.hour
df_client['avg_hour'] = df_transactions.groupby('client_num')['hour'].mean()

# 8. Средний день недели
df_transactions['day_of_week'] = df_transactions['date_time'].dt.dayofweek
df_client['avg_day_of_week'] = df_transactions.groupby('client_num')['day_of_week'].mean()

# 9. Средний месяц транзакций
df_transactions['month'] = df_transactions['date_time'].dt.month
df_client['avg_month'] = df_transactions.groupby('client_num')['month'].mean()

# 10. Количество уникальных мерчантов, с которыми работает клиент
df_client['unique_merchants'] = df_transactions.groupby('client_num')['merchant_name'].nunique()

# 11. Сумма транзакций по категориям (MCC кодам)
df_mcc = df_transactions.groupby(['client_num', 'mcc_code'])['amount'].sum().unstack().reset_index()
# Заменяем NaN на 0
df_mcc = df_mcc.fillna(0)
df_client = df_client.merge(df_mcc, on='client_num', how='left')

# 12. Добавление целевой переменной из df_target
df_client = df_client.merge(df_target[['client_num', 'target']], on='client_num', how='left')

# 13. Дополнительно, можно рассчитать изменяющиеся признаки, например, количество транзакций за последние 7 дней
# Пример:
df_transactions['date'] = df_transactions['date_time'].dt.date
df_transactions['prev_date'] = df_transactions.groupby('client_num')['date'].shift(1)

# Проверим, что prev_date не содержит пустых значений
df_transactions['days_since_last_transaction'] = (pd.to_datetime(df_transactions['date']) - pd.to_datetime(df_transactions['prev_date'])).dt.days

# Добавим этот признак в df_client
df_client['avg_days_since_last_transaction'] = df_transactions.groupby('client_num')['days_since_last_transaction'].mean()


# Проверка результата
df_client.head()


Unnamed: 0,client_num,total_amount,mean_transaction,max_transaction,min_transaction,transaction_count,avg_hour,avg_day_of_week,avg_month,unique_merchants,...,8931,8999,9222,9311,9390,9399,9402,9406,target,avg_days_since_last_transaction
0,0,106935,810.113636,7322,28,132,15.469697,2.969697,8.348485,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.564885
1,1,863878,3599.491667,100000,6,240,15.270833,3.075,7.925,106,...,0.0,0.0,772.0,0.0,0.0,0.0,0.0,0.0,4.0,0.380753
2,2,344108,1147.026667,24496,23,300,14.016667,3.233333,7.89,82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.304348
3,3,1621825,11032.823129,1000000,1,147,12.197279,3.142857,8.034014,47,...,0.0,0.0,6434.0,13000.0,0.0,0.0,0.0,0.0,3.0,0.623288
4,4,199796,1637.672131,50000,24,122,17.008197,2.516393,7.836066,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.743802


In [103]:
df_client.shape

(109143, 332)

In [104]:
df_client.isna().sum()

client_num                             0
total_amount                           0
mean_transaction                       0
max_transaction                        0
min_transaction                        0
                                   ...  
9399                                   0
9402                                   0
9406                                   0
target                             39143
avg_days_since_last_transaction        0
Length: 332, dtype: int64

In [129]:
hihi = df_client['target'].value_counts().sum()
class_counts = df_client['target'].value_counts().to_dict()
weight = {cls:count/hihi  for cls, count in class_counts.items()}
sample_weights = [class_counts[label] for label in y_train]

In [131]:
df_final = df_client[df_client['target'].isna() == False]

In [133]:
df_final.isna().sum().sum()

0

In [234]:
X = df_final.drop(['target'], axis=1)  # Удаляем ненужные столбцы
y = df_final['target']

In [236]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [238]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [240]:
import xgboost as xgb

In [242]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

# grid = {
#     'max_depth' : np.arange(1,10),
#     'learning_rate': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
#     'n_estimators': [100,200],
#     'reg_alpha':[0, 0.1, 1, 10],
#     'reg_lambda': [0, 0.1, 1, 10]
# }


model = xgb.XGBClassifier(
    objective='multi:softmax',  # Многоуровневая классификация
    num_class=8,                # 8 классов (target от 0 до 7)
    eval_metric='merror',       # Метрика для многоуровневой классификации
    random_state=42
)


# grid_model = GridSearchCV(model, param_grid=grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
# Обучение модели
model.fit(X_train, y_train)

In [244]:
model.feature_importances_

array([0.00355318, 0.0145229 , 0.00722006, 0.0042685 , 0.00511972,
       0.0168094 , 0.00369776, 0.00358752, 0.00781159, 0.00392406,
       0.0039953 , 0.00281049, 0.00443561, 0.0020338 , 0.00255325,
       0.00285724, 0.        , 0.        , 0.0020336 , 0.        ,
       0.00384365, 0.00489712, 0.00191164, 0.        , 0.        ,
       0.00468342, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00211081, 0.        , 0.        , 0.        ,
       0.        , 0.00161614, 0.00170728, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00153233, 0.        , 0.        , 0.0046462 , 0.00414334,
       0.00317699, 0.00446253, 0.0040659 , 0.        , 0.00470783,
       0.00430101, 0.00364863, 0.00393916, 0.00279548, 0.00248567,
       0.0023775 , 0.        , 0.00306397, 0.00241859, 0.00396079,
       0.00406103, 0.00440735, 0.00378871, 0.00522067, 0.00412

In [184]:
y_pred = model.predict(X_test)

# Оценка модели
# WMAE = Weighted Mean Absolute Error (используем веса классов)
weights = y_test.value_counts(normalize=True)  # Веса классов (нормализуем, чтобы сумма была 1)
wmae = np.sum(weights * np.abs(y_pred - y_test))
print(wmae)

0.07807142857142857


In [186]:
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error

In [188]:
accuracy_score(y_test,y_pred) 

0.3296428571428571

In [190]:
mean_absolute_error(y_test,y_pred) 

1.4047857142857143

In [192]:
wmae = sum(weight[y_test.iloc[i]] * abs(y_test.iloc[i] - y_pred[i]) for i in range(len(y_test))) / sum(weight[y_test.iloc[i]] for i in range(len(y_test)))
print("WMAE:", wmae)

WMAE: 1.1409690539690402


In [194]:
X

Unnamed: 0,total_amount,mean_transaction,max_transaction,min_transaction,transaction_count,avg_hour,avg_day_of_week,avg_month,unique_merchants,0742,...,8911,8931,8999,9222,9311,9390,9399,9402,9406,avg_days_since_last_transaction
1,863878,3599.491667,100000,6,240,15.270833,3.075000,7.925000,106,0.0,...,0.0,0.0,0.0,772.0,0.0,0.0,0.0,0.0,0.0,0.380753
2,344108,1147.026667,24496,23,300,14.016667,3.233333,7.890000,82,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.304348
3,1621825,11032.823129,1000000,1,147,12.197279,3.142857,8.034014,47,0.0,...,0.0,0.0,0.0,6434.0,13000.0,0.0,0.0,0.0,0.0,0.623288
4,199796,1637.672131,50000,24,122,17.008197,2.516393,7.836066,26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.743802
5,67359,391.622093,10000,22,172,8.819767,2.877907,7.645349,34,0.0,...,0.0,0.0,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.520468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109136,19377,1291.800000,5190,55,15,16.400000,3.266667,7.533333,12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.500000
109138,236283,14767.687500,59255,1,16,9.125000,1.000000,8.500000,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.133333
109139,9640,642.666667,1150,25,15,13.333333,2.333333,7.666667,11,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.214286
109141,61843,3865.187500,22360,170,16,14.000000,2.937500,7.937500,14,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.733333


In [196]:
model = xgb.XGBClassifier(
    objective='multi:softmax',  # Многоуровневая классификация
    num_class=8,                # 8 классов (target от 0 до 7)
    eval_metric='merror',       # Метрика для многоуровневой классификации
    random_state=42
)

# Обучение модели
model.fit(X, y)

In [198]:
mean_absolute_error(y, model.predict(X))

0.9426571428571429

In [206]:
df_client_notarg = df_client[df_client['target'].isna()].drop('target', axis = 1)
X = df_client_notarg.drop('client_num', axis = 1)

In [208]:
df_client_notarg.shape

(39143, 331)

In [212]:
pred = model.predict(X)

In [214]:
df_client_notarg['target'] = pred

In [216]:
df_client_notarg.head()


Unnamed: 0,client_num,total_amount,mean_transaction,max_transaction,min_transaction,transaction_count,avg_hour,avg_day_of_week,avg_month,unique_merchants,...,8931,8999,9222,9311,9390,9399,9402,9406,avg_days_since_last_transaction,target
0,0,106935,810.113636,7322,28,132,15.469697,2.969697,8.348485,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.564885,0
10,10,3020981,6152.710794,358958,3,491,15.040733,3.189409,7.798371,189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17551,6
11,11,200840,509.746193,20000,1,394,12.799492,2.601523,7.80203,43,...,0.0,3299.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21374,0
14,14,711788,1216.731624,40000,1,585,11.464957,2.781197,7.921368,197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155822,5
16,16,117194,492.411765,12000,18,238,13.483193,2.768908,7.60084,32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379747,0


In [218]:
df_client_notarg[['client_num', 'target']].to_csv('third_try.csv', index=False)

In [None]:
y_test

In [None]:
random_array = np.random.randint(0, 8, size=14000)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import torch.nn as nn

In [None]:
class MLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [6]:
df_client.to_csv('clients.csv', index=False)

In [39]:
df_client.corr()['unique_merchants'][df_client.corr()['target'] > 0.29]

unique_merchants              1.000000
target                        0.296298
unique_merchants_per_month    0.852788
unique_mcc_count              0.909440
Name: unique_merchants, dtype: float64

In [15]:
import torch
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

In [19]:
X = X.to_numpy()  # или X.values, если у вас старая версия pandas
y = y.to_numpy()



AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [17]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long) 

In [21]:
X.shape

(70000, 331)

In [27]:
import torch
import torch.nn as nn

input_dim = 331  # размерность признаков
seq_len = 1  # длина последовательности для каждого клиента (1 шаг времени)
hidden_dim = 64  # размер скрытого слоя
output_dim = len(np.unique(y))  # количество классов в целевой переменной

# Параметры модели трансформера
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)  # Преобразование входа в скрытое пространство
        self.transformer = nn.Transformer(d_model=hidden_dim, nhead=8, num_encoder_layers=6)
        self.fc_out = nn.Linear(hidden_dim, output_dim)  # Для классификации

    def forward(self, x):
        # Входные данные имеют форму (batch_size, seq_len, input_dim)
        # Нам нужно преобразовать их в (seq_len, batch_size, input_dim) для трансформера
        x = x.unsqueeze(0)  # добавляем размерность для последовательности (1 временной шаг)
        
        # Преобразование входных данных в скрытое пространство
        x = self.embedding(x)
        
        # Пропуск через трансформер
        x = self.transformer(x, x)
        
        # Извлекаем выход из последнего шага
        x = x.squeeze(0)  # Убираем размерность последовательности (1 шаг)
        
        # Классификация
        x = self.fc_out(x)
        return x

model = TransformerModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Определение функции потерь и оптимизатора
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)







In [None]:
epochs = 10
batch_size = 64
def evaluate(model, X_val, y_val, criterion):
    model.eval()
    with torch.no_grad():
        outputs = model(X_val)
        loss = criterion(outputs, y_val)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y_val).sum().item() / y_val.size(0)
    return loss.item(), accuracy

# Обучение модели
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for i in range(0, len(X_train_tensor), batch_size):
        # Создаем батч данных
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        # Обнуляем градиенты
        optimizer.zero_grad()

        # Прямой проход
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Обратный проход
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Оценка точности
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

    # Выводим информацию по эпохам
    train_accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(X_train_tensor):.4f}, Accuracy: {train_accuracy:.2f}%')

    # Оценка на валидации после каждой эпохи
    val_loss, val_accuracy = evaluate(model, X_val_tensor, y_val_tensor, criterion)
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%')


In [220]:
import catboost
from catboost import CatBoostClassifier

In [222]:
model = CatBoostClassifier(
    learning_rate=1e-2,  # Скорость обучения
    loss_function='MultiClass',  # Функция потерь для многоклассовой классификации
    cat_features=[],  # Если у вас есть категориальные признаки, добавьте их индексы
    verbose=100       # Частота вывода информации
)

In [224]:
model.fit(X_train, y_train)

0:	learn: 1.9418737	total: 235ms	remaining: 3m 55s
100:	learn: 1.7621528	total: 10.7s	remaining: 1m 34s
200:	learn: 1.7164593	total: 20.5s	remaining: 1m 21s
300:	learn: 1.6967884	total: 29.8s	remaining: 1m 9s
400:	learn: 1.6845989	total: 38.6s	remaining: 57.7s
500:	learn: 1.6759254	total: 47.5s	remaining: 47.3s
600:	learn: 1.6690030	total: 56.5s	remaining: 37.5s
700:	learn: 1.6635122	total: 1m 5s	remaining: 27.8s
800:	learn: 1.6583317	total: 1m 13s	remaining: 18.4s
900:	learn: 1.6535506	total: 1m 22s	remaining: 9.1s
999:	learn: 1.6493149	total: 1m 31s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x15f03a210>

In [226]:
y_pred = model.predict(X_test).squeeze()

# Оценка модели
# WMAE = Weighted Mean Absolute Error (используем веса классов)
weights = y_test.value_counts(normalize=True)  # Веса классов (нормализуем, чтобы сумма была 1)
wmae = np.sum(weights * np.abs(y_pred - y_test))
print(wmae)

0.07807142857142857


In [228]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test,y_pred) 

0.329

In [230]:
y_pred.shape

(14000,)

In [232]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [103]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

model = xgb.XGBClassifier(
    n_jobs = -1,
    objective='multi:softmax',  # Многоуровневая классификация
    num_class=8,                # 8 классов (target от 0 до 7)
    eval_metric='merror',       # Метрика для многоуровневой классификации
    random_state=42
)
grid = {'max_depth':np.arange(1,14)}
grid_cv = GridSearchCV(model , param_grid = grid, cv = 5, verbose=1, scoring='accuracy')
grid_cv.fit(X_train, y_train)


Fitting 5 folds for each of 13 candidates, totalling 65 fits


In [105]:

y_pred = grid_cv.predict(X_test)
# Оценка модели
# WMAE = Weighted Mean Absolute Error (используем веса классов)
weights = y_test.value_counts(normalize=True)  # Веса классов (нормализуем, чтобы сумма была 1)
wmae = np.sum(weights * np.abs(y_pred - y_test))
print(wmae)

0.07807142857142857


In [107]:
accuracy_score(y_test,y_pred) 

0.3299285714285714

In [None]:
pd.DataFrame(