In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Чтение данных
df_transactions = pd.read_parquet('df_transaction.pa')
df_target = pd.read_parquet('train.pa')

In [98]:
df_transactions = df_transactions.merge(df_target[['target', 'client_num']], on = 'client_num', how='left')

In [99]:
df_transactions.head()

Unnamed: 0,client_num,date_time,mcc_code,merchant_name,amount,target
0,0,2024-07-18 16:04:00,8099,a011100358d0f73ea8f3e860ef5564e3ba9cb217b7b90c...,2900,
1,0,2024-07-22 16:31:00,5411,f3855606fc7244ec2f37ea01a4b2b66933d0e965bf4aec...,455,
2,0,2024-07-24 16:23:00,5541,786270fa33ad4ac2a3c0e52e888005aa7f98beadbf8986...,1003,
3,0,2024-07-28 15:51:00,5691,54887ad4a8df7e260a3ac85e59128a947c50d4423f6330...,1480,
4,0,2024-07-28 18:00:00,5331,21617559a372c7cca155208c87be6c84ce97b5f8775589...,88,


In [100]:
df_transactions['mcc_code'].astype(int).describe()

count    1.350816e+07
mean     5.507148e+03
std      8.178145e+02
min      7.420000e+02
25%      5.411000e+03
50%      5.451000e+03
75%      5.814000e+03
max      9.406000e+03
Name: mcc_code, dtype: float64

In [101]:
import pandas as pd
import numpy as np


# Преобразуем столбцы в нужные типы
df_transactions['date_time'] = pd.to_datetime(df_transactions['date_time'])
df_transactions['amount'] = df_transactions['amount'].astype(float)

# Добавляем признаки, связанные с временем
df_transactions['hour'] = df_transactions['date_time'].dt.hour
df_transactions['day_of_week'] = df_transactions['date_time'].dt.dayofweek
df_transactions['month'] = df_transactions['date_time'].dt.month

# Признаки, связанные с транзакциями
df_transactions['mean_amount'] = df_transactions.groupby('client_num')['amount'].transform('mean')
df_transactions['max_amount'] = df_transactions.groupby('client_num')['amount'].transform('max')
df_transactions['min_amount'] = df_transactions.groupby('client_num')['amount'].transform('min')
df_transactions['unique_merchants'] = df_transactions.groupby('client_num')['merchant_name'].transform('nunique')

# Целевой энкодинг для mcc_code
mcc_target_encoding = df_transactions.groupby('mcc_code')['target'].mean().to_dict()
df_transactions['mcc_encoded'] = df_transactions['mcc_code'].map(mcc_target_encoding)

# Создаем DataFrame для клиентов
df_client = df_transactions.groupby('client_num').agg({
    'amount': ['sum', 'mean', 'max', 'min'],
    'hour': 'mean',
    'day_of_week': 'mean',
    'month': 'mean',
    'unique_merchants': 'first',
    'mcc_encoded': 'mean',
    'target': 'mean'
}).reset_index()

# Убираем мультииндекс
df_client.columns = ['client_num', 'total_amount', 'mean_amount', 'max_amount', 'min_amount', 
                     'avg_hour', 'avg_day_of_week', 'avg_month', 'unique_merchants', 
                     'mcc_encoded_mean', 'target']

# Добавляем дополнительные признаки из второго блока
df_client['transaction_count'] = df_transactions.groupby('client_num')['amount'].count().values
df_client['avg_days_since_last_transaction'] = (
    df_transactions.groupby('client_num')['date_time'].diff().dt.days.groupby(df_transactions['client_num']).mean().values
)

# Добавляем сумму транзакций по категориям (MCC кодам)
df_mcc = df_transactions.groupby(['client_num', 'mcc_code'])['amount'].sum().unstack().fillna(0).reset_index()
df_client = df_client.merge(df_mcc, on='client_num', how='left')

# Проверка результата
df_client.head()

Unnamed: 0,client_num,total_amount,mean_amount,max_amount,min_amount,avg_hour,avg_day_of_week,avg_month,unique_merchants,mcc_encoded_mean,...,8734,8911,8931,8999,9222,9311,9390,9399,9402,9406
0,0,106935.0,810.113636,7322.0,28.0,15.469697,2.969697,8.348485,46,2.759526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,863878.0,3599.491667,100000.0,6.0,15.270833,3.075,7.925,106,2.87665,...,0.0,0.0,0.0,0.0,772.0,0.0,0.0,0.0,0.0,0.0
2,2,344108.0,1147.026667,24496.0,23.0,14.016667,3.233333,7.89,82,2.757642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1621825.0,11032.823129,1000000.0,1.0,12.197279,3.142857,8.034014,47,2.937373,...,0.0,0.0,0.0,0.0,6434.0,13000.0,0.0,0.0,0.0,0.0
4,4,199796.0,1637.672131,50000.0,24.0,17.008197,2.516393,7.836066,26,2.778026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_client.isna().sum().sum()

39143

In [232]:
df_client.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109143 entries, 0 to 109142
Columns: 333 entries, client_num to 9406
dtypes: float64(330), int64(3)
memory usage: 277.3 MB


In [7]:
hihi = df_client['target'].value_counts().sum()

In [8]:

class_counts = df_client['target'].value_counts().to_dict()
weight = {cls:count/hihi  for cls, count in class_counts.items()}

In [106]:
df_final = df_client[df_client['target'].isna() == False]

In [108]:
df_client.head()

Unnamed: 0,client_num,total_amount,mean_amount,max_amount,min_amount,avg_hour,avg_day_of_week,avg_month,unique_merchants,mcc_encoded_mean,...,8734,8911,8931,8999,9222,9311,9390,9399,9402,9406
0,0,106935.0,810.113636,7322.0,28.0,15.469697,2.969697,8.348485,46,2.759526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,863878.0,3599.491667,100000.0,6.0,15.270833,3.075,7.925,106,2.87665,...,0.0,0.0,0.0,0.0,772.0,0.0,0.0,0.0,0.0,0.0
2,2,344108.0,1147.026667,24496.0,23.0,14.016667,3.233333,7.89,82,2.757642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1621825.0,11032.823129,1000000.0,1.0,12.197279,3.142857,8.034014,47,2.937373,...,0.0,0.0,0.0,0.0,6434.0,13000.0,0.0,0.0,0.0,0.0
4,4,199796.0,1637.672131,50000.0,24.0,17.008197,2.516393,7.836066,26,2.778026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [114]:


# Разделение данных
X = df_final.drop(['client_num', 'target'], axis = 1)
y = df_final['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [116]:
X

Unnamed: 0,total_amount,mean_amount,max_amount,min_amount,avg_hour,avg_day_of_week,avg_month,unique_merchants,mcc_encoded_mean,transaction_count,...,8734,8911,8931,8999,9222,9311,9390,9399,9402,9406
1,863878.0,3599.491667,100000.0,6.0,15.270833,3.075000,7.925000,106,2.876650,240,...,0.0,0.0,0.0,0.0,772.0,0.0,0.0,0.0,0.0,0.0
2,344108.0,1147.026667,24496.0,23.0,14.016667,3.233333,7.890000,82,2.757642,300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1621825.0,11032.823129,1000000.0,1.0,12.197279,3.142857,8.034014,47,2.937373,147,...,0.0,0.0,0.0,0.0,6434.0,13000.0,0.0,0.0,0.0,0.0
4,199796.0,1637.672131,50000.0,24.0,17.008197,2.516393,7.836066,26,2.778026,122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,67359.0,391.622093,10000.0,22.0,8.819767,2.877907,7.645349,34,2.441179,172,...,0.0,0.0,0.0,500.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109136,19377.0,1291.800000,5190.0,55.0,16.400000,3.266667,7.533333,12,2.615043,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109138,236283.0,14767.687500,59255.0,1.0,9.125000,1.000000,8.500000,5,2.908303,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109139,9640.0,642.666667,1150.0,25.0,13.333333,2.333333,7.666667,11,2.778948,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109141,61843.0,3865.187500,22360.0,170.0,14.000000,2.937500,7.937500,14,2.601407,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
X = scaler.fit_transform(X)

In [256]:

model = xgb.XGBClassifier(
    objective='multi:softmax',  # Многоуровневая классификация
    num_class=8,                # 8 классов (target от 0 до 7)
    eval_metric='merror',       # Метрика для многоуровневой классификации
    random_state=42,
    n_estimators = 1000
)

# Обучение модели
model.fit(X_train, y_train)

# Оценка модели
print("R^2:", model.score(X_test, y_test))

R^2: 0.31242857142857144


In [242]:
y_pred = model.predict(X_test)

In [243]:
y_pred = y_pred.astype(np.float32)

In [258]:

# Вычисление WMAE
wmae = sum(weight[y_test.iloc[i]] * abs(y_test.iloc[i] - y_pred[i]) for i in range(len(y_test))) / sum(weight[y_test.iloc[i]] for i in range(len(y_test)))
print("WMAE:", wmae)

WMAE: 1.7682260263155718


In [22]:
from sklearn.metrics import accuracy_score, mean_absolute_error

In [260]:
accuracy_score(y_test, model.predict(X_test))

0.31242857142857144

In [262]:
mean_absolute_error(y_test, model.predict(X_test))

1.429142857142857

In [20]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42, n_estimators=300)
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)

In [22]:
accuracy_score(y_test, model.predict(X_test))

0.33185714285714285

In [23]:
mean_absolute_error(y_test, model.predict(X_test))

1.4208571428571428

In [19]:
from catboost import CatBoostClassifier

In [49]:
clf = CatBoostClassifier(thread_count=-1,
                         random_seed=42,
                        iterations=3500,
                        )

clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=200,
    use_best_model=True,
    plot=False,
    early_stopping_rounds=100,
)

Learning rate set to 0.07252
0:	learn: 1.9153073	test: 1.9155120	best: 1.9155120 (0)	total: 182ms	remaining: 10m 36s
200:	learn: 1.6246855	test: 1.6706026	best: 1.6706026 (200)	total: 19.4s	remaining: 5m 18s
400:	learn: 1.5815775	test: 1.6627654	best: 1.6627654 (400)	total: 38.9s	remaining: 5m
600:	learn: 1.5511390	test: 1.6607310	best: 1.6606520 (583)	total: 56.8s	remaining: 4m 33s
800:	learn: 1.5245921	test: 1.6596381	best: 1.6596361 (799)	total: 1m 14s	remaining: 4m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.659536403
bestIteration = 893

Shrink model to first 894 iterations.


<catboost.core.CatBoostClassifier at 0x17babbd90>

In [51]:
accuracy_score(y_test, clf.predict(X_test))

0.3410714285714286

In [31]:
clf.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,118,7.573023
1,6,7.401712
2,8,5.570599
3,1,5.096168
4,10,4.583892
...,...,...
326,285,0.000000
327,294,0.000000
328,317,0.000000
329,319,0.000000


In [33]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import KFold 

In [118]:
n_splits = 3  # Число фолдов. Оптимально 3, 5 или 10
clfs = [] # Тут будем сохранять модели
scores = [] # Тут будем хранить скоры валидационных фолдов

kf = KFold(n_splits=n_splits, shuffle=True, random_state=7575)
for train_index, test_index in kf.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Специальный класс для ускорения обучения
    train_dataset = Pool(data=X_train, label=y_train)
    eval_dataset = Pool(data=X_test, label=y_test)

    clf = CatBoostClassifier(
        iterations=3500,
        learning_rate=0.05,
        loss_function="MultiClass",  # MultiLogloss
        # eval_metric = 'Precision',  F1:macro / AUC:hints=skip_train~false
        custom_metric=["Accuracy"],  # 'AUC / Accuracy,

        # ignored_features = ignored_features,

        # Регуляризация и ускорение
        colsample_bylevel=0.098,
        subsample=0.95,
        l2_leaf_reg=9,
        min_data_in_leaf=243,
        max_bin=187,
        random_strength=1,

        # Параметры скорения
        task_type="CPU",
        thread_count=-1,
        bootstrap_type="Bernoulli",

        # Важное!
        random_seed=7575,
        auto_class_weights="SqrtBalanced",
        early_stopping_rounds=50)

    clfs.append(clf)

    clf.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=500,
        use_best_model=True,
        plot=False)

    scores.append(np.mean([v for k, v in clf.best_score_["validation"].items() if "Recall" in k], dtype="float16"))
    # scores.append(clf.best_score_['validation']['MultiClass'])
    # clf.save_model("../tmp_data/cool_catboost_model_{}_deep".format(n))

assert len(clfs) == n_splits
print("mean Recall score --------->", np.mean(scores, dtype="float16") - np.std(scores, dtype="float16"))

0:	learn: 1.9364031	test: 1.9365260	best: 1.9365260 (0)	total: 29.7ms	remaining: 1m 43s
500:	learn: 1.7026250	test: 1.7354706	best: 1.7354706 (500)	total: 12.9s	remaining: 1m 17s
1000:	learn: 1.6741397	test: 1.7305117	best: 1.7304867 (995)	total: 26s	remaining: 1m 4s
1500:	learn: 1.6539617	test: 1.7288778	best: 1.7288695 (1499)	total: 38.5s	remaining: 51.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.728690501
bestIteration = 1599

Shrink model to first 1600 iterations.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0:	learn: 1.9346899	test: 1.9347919	best: 1.9347919 (0)	total: 33.5ms	remaining: 1m 57s
500:	learn: 1.7049453	test: 1.7334302	best: 1.7334127 (498)	total: 12.5s	remaining: 1m 14s
1000:	learn: 1.6770408	test: 1.7275899	best: 1.7275899 (1000)	total: 25.2s	remaining: 1m 3s
1500:	learn: 1.6556093	test: 1.7252858	best: 1.7252858 (1500)	total: 38.5s	remaining: 51.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.724813979
bestIteration = 1744

Shrink model to first 1745 iterations.
0:	learn: 1.9332306	test: 1.9332694	best: 1.9332694 (0)	total: 26.6ms	remaining: 1m 32s
500:	learn: 1.7044006	test: 1.7340893	best: 1.7340893 (500)	total: 12.6s	remaining: 1m 15s
1000:	learn: 1.6762689	test: 1.7282249	best: 1.7281910 (994)	total: 25.3s	remaining: 1m 3s
1500:	learn: 1.6554174	test: 1.7258963	best: 1.7258354 (1491)	total: 38s	remaining: 50.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.72538138
bestIteration = 1676

Shrink model to first 1677 iterations.


In [120]:
clf = clfs[1]

In [83]:
accuracy_score(y_test, clf.predict(X_test))

0.21807142857142858

In [71]:
clfs

[<catboost.core.CatBoostClassifier at 0x1845ecb90>,
 <catboost.core.CatBoostClassifier at 0x18461cd90>,
 <catboost.core.CatBoostClassifier at 0x184516610>]

In [264]:
from sklearn.model_selection import GridSearchCV

import random

In [324]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [432]:
X = scaler.fit_transform(X)

In [504]:
y

1         4.0
2         5.0
3         3.0
4         5.0
5         2.0
         ... 
109136    3.0
109138    2.0
109139    0.0
109141    0.0
109142    0.0
Name: target, Length: 70000, dtype: float64

In [524]:
import lightgbm as lgb
from sklearn.model_selection import KFold, cross_val_score
train_data = lgb.Dataset(X_train, label=y_train)

# Определение параметров модели
params = {
    'objective': 'multiclass',  # Для задачи классификации с несколькими классами
    'num_class': 7,             # Количество классов
    'boosting_type': 'gbdt',    # Тип бустинга (градиентный бустинг)
    'metric': 'multi_logloss',  # Метрика для оценки
    'num_leaves': 31,           # Максимальное количество листьев в дереве
    'learning_rate': 0.05,      # Скорость обучения
    'feature_fraction': 0.9     # Доля признаков, используемых для каждого дерева
}
model = lgb.train(params, train_data, num_boost_round=100)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42695
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 256
[LightGBM] [Info] Start training from score -1.537865
[LightGBM] [Info] Start training from score -1.356805
[LightGBM] [Info] Start training from score -1.959882
[LightGBM] [Info] Start training from score -2.165934
[LightGBM] [Info] Start training from score -2.163289
[LightGBM] [Info] Start training from score -2.539438
[LightGBM] [Info] Start training from score -2.547162


In [525]:
predict = np.argmax(model.predict(X_test), axis =1)

In [526]:
accuracy_score(y_test, predict)

0.3395714285714286

In [444]:

model_6 = CatBoostClassifier(random_state=228)

model_6.fit(X, y)

Learning rate set to 0.098271
0:	learn: 1.9093110	total: 184ms	remaining: 3m 3s
1:	learn: 1.8781515	total: 331ms	remaining: 2m 45s
2:	learn: 1.8506229	total: 482ms	remaining: 2m 40s
3:	learn: 1.8294475	total: 625ms	remaining: 2m 35s
4:	learn: 1.8112713	total: 761ms	remaining: 2m 31s
5:	learn: 1.7956161	total: 906ms	remaining: 2m 30s
6:	learn: 1.7827895	total: 1.05s	remaining: 2m 29s
7:	learn: 1.7723121	total: 1.2s	remaining: 2m 28s
8:	learn: 1.7620170	total: 1.33s	remaining: 2m 26s
9:	learn: 1.7544381	total: 1.46s	remaining: 2m 24s
10:	learn: 1.7477404	total: 1.58s	remaining: 2m 21s
11:	learn: 1.7408116	total: 1.72s	remaining: 2m 21s
12:	learn: 1.7360767	total: 1.86s	remaining: 2m 21s
13:	learn: 1.7317206	total: 1.99s	remaining: 2m 20s
14:	learn: 1.7268939	total: 2.12s	remaining: 2m 19s
15:	learn: 1.7228191	total: 2.26s	remaining: 2m 18s
16:	learn: 1.7188062	total: 2.4s	remaining: 2m 18s
17:	learn: 1.7153999	total: 2.53s	remaining: 2m 17s
18:	learn: 1.7129146	total: 2.65s	remaining: 2m

<catboost.core.CatBoostClassifier at 0x194624f90>

In [None]:
# model_1 LGBM
# model_2 LGBM
# model_3 LGBM
# model_4 Cat
# model_5 Cat

In [350]:
probabilities = model_1.predict(X_test)
labels_1 = np.argmax(probabilities, axis=1)
probabilities = model_2.predict(X_test)
labels_2 = np.argmax(probabilities, axis=1)
probabilities = model_3.predict(X_test)
labels_3 = np.argmax(probabilities, axis=1)

In [352]:
predictions = [
    labels_1,
    labels_2,
    labels_3,
    model_4.predict(X_test).squeeze(),
    model_5.predict(X_test).squeeze()
]
final_predictions = []
for i in range(len(X_test)):
    votes = [pred[i] for pred in predictions]
    final_predictions.append(max(set(votes), key=votes.count))

# Оценка точности
accuracy = accuracy_score(y_test, final_predictions)
print("Точность жесткого голосования:", accuracy)

Точность жесткого голосования: 0.43257142857142855


In [342]:
probabilities = model_2.predict(X_test)
labels = np.argmax(probabilities, axis=1)

In [348]:
model_3.predict(X_test)

array([[0.05934093, 0.05618893, 0.09238443, ..., 0.1749005 , 0.19075265,
        0.21193392],
       [0.37209167, 0.25788615, 0.11820519, ..., 0.06618853, 0.04329109,
        0.06111785],
       [0.28034986, 0.19306499, 0.15574932, ..., 0.07406408, 0.08035197,
        0.13188955],
       ...,
       [0.1542287 , 0.1907856 , 0.19955086, ..., 0.11744975, 0.07258142,
        0.05148487],
       [0.27328021, 0.53488897, 0.10797629, ..., 0.02102638, 0.01558523,
        0.00843231],
       [0.06633449, 0.12413763, 0.15468006, ..., 0.26222479, 0.08498252,
        0.11328225]])

In [270]:
accuracy_score(y_test, model.predict(X_test))

0.3415

In [170]:
mean_absolute_error(y_test, model.predict(X_test))

1.3852857142857142

In [50]:
import h2o
from h2o.automl import H2OAutoML

In [54]:
h2o.init()
h2o_data = h2o.H2OFrame(df_final)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16.1" 2022-08-12 LTS; OpenJDK Runtime Environment Microsoft-40648 (build 11.0.16.1+1-LTS); OpenJDK 64-Bit Server VM Microsoft-40648 (build 11.0.16.1+1-LTS, mixed mode)
  Starting server from /opt/anaconda3/lib/python3.11/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5s/994f51mx1297jxm2p0w67qs00000gn/T/tmp3tangtp7
  JVM stdout: /var/folders/5s/994f51mx1297jxm2p0w67qs00000gn/T/tmp3tangtp7/h2o_ivanmosagin_started_from_python.out
  JVM stderr: /var/folders/5s/994f51mx1297jxm2p0w67qs00000gn/T/tmp3tangtp7/h2o_ivanmosagin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 9 days
H2O_cluster_name:,H2O_from_python_ivanmosagin_bbriyx
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [78]:
df_client_notarg = df_client[df_client['target'].isna()].drop('target', axis = 1)
leader = aml.leader

In [80]:
test_data = h2o.H2OFrame(df_client_notarg)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [82]:
predictions = leader.predict(test_data)

# Вывод предсказаний
print("Предсказания:", predictions)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Предсказания:   predict
 2.47704
 5.12899
 1.67741
 4.48111
 0.919653
 3.29261
 2.45883
 3.95415
 2.10664
 3.71894
[39143 rows x 1 column]



In [84]:
predictions

predict
2.47704
5.12899
1.67741
4.48111
0.919653
3.29261
2.45883
3.95415
2.10664
3.71894


In [190]:

model = CatBoostClassifier(random_state=42)
model.fit(X, y)

Learning rate set to 0.098271
0:	learn: 1.9045235	total: 224ms	remaining: 3m 43s
1:	learn: 1.8733302	total: 385ms	remaining: 3m 11s
2:	learn: 1.8470814	total: 540ms	remaining: 2m 59s
3:	learn: 1.8271349	total: 676ms	remaining: 2m 48s
4:	learn: 1.8099908	total: 816ms	remaining: 2m 42s
5:	learn: 1.7950290	total: 943ms	remaining: 2m 36s
6:	learn: 1.7825880	total: 1.08s	remaining: 2m 33s
7:	learn: 1.7715342	total: 1.22s	remaining: 2m 31s
8:	learn: 1.7627129	total: 1.36s	remaining: 2m 30s
9:	learn: 1.7540408	total: 1.49s	remaining: 2m 27s
10:	learn: 1.7471846	total: 1.63s	remaining: 2m 26s
11:	learn: 1.7409923	total: 1.75s	remaining: 2m 24s
12:	learn: 1.7365032	total: 1.89s	remaining: 2m 23s
13:	learn: 1.7311161	total: 2.03s	remaining: 2m 22s
14:	learn: 1.7267332	total: 2.16s	remaining: 2m 21s
15:	learn: 1.7228073	total: 2.29s	remaining: 2m 20s
16:	learn: 1.7191943	total: 2.42s	remaining: 2m 20s
17:	learn: 1.7155367	total: 2.55s	remaining: 2m 19s
18:	learn: 1.7125066	total: 2.68s	remaining:

<catboost.core.CatBoostClassifier at 0x1b9609b10>

In [132]:
df_client_notarg = df_client[df_client['target'].isna()].drop('target', axis = 1)
X = df_client_notarg.drop('client_num', axis = 1)
df_client_notarg.shape

(39143, 332)

In [59]:
len(X)

39143

In [448]:
perdict = pd.read_csv('perdict.csv')

In [450]:
per = perdict['target'].values

In [452]:
len(per)

39143

In [470]:
probabilities = model_1.predict(X)
labels_1 = np.argmax(probabilities, axis=1)
probabilities = model_2.predict(X)
labels_2 = np.argmax(probabilities, axis=1)
probabilities = model_3.predict(X)
labels_3 = np.argmax(probabilities, axis=1)
predictions = [
    per,
    labels_1,
    model_4.predict(X).squeeze()
]

In [472]:
len(probabilities)

39143

In [474]:
final_predictions = []
for i in range(len(X)):
    votes = [pred[i] for pred in predictions]
    final_predictions.append(max(set(votes), key=votes.count))


In [476]:
len(final_predictions)

39143

In [478]:
df_client_notarg.shape

(39143, 333)

In [532]:
final_predictions = model.predict(X)

In [538]:
final_predictions = np.argmax(final_predictions, axis = 1)

In [134]:
final_predictions = clf.predict(X)

In [136]:
list(final_predictions)

[array([5.]),
 array([6.]),
 array([1.]),
 array([5.]),
 array([0.]),
 array([4.]),
 array([3.]),
 array([5.]),
 array([3.]),
 array([4.]),
 array([1.]),
 array([6.]),
 array([1.]),
 array([5.]),
 array([0.]),
 array([1.]),
 array([6.]),
 array([3.]),
 array([1.]),
 array([1.]),
 array([4.]),
 array([4.]),
 array([1.]),
 array([1.]),
 array([5.]),
 array([4.]),
 array([0.]),
 array([4.]),
 array([2.]),
 array([1.]),
 array([2.]),
 array([6.]),
 array([0.]),
 array([4.]),
 array([3.]),
 array([3.]),
 array([3.]),
 array([6.]),
 array([4.]),
 array([1.]),
 array([2.]),
 array([2.]),
 array([5.]),
 array([4.]),
 array([3.]),
 array([3.]),
 array([4.]),
 array([5.]),
 array([4.]),
 array([5.]),
 array([6.]),
 array([5.]),
 array([1.]),
 array([2.]),
 array([4.]),
 array([4.]),
 array([5.]),
 array([4.]),
 array([1.]),
 array([1.]),
 array([5.]),
 array([4.]),
 array([1.]),
 array([6.]),
 array([3.]),
 array([4.]),
 array([6.]),
 array([4.]),
 array([3.]),
 array([1.]),
 array([6.]),
 array

In [138]:
df_client_notarg['target'] = final_predictions

In [140]:
df_client_notarg[['client_num', 'target']]

Unnamed: 0,client_num,target
0,0,5.0
10,10,6.0
11,11,1.0
14,14,5.0
16,16,0.0
...,...,...
109127,109127,0.0
109128,109128,0.0
109130,109130,1.0
109137,109137,0.0


In [142]:
df_client_notarg[['client_num', 'target']].to_csv('prob_pred.csv', index=False)

In [204]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42, n_estimators= 200)
# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Оценка модели
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

ValueError: Found input variables with inconsistent numbers of samples: [39143, 70000]

In [208]:
# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Оценка модели
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

In [210]:
scores

array([0.32830357, 0.32508929, 0.33214286, 0.33464286, 0.32875   ])

In [212]:
model.fit(X,y)