In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [38]:
# 1. Загрузка данных
transactions = pd.read_csv('transactions.csv', sep=',').fillna(0)
transactions

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,0
1,39026145,1 10:19:29,6011,7010,56147.89,0
2,39026145,1 10:20:56,4829,2330,-56147.89,0
3,39026145,1 10:39:54,5499,1010,-1392.47,0
4,39026145,2 15:33:42,5499,1010,-920.83,0
...,...,...,...,...,...,...
6849341,61870738,453 16:03:02,5499,1010,-5176.84,10217113
6849342,61870738,454 10:54:60,5411,1010,-1652.77,022915
6849343,61870738,454 14:23:59,5499,1010,-4687.23,10217113
6849344,61870738,454 16:11:53,5541,1110,-4491.83,RU570124


In [39]:
gender_train = pd.read_csv('gender_train.csv', sep=',').fillna(0)
gender_train

Unnamed: 0,customer_id,gender
0,10928546,1
1,69348468,1
2,61009479,0
3,74045822,0
4,27979606,1
...,...,...
8395,90417572,0
8396,66837341,0
8397,10758984,1
8398,11376556,0


In [40]:
tr_mcc_codes = pd.read_csv('tr_mcc_codes.csv', sep=';').fillna(0)
tr_mcc_codes

Unnamed: 0,mcc_code,mcc_description
0,742,Ветеринарные услуги
1,1711,"Генеральные подрядчики по вентиляции, теплосна..."
2,1731,Подрядчики по электричеству
3,1799,"Подрядчики, специализированная торговля — нигд..."
4,2741,Разнообразные издательства/печатное дело
...,...,...
179,9211,"Судовые выплаты, включая алименты и детскую по..."
180,9222,Штрафы
181,9311,Налоговые платежи
182,9399,"Правительственные услуги, нигде ранее не класс..."


In [41]:
tr_types = pd.read_csv('tr_types.csv', sep=';').fillna(0)
tr_types

Unnamed: 0,tr_type,tr_description
0,3200,Плата за предоставление услуг посредством моби...
1,3210,Плата за предоставление отчета по счету карты ...
2,3800,Плата за обслуживание банковской карты (за пер...
3,4000,Плата за получение наличных в Сбербанке
4,4001,Плата за получение наличных в Сбербанке (в дру...
...,...,...
150,2990,Списание по требованию
151,2992,Списание средств для погашения задолженности п...
152,3001,Комиссия за обслуживание ссудного счета
153,3030,Плата за обслуживание банковской карты (за пос...


In [42]:
# 2. Обработка данных и создание признаков
transactions['tr_hour'] = transactions['tr_datetime'].str.split().str[1].str.split(':').str[0].astype(int)

In [43]:
# Группировка - базовые статистики по клиенту:
def create_features(df):
    features = pd.DataFrame()

    # Общее количество транзакций
    features['transaction_count'] = df.groupby('customer_id')['amount'].count()
    features['sum_amount'] = df.groupby('customer_id')['amount'].sum()
    features['mean_amount'] = df.groupby('customer_id')['amount'].mean()
    
    # Приходные транзакции (amount > 0)
    income = df[df['amount'] > 0]
    features['mean_income_amount'] = income.groupby('customer_id')['amount'].mean().fillna(0)
    features['min_income_amount'] = income.groupby('customer_id')['amount'].min().fillna(0)
    features['max_income_amount'] = income.groupby('customer_id')['amount'].max().fillna(0)
    
    # Расходные транзакции (amount < 0)
    expense = df[df['amount'] < 0]
    features['mean_expense_amount'] = expense.groupby('customer_id')['amount'].mean().fillna(0)
    features['min_expense_amount'] = expense.groupby('customer_id')['amount'].min().fillna(0)
    features['max_expense_amount'] = expense.groupby('customer_id')['amount'].max().fillna(0)
    
    # Уникальные mcc и терминалы
    features['unique_mcc_count'] = df.groupby('customer_id')['mcc_code'].nunique()
    features['unique_term_count'] = df.groupby('customer_id')['tr_type'].nunique()
    
    # Признаки по часам транзакций
    features['mean_tr_hour'] = df.groupby('customer_id')['tr_hour'].mean()
    
    # Средний час прихода
    features['mean_income_tr_hour'] = income.groupby('customer_id')['tr_hour'].mean().fillna(0)
    
    # Средний час расхода
    features['mean_expense_tr_hour'] = expense.groupby('customer_id')['tr_hour'].mean().fillna(0)
    
    return features

features = create_features(transactions)
features.head()

Unnamed: 0_level_0,transaction_count,sum_amount,mean_amount,mean_income_amount,min_income_amount,max_income_amount,mean_expense_amount,min_expense_amount,max_expense_amount,unique_mcc_count,unique_term_count,mean_tr_hour,mean_income_tr_hour,mean_expense_tr_hour
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6815,226,-1249454.87,-5528.561372,451092.183333,224.59,2470507.35,-17981.854409,-224591.58,-22.46,6,6,12.699115,12.833333,12.695455
22899,234,-629796.59,-2691.438419,85572.397794,88.04,875907.15,-38847.708675,-868607.92,-88.04,11,21,12.837607,13.544118,12.548193
27914,111,-2977026.82,-26820.061441,11710.848571,1122.96,89836.63,-44564.559474,-462658.65,-1122.96,6,7,11.486486,11.457143,11.5
28753,294,-26546282.86,-90293.479116,914148.1794,7860.71,3099363.77,-296121.687828,-3244539.76,-2245.92,30,15,6.503401,7.26,6.348361
31385,365,-2180003.58,-5972.612548,54405.845652,2245.92,199886.5,-10033.152135,-364366.15,-16.84,17,17,14.550685,9.217391,14.909357


In [44]:
# 3.1. Анализ MCC-кодов по полу
trans_train = transactions.merge(gender_train, on='customer_id', how='inner')
trans_train

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,tr_hour,gender
0,39026145,0 10:23:26,4814,1030,-2245.92,0,10,1
1,39026145,1 10:19:29,6011,7010,56147.89,0,10,1
2,39026145,1 10:20:56,4829,2330,-56147.89,0,10,1
3,39026145,1 10:39:54,5499,1010,-1392.47,0,10,1
4,39026145,2 15:33:42,5499,1010,-920.83,0,15,1
...,...,...,...,...,...,...,...,...
3751078,61870738,453 16:03:02,5499,1010,-5176.84,10217113,16,0
3751079,61870738,454 10:54:60,5411,1010,-1652.77,022915,10,0
3751080,61870738,454 14:23:59,5499,1010,-4687.23,10217113,14,0
3751081,61870738,454 16:11:53,5541,1110,-4491.83,RU570124,16,0


In [45]:
import pandas as pd
import numpy as np

epsilon = 1e-6  # небольшой стабилизирующий коэффициент

# Функция для подсчёта ratio и выбора топов по столбцу (column_name)
def get_top_gender_ratio_features(df, column_name, top_n=20, filter_ratio=0.1):
    # Считаем количество уникальных клиентов мужчин и женщин для каждого значения колонки
    counts = df.groupby([column_name, 'gender'])['tr_datetime'].count().unstack(fill_value=0)

    # Добавляем столбцы с 0, если отсутствуют
    if 0 not in counts.columns:
        counts[0] = 0
    if 1 not in counts.columns:
        counts[1] = 0

    # Считаем общее число транзакций по признаку (мужчины + женщины)
    counts['total'] = counts[0] + counts[1]

    # Отфильтровываем, оставляем только верхние (1 - filter_ratio) по популярности
    popular_threshold = counts['total'].quantile(filter_ratio)
    counts_filtered = counts[counts['total'] > popular_threshold]

    
    # Считаем ratio (% мужчин)
    counts_filtered['male_female_ratio'] = (counts_filtered[1]) / (counts_filtered[1] + counts_filtered[0])
    
    counts_sorted = counts_filtered.sort_values('male_female_ratio')
    print(counts_sorted)
    # Получаем топ N самых "женских" (минимальный ratio)
    top_female = counts_sorted.head(top_n).index.tolist()
    # Топ N самых "мужских" (максимальный ratio)
    top_male = counts_sorted.tail(top_n).index.tolist()
    
    return top_female, top_male


# Получаем топ-20 MCC кодов для женщин и мужчин
top_mcc_female, top_mcc_male = get_top_gender_ratio_features(trans_train, 'mcc_code', top_n=25, filter_ratio=0.6)

# Получаем топ-20 tr_type для женщин и мужчин
top_trtype_female, top_trtype_male = get_top_gender_ratio_features(trans_train, 'tr_type', top_n=15, filter_ratio=0.6)

print("Топ-20 'женских' MCC кодов:", top_mcc_female)
print("Топ-20 'мужских' MCC кодов:", top_mcc_male)

print("Топ-20 'женских' tr_type:", top_trtype_female)
print("Топ-20 'мужских' tr_type:", top_trtype_male)

### Весело, но это работает хуже чем взять просто самые популярные коды
top_mcc_gender = pd.Index(top_mcc_male).union(top_mcc_female)
top_trtype_gender = pd.Index(top_trtype_male).union(top_trtype_female)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_filtered['male_female_ratio'] = (counts_filtered[1]) / (counts_filtered[1] + counts_filtered[0])


gender        0      1  total  male_female_ratio
mcc_code                                        
5621       3750    755   4505           0.167592
5631       1411    325   1736           0.187212
5977      16056   4125  20181           0.204400
5651       6395   2435   8830           0.275764
5699       4321   1740   6061           0.287081
...         ...    ...    ...                ...
6051        692   2545   3237           0.786222
7538        249   1058   1307           0.809487
5533       2121   9694  11815           0.820482
7994        726   4769   5495           0.867880
7995       1754  11665  13419           0.869290

[74 rows x 4 columns]
gender        0       1   total  male_female_ratio
tr_type                                           
2320       1680     935    2615           0.357553
7030      46716   30375   77091           0.394015
2020       4483    3012    7495           0.401868
7020       4890    3362    8252           0.407416
1030     310032  229538  539570   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_filtered['male_female_ratio'] = (counts_filtered[1]) / (counts_filtered[1] + counts_filtered[0])


In [46]:
# 3.2. Добавляем в базу
mcc_counts = transactions[transactions['mcc_code'].isin(top_mcc_gender)].pivot_table(
    index='customer_id',
    columns='mcc_code',
    values='amount',
    aggfunc='count',
    fill_value=0
)
mcc_counts.columns = [f'mcc_{col}_count' for col in mcc_counts.columns]
mcc_counts

Unnamed: 0_level_0,mcc_3000_count,mcc_4121_count,mcc_4789_count,mcc_4812_count,mcc_4814_count,mcc_4816_count,mcc_5200_count,mcc_5211_count,mcc_5251_count,mcc_5261_count,...,mcc_7311_count,mcc_7523_count,mcc_7538_count,mcc_7832_count,mcc_7994_count,mcc_7995_count,mcc_7997_count,mcc_8021_count,mcc_8099_count,mcc_8999_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0,0,0,0,90,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22899,0,0,0,0,47,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27914,0,0,0,0,58,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28753,0,0,0,0,25,0,1,2,0,0,...,0,0,0,0,0,0,0,10,1,2
31385,0,0,0,1,125,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99984336,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99985917,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99988578,0,2,0,1,135,0,0,2,0,0,...,7,0,0,0,4,0,0,0,0,31
99991245,0,0,0,0,25,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# 3.3. Аналогично по tr_type
tr_type_counts = transactions[transactions['tr_type'].isin(top_trtype_gender)].pivot_table(
    index='customer_id',
    columns='tr_type',
    values='amount',
    aggfunc='count',
    fill_value=0
)
tr_type_counts.columns = [f'tr_type_{col}_count' for col in tr_type_counts.columns]
tr_type_counts

Unnamed: 0_level_0,tr_type_1010_count,tr_type_1030_count,tr_type_1100_count,tr_type_1110_count,tr_type_1200_count,tr_type_1210_count,tr_type_2010_count,tr_type_2011_count,tr_type_2020_count,tr_type_2110_count,...,tr_type_4110_count,tr_type_6110_count,tr_type_7010_count,tr_type_7011_count,tr_type_7020_count,tr_type_7021_count,tr_type_7030_count,tr_type_7031_count,tr_type_7070_count,tr_type_7071_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,42,90,0,18,0,0,70,0,0,0,...,0,0,5,0,0,0,0,1,0,0
22899,6,47,0,27,1,0,48,0,0,1,...,1,0,40,0,0,0,12,1,9,3
27914,0,58,0,1,2,0,12,0,0,0,...,0,0,33,0,2,0,0,0,0,0
28753,89,25,3,65,0,0,24,4,1,0,...,0,1,5,0,0,0,29,0,14,0
31385,75,125,0,75,0,0,59,1,0,0,...,0,8,5,2,0,4,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99984336,0,0,0,3,0,0,39,0,0,0,...,0,0,16,0,0,0,0,0,2,0
99985917,1,1,0,1,0,0,13,0,0,0,...,0,0,3,0,1,0,0,0,0,0
99988578,215,135,19,82,0,0,112,5,0,19,...,19,0,4,0,1,0,4,2,47,49
99991245,73,25,0,9,0,0,43,0,8,3,...,3,0,35,0,0,0,3,0,0,0


In [52]:
# 3.4. Добавляем признаки в общий датафрейм признаков
if 'customer_id' in features.columns:
    features = features.set_index('customer_id')
features = features.join(mcc_counts).join(tr_type_counts)
data = features.merge(gender_train, on='customer_id', how='inner').fillna(0)
features.reset_index(inplace=True)
data

Unnamed: 0,customer_id,transaction_count,sum_amount,mean_amount,mean_income_amount,min_income_amount,max_income_amount,mean_expense_amount,min_expense_amount,max_expense_amount,...,tr_type_6110_count,tr_type_7010_count,tr_type_7011_count,tr_type_7020_count,tr_type_7021_count,tr_type_7030_count,tr_type_7031_count,tr_type_7070_count,tr_type_7071_count,gender
0,22899,234,-629796.59,-2691.438419,85572.397794,88.04,875907.15,-38847.708675,-868607.92,-88.04,...,0,40,0,0,0,12,1,9,3,1
1,28753,294,-26546282.86,-90293.479116,914148.179400,7860.71,3099363.77,-296121.687828,-3244539.76,-2245.92,...,1,5,0,0,0,29,0,14,0,0
2,42096,936,-5529425.48,-5907.505855,160823.312424,6737.75,673774.73,-18556.050690,-337785.73,-25.38,...,0,2,1,0,36,20,0,5,2,0
3,49793,513,-24529283.63,-47815.367700,24525.401200,1122.96,112295.79,-51521.349713,-673774.73,-157.21,...,0,12,0,0,0,0,0,13,0,1
4,50940,118,-1709747.26,-14489.383559,4491.830000,4491.83,4491.83,-16638.200189,-110049.87,-539.02,...,0,0,0,0,0,0,0,12,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8395,99917144,670,-9347074.29,-13950.857149,11550.425714,2245.92,22459.16,-14220.101463,-527790.21,-9.88,...,0,7,0,0,0,0,0,0,0,0
8396,99967537,13,-1846142.77,-142010.982308,0.000000,0.00,0.00,-142010.982308,-336887.37,-2245.92,...,0,0,0,0,0,0,0,0,0,1
8397,99984336,63,-8196354.85,-130100.870635,113169.200000,44918.32,804037.85,-227408.898889,-606397.26,-4491.83,...,0,16,0,0,0,0,0,2,0,1
8398,99988578,826,-11154687.43,-13504.464201,72100.194019,898.37,673774.73,-26243.961321,-808529.68,-67.38,...,0,4,0,1,0,4,2,47,49,1


In [30]:
# 4. Обучение модели

X = data.drop(columns=['gender'])
y = data['gender']

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
print(X)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

      customer_id  transaction_count   sum_amount    mean_amount  \
0           22899                234   -629796.59   -2691.438419   
1           28753                294 -26546282.86  -90293.479116   
2           42096                936  -5529425.48   -5907.505855   
3           49793                513 -24529283.63  -47815.367700   
4           50940                118  -1709747.26  -14489.383559   
...           ...                ...          ...            ...   
8395     99917144                670  -9347074.29  -13950.857149   
8396     99967537                 13  -1846142.77 -142010.982308   
8397     99984336                 63  -8196354.85 -130100.870635   
8398     99988578                826 -11154687.43  -13504.464201   
8399     99999680                410 -16858696.35  -41118.771585   

      mean_income_amount  min_income_amount  max_income_amount  \
0           85572.397794              88.04          875907.15   
1          914148.179400            7860.71        

In [15]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

rfс_model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rfс_model.fit(X_train, y_train)

y_pred_proba = rfс_model.predict_proba(X_valid)[:, 1]
print('ROC-AUC на валидации:', roc_auc_score(y_valid, y_pred_proba))

ROC-AUC на валидации: 0.8526481956819321


In [16]:
# CatBoost — градиентный бустинг от Яндекса
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.01,
    depth=10,
    eval_metric='AUC',
    random_seed=42,
    verbose=20,
    early_stopping_rounds=100
)

catboost_model.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True)

y_pred_proba = catboost_model.predict_proba(X_valid)[:, 1]
print('ROC-AUC на валидации CatBoost:', roc_auc_score(y_valid, y_pred_proba))

0:	test: 0.7434282	best: 0.7434282 (0)	total: 206ms	remaining: 6m 51s
20:	test: 0.8183595	best: 0.8188155 (19)	total: 1.38s	remaining: 2m 9s
40:	test: 0.8257159	best: 0.8257159 (40)	total: 2.5s	remaining: 1m 59s
60:	test: 0.8323492	best: 0.8323492 (60)	total: 3.6s	remaining: 1m 54s
80:	test: 0.8357175	best: 0.8357175 (80)	total: 4.72s	remaining: 1m 51s
100:	test: 0.8379051	best: 0.8379051 (100)	total: 5.77s	remaining: 1m 48s
120:	test: 0.8406745	best: 0.8406745 (120)	total: 6.89s	remaining: 1m 46s
140:	test: 0.8430661	best: 0.8430661 (140)	total: 8s	remaining: 1m 45s
160:	test: 0.8452393	best: 0.8452393 (160)	total: 9.11s	remaining: 1m 44s
180:	test: 0.8472689	best: 0.8472689 (180)	total: 10.2s	remaining: 1m 42s
200:	test: 0.8491793	best: 0.8491793 (200)	total: 11.3s	remaining: 1m 40s
220:	test: 0.8504606	best: 0.8504606 (220)	total: 12.4s	remaining: 1m 39s
240:	test: 0.8521728	best: 0.8521728 (240)	total: 13.4s	remaining: 1m 37s
260:	test: 0.8534124	best: 0.8534124 (260)	total: 14.5s	

In [17]:
# LightGBM — градиентный бустинг от Microsoft
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

lgbm_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=500,
    random_state=42,
    n_jobs=-1
)

lgbm_model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric='auc')

y_pred_proba = lgbm_model.predict_proba(X_valid)[:, 1]
print('ROC-AUC на валидации LightGBM sklearn API:', roc_auc_score(y_valid, y_pred_proba))

[LightGBM] [Info] Number of positive: 2970, number of negative: 3750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7916
[LightGBM] [Info] Number of data points in the train set: 6720, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.441964 -> initscore=-0.233194
[LightGBM] [Info] Start training from score -0.233194
ROC-AUC на валидации LightGBM sklearn API: 0.862802018411614


In [18]:
from sklearn.ensemble import VotingClassifier

# soft voting ансамбль
voting_clf = VotingClassifier(
    estimators=[
        ('rfс', rfс_model),
        ('catboost', catboost_model),
        ('lgbm', lgbm_model)
    ],
    voting='soft',
    n_jobs=-1
)
voting_clf.fit(X_train, y_train)

y_pred_proba = voting_clf.predict_proba(X_valid)[:, 1]
print('ROC-AUC на валидации Soft Voting:', roc_auc_score(y_valid, y_pred_proba))

ROC-AUC на валидации Soft Voting: 0.8669301958801536


In [19]:
from sklearn.ensemble import VotingClassifier

# hard voting ансамбль
voting_hard_clf = VotingClassifier(
    estimators=[
        ('rfс', rfс_model),
        ('catboost', catboost_model),
        ('lgbm', lgbm_model)
    ],
    voting='hard',
    n_jobs=-1
)
voting_hard_clf.fit(X_train, y_train)

y_pred_class = voting_hard_clf.predict(X_valid)
from sklearn.metrics import accuracy_score
print('Accuracy на валидации Hard Voting:', accuracy_score(y_valid, y_pred_class))

Accuracy на валидации Hard Voting: 0.7880952380952381


In [20]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class PseudoHardVotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators):
        """
        estimators: list of tuples [('name', model), ...]
        все модели должны поддерживать методы fit, predict, predict_proba
        """
        self.estimators = estimators
        self.named_estimators_ = dict(estimators)
        
    def fit(self, X, y):
        for name, est in self.estimators:
            est.fit(X, y)
        return self
    
    def predict_proba(self, X):
        # Получаем вероятности от всех моделей, shape (n_samples, n_models)
        probas = np.array([est.predict_proba(X)[:,1] for _, est in self.estimators]).T
        
        # Получаем классы по порогу 0.5
        classes = (probas >= 0.5).astype(int)
        
        # Голоса за класс 1
        votes_for_1 = classes.sum(axis=1)
        
        # Определяем класс большинства (>= половины моделей)
        majority_class = (votes_for_1 >= (len(self.estimators)/2)).astype(int)
        
        # Для каждого объекта считаем среднее вероятностей только моделей,
        # которые поддержали большинство
        pseudo_probs = []
        for i in range(probas.shape[0]):
            mask = (classes[i] == majority_class[i])
            avg_prob = probas[i, mask].mean()
            pseudo_probs.append(avg_prob)
        pseudo_probs = np.array(pseudo_probs)
        
        # Возвращаем вероятности для обоих классов: [1 - pseudo, pseudo]
        return np.vstack([1 - pseudo_probs, pseudo_probs]).T
    
    def predict(self, X):
        proba = self.predict_proba(X)[:,1]
        return (proba >= 0.5).astype(int)


# Предполагается, что rf_model, catboost_model, lgbm_model — инициализированы (не обучены)

pseudo_hard_voting_clf = PseudoHardVotingClassifier([
    ('rfc', rfс_model),
    ('catboost', catboost_model),
    ('lgbm', lgbm_model)
])

# Обучение
pseudo_hard_voting_clf.fit(X_train, y_train)

# Предсказания вероятностей на валидации
y_pred_proba = pseudo_hard_voting_clf.predict_proba(X_valid)[:,1]

from sklearn.metrics import roc_auc_score
print('ROC-AUC на валидации Pseudo-Hard Voting:', roc_auc_score(y_valid, y_pred_proba))

0:	total: 73.6ms	remaining: 2m 27s
20:	total: 1.59s	remaining: 2m 30s
40:	total: 3.07s	remaining: 2m 26s
60:	total: 4.59s	remaining: 2m 26s
80:	total: 6.13s	remaining: 2m 25s
100:	total: 7.71s	remaining: 2m 24s
120:	total: 9.18s	remaining: 2m 22s
140:	total: 10.7s	remaining: 2m 20s
160:	total: 12.1s	remaining: 2m 18s
180:	total: 13.6s	remaining: 2m 16s
200:	total: 15.1s	remaining: 2m 14s
220:	total: 16.5s	remaining: 2m 13s
240:	total: 18s	remaining: 2m 11s
260:	total: 19.5s	remaining: 2m 9s
280:	total: 21s	remaining: 2m 8s
300:	total: 22.6s	remaining: 2m 7s
320:	total: 24.3s	remaining: 2m 6s
340:	total: 25.8s	remaining: 2m 5s
360:	total: 27.3s	remaining: 2m 3s
380:	total: 28.8s	remaining: 2m 2s
400:	total: 30.3s	remaining: 2m
420:	total: 31.8s	remaining: 1m 59s
440:	total: 33.3s	remaining: 1m 57s
460:	total: 34.7s	remaining: 1m 56s
480:	total: 36.3s	remaining: 1m 54s
500:	total: 37.9s	remaining: 1m 53s
520:	total: 39.5s	remaining: 1m 52s
540:	total: 41s	remaining: 1m 50s
560:	total: 42

In [72]:
# 5. Предсказания для теста
gender_test = pd.read_csv('gender_test_kaggle_sample_submission.csv', sep=',')

# Признаки для тестовой выборки
gender_test['customer_id'] = gender_test['customer_id'].astype(data.index.dtype)
# Теперь фильтруем
if 'customer_id' in features.columns:
    features = features.set_index('customer_id')
test_feats = features.loc[features.index.isin(gender_test['customer_id'])].fillna(0)
test_feats.reset_index(inplace=True)

submission = gender_test.copy()

test_pred_proba_RFC = rfс_model.predict_proba(test_feats)[:, 1]
submission['RFC_prediction'] = test_pred_proba_RFC

test_pred_proba_CatBoost = catboost_model.predict_proba(test_feats)[:, 1]
submission['CatBoost_prediction'] = test_pred_proba_CatBoost

test_pred_proba_LightGBM = lgbm_model.predict_proba(test_feats)[:, 1]
submission['LightGBM_prediction'] = test_pred_proba_LightGBM

test_pred_proba_sv = voting_clf.predict_proba(test_feats)[:, 1]
submission['softvoting_prediction'] = test_pred_proba_sv

test_pred_proba_hv = voting_hard_clf.predict(test_feats)
submission['hardvoting_prediction'] = test_pred_proba_hv

test_pred_proba_hv = pseudo_hard_voting_clf.predict_proba(test_feats)[:, 1]
submission['pseudohardvoting_prediction'] = test_pred_proba_hv

submission.to_csv('gender_prediction_submission.csv', index=False)
print('Предсказания сохранены в gender_prediction_submission.csv')

Предсказания сохранены в gender_prediction_submission.csv
