In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [4]:
df = pd.read_csv('grant_data_imb.csv')
df.head()

Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,...,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
0,0,97A,30B,A,321024.0,50.0,321013.0,30.0,291502.0,20.0,...,2563.0,25.0,Yes,>10 to 15,2.0,6.0,3.0,5.0,15.0,3.0
1,0,36D,10A,G,300201.0,100.0,0.0,0.0,0.0,0.0,...,1038.0,1.0,,Less than 0,0.0,3.0,0.0,4.0,0.0,0.0
2,0,317A,30D,,321013.0,100.0,0.0,0.0,0.0,0.0,...,2763.0,25.0,Yes,>5 to 10,4.0,3.0,6.0,25.0,14.0,14.0
3,0,62B,10B,B,321103.0,30.0,321105.0,40.0,321204.0,30.0,...,2848.0,25.0,,Less than 0,1.0,2.0,1.0,0.0,0.0,0.0
4,0,1A,10A,,270603.0,60.0,321205.0,30.0,320603.0,10.0,...,2678.0,25.0,Yes,>5 to 10,5.0,14.0,0.0,9.0,7.0,0.0


In [5]:
df.columns = [col.lower().replace('.', '_') for col in df.columns]

In [6]:
cat_features = df.select_dtypes(include=['object']).columns
for col in cat_features:
    df[col] = df[col].fillna('missing')

In [7]:
num_features = df.select_dtypes(include=['float64', 'int64']).columns.drop('grant_status')
for col in num_features:
    df[col] = df[col].fillna(df[col].median())

In [8]:
X = df.drop('grant_status', axis=1)
y = df['grant_status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

Получаем индексы категориальных признаков

In [9]:
cat_features_indices = [X.columns.get_loc(col) for col in cat_features]

In [17]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    cat_features=cat_features_indices,
    random_seed=42,
    scale_pos_weight=(len(y) - sum(y)) / sum(y)
)

In [19]:
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	test: 0.9117109	best: 0.9117109 (0)	total: 27.3ms	remaining: 27.3s
1:	test: 0.9234591	best: 0.9234591 (1)	total: 49.1ms	remaining: 24.5s
2:	test: 0.9200103	best: 0.9234591 (1)	total: 74.4ms	remaining: 24.7s
3:	test: 0.9180838	best: 0.9234591 (1)	total: 97.9ms	remaining: 24.4s
4:	test: 0.9222952	best: 0.9234591 (1)	total: 119ms	remaining: 23.7s
5:	test: 0.9284387	best: 0.9284387 (5)	total: 139ms	remaining: 23s
6:	test: 0.9304684	best: 0.9304684 (6)	total: 159ms	remaining: 22.5s
7:	test: 0.9326902	best: 0.9326902 (7)	total: 175ms	remaining: 21.8s
8:	test: 0.9323806	best: 0.9326902 (7)	total: 194ms	remaining: 21.4s
9:	test: 0.9331288	best: 0.9331288 (9)	total: 212ms	remaining: 21s
10:	test: 0.9343042	best: 0.9343042 (10)	total: 231ms	remaining: 20.7s
11:	test: 0.9361361	best: 0.9361361 (11)	total: 243ms	remaining: 20s
12:	test: 0.9363597	best: 0.9363597 (12)	total: 259ms	remaining: 19.7s
13:	test: 0.9374032	best: 0.9374032 (13)	total: 279ms	remaining: 19.7s
14:	test: 0.9373717	best: 0.

<catboost.core.CatBoostClassifier at 0x1f35a976490>

In [22]:
print(f"ROC-AUC: {roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])}")

ROC-AUC: 0.9523163809414597


In [26]:
feature_importance = model.get_feature_importance()
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance}).sort_values(by='Importance', ascending=False)

In [27]:
importance_df.head(10)

Unnamed: 0,Feature,Importance
2,contract_value_band___see_note_a,19.128569
33,number_of_unsuccessful_grant_1,16.69625
32,number_of_successful_grant_1,12.749457
0,sponsor_code,10.448711
1,grant_category_code,8.390486
31,no__of_years_in_uni_at_time_of_grant_1,2.673342
3,rfcd_code_1,2.369192
23,person_id_1,2.249204
25,year_of_birth_1,2.068835
35,a_1,2.049073


__Вывод:__

Значение ROC-AUC на валидационной выборке получилось 0.9523163809414597, что намного лучше результатов обучения случайного леса, бустинга и логистической регрессии