In [118]:
import utils as ut
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm import tqdm
from time import time
from sklearn.metrics import classification_report, roc_auc_score

pd.options.mode.chained_assignment = None

In [119]:
train = pd.read_csv('./data/to_train.csv')
test = pd.read_csv('./data/to_test.csv')


## Обучение модели на тестовых данных и проверка на валидационных:

In [120]:
# # Разделяем датасет на X, y
X, y = ut.split_to_x_y(train, 'gender')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

X_val.drop(columns=['id', 'Time'], inplace=True)
X_train.drop(columns=['id', 'Time'], inplace=True)

In [121]:
s = time()
# XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_val)
ut.print_classif_report('XGBClassifier', xgb_pred, y_val)
print(time() - s, "секунд модель обучалась")

XGBClassifier: 
- roc_auc_score: 0.9565
- accuracy_score: 0.9565
- f1_score: 0.9557

5.302814245223999 секунд модель обучалась


## Тестирование модели на отложенных данных:

In [122]:
test.head()

Unnamed: 0,mcc_code,amount,gender,id,Day,Time,Mean_net_money_per_transaction,Std_net_money_per_transaction,Mean_spend_money_per_transaction,Mean_earn_money_per_transaction,...,trans_desc_is_man,moto_cat,some_building_materials,computers,mens_relaxation,mens_clothes,womens_clothes,womens_beauty,womens_hobby,womens_moment
0,5411,0.0718,1,5da2eabcd7a953e750a8f77a3f296df6,333,23,0.1755,-0.6323,0.2854,-0.2333,...,0,0,0,0,0,0,0,0,0,0
1,6011,0.073,1,4f7485a708eedc86aa4f22b76fe47724,367,0,0.3179,0.05316,0.2197,-0.138,...,1,0,0,0,0,0,0,0,0,0
2,6010,0.0978,1,ef6db1e716daa37f342ff1c4866545ea,31,rare,0.1982,-0.586,0.267,-0.519,...,0,0,0,0,0,0,0,0,0,0
3,5411,0.01631,1,5fa2ed6c577aed609f6dc4fdb9d0b20a,66,23,-0.3005,-0.436,0.2001,-0.4202,...,0,0,0,0,0,0,0,0,0,0
4,6011,-0.10846,0,b4f418f44bea86b48f534aafddf199c2,360,13,0.1142,-0.7173,0.286,-0.6133,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# # Разделяем датасет на X, y
X, y = ut.split_to_x_y(test, 'gender')

X.drop(columns=['Time'], inplace=True)
y['id'] = X['id']
X.drop(columns=['id'], inplace=True)

# Датафрейм с гендером уникальный пользователей:
y_test = y.groupby(['id'])['gender'].unique().apply(lambda x: x[0])

### Сначала тест не для уникального пользователя, а для каждой транзакции пользователя:

In [124]:
predict = xgb_model.predict(X)

In [125]:
print(classification_report(y_true=y.gender, y_pred=predict))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1043
           1       0.96      0.95      0.95       957

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [126]:
### Предикт гендера для уникального пользователя:

In [127]:
predict = pd.DataFrame(predict)
# Добавим id пользователей
predict['id'] = y['id']
predict.index = predict.id
predict = predict.drop(columns='id')
predict = predict.rename(columns={0: 'y'})

### Получим предикт гендера для уникального пользователя. Просто возьмем тот прогноз, значений которого больше:

In [128]:
# Получаем нужный файл
user_id = y_test.index
res = pd.DataFrame()
res['id'] = 0
res['gender'] = 0

for i, cur_id in tqdm(enumerate(user_id)):
    df = predict.loc[cur_id]
    try:
        true_gen = df['y'].mode().iat[0]
    except:
        true_gen = df['y']
    res.loc[i, 'id'] = cur_id
    res.loc[i, 'gender'] = true_gen

res.index = res.id
res = res.drop(columns='id')
# ut.plot_feature_importance(xgb_model.feature_importances_, X_train.columns, 'XGBoost')

1549it [00:01, 1165.15it/s]


In [129]:
### Проверим, что индексы совпадают

In [130]:
real = pd.DataFrame(y_test).sort_values(by='id')
res = res.sort_values(by='id')
all(real.index == res.index)

True

In [131]:
print(classification_report(y_pred=res, y_true=real))
print('ROC_AUC:', roc_auc_score(y_score=res, y_true=real))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       825
           1       0.95      0.93      0.94       724

    accuracy                           0.95      1549
   macro avg       0.95      0.95      0.95      1549
weighted avg       0.95      0.95      0.95      1549

ROC_AUC: 0.9456387075171605
