<h2 align="center" style="color:Black"> Companion  Геоаналитика </h2>


<b>Решение:</b>
1. Строим матрицу признаков по активности клиентов в локациях
2. Используем кросс-валидацию <b>MultilabelStratifiedKFold</b>
3. Для загрузки решения обучаем и сохраняем 7 моделей <b>XGBClassifier</b> (по одной на фолд)
4. MBCE на валидации <b>9.48</b>, на паблике <b>9.27</b>

In [42]:
import json
import warnings
from collections import Counter
from pathlib import Path
from statistics import mean
from typing import List

import h3
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [43]:
# Contest metric
def mean_binary_cross_entropy(predictuion, target):
    """Return the mean over rows of the per-row summed binary cross-entropy.

    Both inputs are converted to plain NumPy arrays and compared element by
    element *by position*. Without that conversion, two DataFrames whose index
    labels differ would be aligned by label, produce NaNs, and those NaNs would
    be silently skipped by ``sum`` — yielding a misleadingly small score.

    Parameters
    ----------
    predictuion : array-like, 2-D
        Predicted probabilities. (The parameter name is kept as-is so callers
        passing it by keyword keep working.)
    target : array-like, 2-D
        Binary ground truth with the same shape and the same row/column order
        as ``predictuion``.

    Returns
    -------
    float
        Mean of the row-wise sums of the element-wise binary cross-entropy.
    """
    eps = 1e-8  # keeps log() away from 0
    probs = np.asarray(predictuion, dtype=float)
    truth = np.asarray(target, dtype=float)

    positive_term = -np.log(np.clip(probs, eps, 1 - eps)) * truth
    negative_term = -np.log(np.clip(1 - probs, eps, 1 - eps)) * (1 - truth)
    return (positive_term + negative_term).sum(axis=1).mean()

In [44]:
# Directory prefix for the input files (empty string here).
data_root = ''

# Text files listing H3 cell ids, one per line.
hexses_target_path = 'hexses_target.lst'
hexses_data_path = 'hexses_data.lst'

# Parquet inputs: transactions (features) and the training target.
train_data_fn = 'transactions.parquet'
train_target_fn = 'target.parquet'

In [45]:
# Target location ids: one per line, surrounding whitespace stripped.
with open(hexses_target_path, "r") as f:
    hexses_target = [line.strip() for line in f]

In [46]:
# Location ids used as feature columns: one per line, whitespace stripped.
with open(hexses_data_path, "r") as f:
    hexses_data = [line.strip() for line in f]

In [47]:
# Load the transactions table from the parquet file named by train_data_fn.
transactions = pd.read_parquet(train_data_fn)

In [48]:
# One [cell id, lat, lng] record per location in hexses_data, with the
# coordinates taken from h3.h3_to_geo.
data_coordinates = [
    [location, *h3.h3_to_geo(location)]
    for location in tqdm(hexses_data)
]
coordinates_df = pd.DataFrame(data_coordinates, columns=['h3_09', 'lat', 'lng'])

100%|███████████████████████████████████| 8154/8154 [00:00<00:00, 535317.35it/s]


In [49]:
# Attach lat/lng to every transaction via its h3_09 cell id (left join keeps
# all transactions).
# NOTE(review): this reassigns `transactions`, so re-running the cell merges a
# second time and pandas will suffix the duplicated lat/lng columns — run once
# per kernel session.
transactions = pd.merge(transactions, coordinates_df, how='left', on='h3_09')

In [50]:
# Map every target location id to the coordinates returned by h3.h3_to_geo.
hexses_target_dict = {
    location: h3.h3_to_geo(location)
    for location in tqdm(hexses_target)
}

100%|██████████████████████████████████| 1657/1657 [00:00<00:00, 1412593.85it/s]


In [51]:
# Target locations that are absent from the transactions location list.
# A set makes each membership test O(1) instead of rescanning the whole list.
known_locations = set(hexses_data)
loc_out = [x for x in hexses_target if x not in known_locations]
loc_out

['8911aa4c2a3ffff', '8911aa63473ffff', '8911aa79c23ffff']

In [52]:
# Long-format target: one row per (customer_id, h3_09) pair, flagged with v = 1.
target_long = pd.read_parquet(train_target_fn)
target_long = target_long.assign(
    customer_id=target_long.customer_id.astype(int),
    v=1.,
)

# Wide 0/1 matrix: rows are customers, columns are locations.
target = pd.pivot(target_long, index='customer_id', columns='h3_09', values='v')
target = target.reindex(sorted(target.columns), axis=1)  # columns in sorted order
target = target.sort_values(by='customer_id')            # rows in customer_id order
target = target.fillna(0)                                # pairs not present -> 0
target.shape

(69337, 1657)

In [53]:
"""
Эвристика основана на том, что чем чаще клиент совершает транзакции в локации тем больше вероятность,
что он снимет наличность.

"""
class Personal():
    """Frequency heuristic for per-location scores.

    For each customer, a target location's score is the share of the
    customer's transaction rows that fall in that location; locations with no
    activity get a constant fallback score.
    """

    def __init__(self, probability: float):
        # Constant score used for locations where the customer has no rows.
        self.probability = probability

    def fit_predict(self, transactions: pd.DataFrame, hexses_target: List[str]) -> pd.DataFrame:
        """Build the customer x location score table.

        Parameters
        ----------
        transactions : pd.DataFrame
            Must contain the columns ``customer_id`` and ``h3_09``.
        hexses_target : list of str
            Location ids to score; they become the output columns, in order.

        Returns
        -------
        pd.DataFrame
            One row per customer (in groupby order) with ``customer_id``
            followed by one column per entry of ``hexses_target``. An empty
            ``transactions`` frame yields an empty frame with those columns.
        """
        # Fixed once up front: the column list does not depend on the customer,
        # and it stays defined even when there are no customers at all.
        target_locations = list(hexses_target)

        customer_list = []
        for customer_id, data in tqdm(transactions.groupby("customer_id")):
            location = data.h3_09.tolist()
            total = len(location)
            # Count every location in a single pass instead of rescanning the
            # list for each target location.
            visits = Counter(location)
            loc_feature = [
                visits[loc] / total if loc in visits else self.probability
                for loc in target_locations
            ]
            customer_list.append([customer_id] + loc_feature)

        return pd.DataFrame(customer_list, columns=['customer_id'] + target_locations)

In [54]:
# Constant score for locations where a customer has no transactions.
# NOTE(review): the origin of 0.00110555 is not shown here — presumably tuned; confirm.
NO_ACTIVITY_PROBABILITY = 0.00110555

model = Personal(probability=NO_ACTIVITY_PROBABILITY)
customer_df = model.fit_predict(transactions, hexses_target)

100%|███████████████████████████████████| 69337/69337 [00:39<00:00, 1745.80it/s]


In [55]:
# Score the frequency heuristic against the target, with columns put in the target's order.
# NOTE(review): customer_df has a default positional index while target is
# indexed by customer_id; this assumes the rows of the two correspond — confirm.
print(f'MBCE: {mean_binary_cross_entropy(customer_df[target.columns],target)}')

MBCE: 9.972874883352398


In [58]:
def count_transform(df: pd.DataFrame,  hexses_data: List[str]) -> pd.DataFrame:
    """Build the per-customer activity feature matrix.

    For every customer, each location in ``hexses_data`` gets the sum of the
    customer's ``count`` values in that location, or -1 when the customer has
    no rows there.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``customer_id``, ``h3_09`` and ``count``.
    hexses_data : list of str
        Location ids; each becomes a ``<id>_count`` column, in order.

    Returns
    -------
    pd.DataFrame
        One row per customer (in groupby order): ``customer_id`` followed by
        the ``<id>_count`` columns. An empty ``df`` yields an empty frame with
        those columns.
    """
    # Column names do not depend on the customer: build them once, which also
    # keeps them defined when df has no rows.
    locations = list(hexses_data)
    feature_names = [loc + '_count' for loc in locations]

    data_features = []
    for customer_id, data in tqdm(df.groupby("customer_id")):
        # One grouped sum per customer replaces a boolean-mask scan of the
        # customer's rows for every single location.
        # observed=True keeps unused categories out if h3_09 is categorical.
        totals = (
            data.groupby('h3_09', sort=False, observed=True)['count']
            .sum()
            .to_dict()
        )
        loc_feature = [totals.get(loc, -1) for loc in locations]
        data_features.append([customer_id] + loc_feature)

    return pd.DataFrame(data_features, columns=['customer_id'] + feature_names)

In [59]:
# Build the per-customer location-count feature matrix from all transactions.
train_data = count_transform(transactions,hexses_data)

100%|████████████████████████████████████| 69337/69337 [06:48<00:00, 169.85it/s]


In [61]:
from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold, MultilabelStratifiedKFold
from sklearn.metrics import log_loss

# Per-fold scores, appended in fold order.
train_scores=[]
val_scores = []

MBCE_train_scores=[]
MBCE_val_scores = []

# 7-fold multilabel-stratified split with a fixed seed.
rmskf = MultilabelStratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# NOTE(review): train_data still contains the customer_id column, so it is fed
# to the model as a feature — confirm this is intended. Whatever loads the
# saved models must pass the same columns.
for fold, (train_index, test_index) in enumerate(rmskf.split(train_data, target)):
    print(f'Fold: {fold}')
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_val = target.iloc[train_index], target.iloc[test_index]

    # This call used to pass `depth=4`, which is not an XGBoost parameter
    # (the tree-depth parameter is `max_depth`); per the XGBoost docs an
    # unknown parameter is ignored and max_depth defaults to 6. The effective
    # value is now written out so the reported scores stay reproducible.
    # Set max_depth=4 here to try the depth that was apparently intended.
    model = XGBClassifier(n_estimators=400,max_depth=6,nthread = -1,sampling_method='gradient_based',
                      objective="binary:logistic",tree_method='gpu_hist')

    # Early stopping monitors the validation fold of this split.
    model.fit(X_train, y_train,eval_set=[(X_val, y_val)],
          verbose=5,early_stopping_rounds=5)
    # One saved model per fold.
    joblib.dump(model, f'model_{fold}.pkl')

    train_preds = model.predict_proba(X_train)
    valid_preds = model.predict_proba(X_val)

    train_score = log_loss(y_train, train_preds)
    val_score = log_loss(y_val, valid_preds)

    # Compute the contest metric once per split and reuse it for both the
    # printout and the per-fold history.
    mbce_train = mean_binary_cross_entropy(train_preds,y_train)
    mbce_val = mean_binary_cross_entropy(valid_preds,y_val)

    print(f'Результат log_loss на трейн: {train_score}')
    print(f'Результат log_loss на тесте: {val_score}')
    print('----------->')
    print(f'Результат MBCE на трейн: {mbce_train}')
    print(f'Результат MBCE на тесте: {mbce_val}')

    train_scores.append(train_score)
    val_scores.append(val_score)

    MBCE_train_scores.append(mbce_train)
    MBCE_val_scores.append(mbce_val)

Fold: 0
TRAIN: [    0     1     2 ... 69333 69334 69336] TEST: [    4     8    13 ... 69321 69331 69335]
[0]	validation_0-logloss:0.09535
[5]	validation_0-logloss:0.02387
[10]	validation_0-logloss:0.00922
[15]	validation_0-logloss:0.00624
[20]	validation_0-logloss:0.00572
[25]	validation_0-logloss:0.00569
[27]	validation_0-logloss:0.00571
Результат log_loss на трейн: 5.633122159449459
Результат log_loss на тесте: 9.166002289811422
----------->
Результат MBCE на трейн: 4.8513161005218
Результат MBCE на тесте: 9.41882829917683
Fold: 1
TRAIN: [    0     1     2 ... 69333 69335 69336] TEST: [    9    24    27 ... 69326 69327 69334]
[0]	validation_0-logloss:0.09537
[5]	validation_0-logloss:0.02389
[10]	validation_0-logloss:0.00926
[15]	validation_0-logloss:0.00630
[20]	validation_0-logloss:0.00579
[25]	validation_0-logloss:0.00576
[28]	validation_0-logloss:0.00579
Результат log_loss на трейн: 5.613165365026098
Результат log_loss на тесте: 9.289679481901196
----------->
Результат MBCE на тре

In [62]:
# Average of the per-fold validation MBCE values collected in MBCE_val_scores.
print(f'MBCE на валидации: {sum(MBCE_val_scores)/len(MBCE_val_scores)}')

MBCE на валидации: 9.48433945469394
