In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime as dt
import boto3
from io import BytesIO
import pickle
import os
from dotenv import load_dotenv
import scipy
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import MinMaxScaler
import sklearn

plt.style.use('ggplot')


In [2]:
load_dotenv()

S3_BUCKET_NAME=os.getenv("S3_BUCKET_NAME")
AWS_ACCESS_KEY_ID=os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY=os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
# определение функций для работы с s3
def upload_to_s3(df, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    
    with BytesIO() as buffer:
        df.to_parquet(buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
    
    
def download_from_s3(file_name) -> pd.DataFrame:
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pd.read_parquet(buffer)
    
    
def upload_pickle_to_s3(object, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        pickle.dump(object, buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
        
def download_pickle_from_s3(file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pickle.load(buffer)

def create_interaction_matrix(data: pd.DataFrame):

    interaction_matrix = data[['client_id'] + target_cols]
    interaction_matrix = interaction_matrix.groupby('client_id').sum().reset_index()
    
    interaction_matrix['accounts'] = [[targets] for targets in interaction_matrix[target_cols].values]
    interaction_matrix['accounts'] = interaction_matrix['accounts'].apply(lambda x: x[0])
    interaction_matrix['accounts_name'] = [list(id_to_acc.keys())] * interaction_matrix.shape[0]
    
    interaction_matrix = interaction_matrix[['client_id', 'accounts', 'accounts_name']]
    
    interaction_matrix_exploded = list()
    interaction_matrix = interaction_matrix.values

    for row in interaction_matrix:
        accounts = row[1]
        accounts_name = row[2]
        client_id = row[0]
        for account, account_name in zip(accounts, accounts_name):
            interaction_matrix_exploded.append((client_id, account, account_name))

    interaction_matrix_exploded = pd.DataFrame(interaction_matrix_exploded, columns=['client_id', 'account', 'account_name'])
    interaction_matrix_exploded = interaction_matrix_exploded.rename(columns={'account': 'target'})
    interaction_matrix_exploded['target'] = (interaction_matrix_exploded['target'] > 0).astype(int)
    # interaction_matrix_exploded = interaction_matrix_exploded[interaction_matrix_exploded['account'] > 0]

    
    return interaction_matrix_exploded

## Выгрузка данных

In [4]:
data = download_from_s3('bank_products_processed.parquet')
split_date = '2016-01-28'

train_data = data[data['div_data'] <= split_date]
test_data = data[data['div_data'] > split_date]

target_cols = [col for col in train_data.columns if col.startswith('acc_')]

acc_to_id = {acc: id for id, acc in enumerate(target_cols)}
id_to_acc = {id: acc for id, acc in enumerate(target_cols)}

In [5]:
als_model = download_pickle_from_s3('als_model.pkl')
client_enc = download_pickle_from_s3('client_enc.pkl')
user_item_matrix_train = download_pickle_from_s3('user_item_matrix_train.pkl')


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Генерация фичей для второй модели

### Получение рекомендаций ALS как признаков для модели

In [19]:
user_item_matrix_train.shape[0]

703778

In [28]:
user_item_matrix_train[0]

<1x24 sparse matrix of type '<class 'numpy.float32'>'
	with 4 stored elements in Compressed Sparse Row format>

In [39]:
features_recommendations = als_model.recommend(list(range(user_item_matrix_train.shape[0])), user_item_matrix_train, filter_already_liked_items=True, N=24)

In [40]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = features_recommendations[0]
als_scores = features_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": list(range(user_item_matrix_train.shape[0])),
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = client_enc.inverse_transform(als_recommendations["user_id_enc"])
# als_recommendations["item_id"] = als_recommendations["item_id_enc"].map(id_to_acc)
als_recommendations = als_recommendations.drop(columns=["user_id_enc"])

In [41]:
upload_to_s3(als_recommendations, "als_recommendations.parquet")

### Агрегирование признаков в связке клиент-продукт

In [6]:
als_recommendations = download_from_s3('als_recommendations.parquet')

In [7]:
train_data.head().T

Unnamed: 0,0,1,2,3,4
age,35,23,23,22,23
work_expirience,6,35,35,35,35
income,87218.1,35548.74,122179.11,119775.54,101828.34
days_from_first_contract,3886,4771,4771,4771,4771
days_from_premium,,,,,
id,1,2,3,4,5
is_last_6m_contract,0,0,0,0,0
is_new_client,1,1,1,1,1
is_resident,1,1,1,1,1
has_bank_spouse,0,0,0,0,0


Как известно из EDA, следующие признаки оказывают наибольшее влияние на целевые переменные:  
* acc_salary
* is_KHE
* is_KAT
* vip_status
* is_capital
* is_active
* work_expirience
* days_from_first_contract
* income 

In [8]:
clients_features = train_data.sort_values(by='div_data', ascending=False).groupby('client_id', as_index=False).agg({
    'is_KHE': 'mean',
    'is_KAT': 'mean',
    'vip_status': 'last',
    'is_capital': 'last',
    'is_active': 'last',
    'work_expirience': 'last',
    'days_from_first_contract': 'last',
    'income': 'mean'
})

In [9]:
train_matrix = create_interaction_matrix(train_data)
test_matrix = create_interaction_matrix(test_data)

In [10]:
train_matrix.head()

Unnamed: 0,client_id,target,account_name
0,15889,0,0
1,15889,0,1
2,15889,1,2
3,15889,0,3
4,15889,0,4


In [11]:
als_recommendations = als_recommendations.merge(clients_features.rename(columns={'client_id': 'user_id'}), on='user_id', how='left')

In [12]:
train_matrix.rename(columns={'account_name': 'item_id_enc', 'client_id': 'user_id'}, inplace=True)
test_matrix.rename(columns={'account_name': 'item_id_enc', 'client_id': 'user_id'}, inplace=True)


In [13]:
als_recommendations = als_recommendations.merge(train_matrix, on=['user_id', 'item_id_enc'], how='left')

## Обучение модели

In [14]:
als_recommendations.head()

Unnamed: 0,item_id_enc,score,user_id,is_KHE,is_KAT,vip_status,is_capital,is_active,work_expirience,days_from_first_contract,income,target
0,7,0.001877,15889,0.0,1.0,1,1,1,245,11187,326124.9,0
1,13,0.001847,15889,0.0,1.0,1,1,1,245,11187,326124.9,0
2,15,0.001567,15889,0.0,1.0,1,1,1,245,11187,326124.9,0
3,3,0.001221,15889,0.0,1.0,1,1,1,245,11187,326124.9,0
4,1,0.000954,15889,0.0,1.0,1,1,1,245,11187,326124.9,0


In [15]:
cb_model = CatBoostClassifier(
    random_state=42,
    iterations=50
)

cb_model.fit(als_recommendations.drop(columns=['target', 'user_id', 'item_id_enc']), als_recommendations['target'])

Learning rate set to 0.5
0:	learn: 0.3133432	total: 1.03s	remaining: 50.4s
1:	learn: 0.1727958	total: 2.14s	remaining: 51.3s
2:	learn: 0.1007872	total: 3.35s	remaining: 52.4s
3:	learn: 0.0602077	total: 4.41s	remaining: 50.7s
4:	learn: 0.0372887	total: 5.27s	remaining: 47.5s
5:	learn: 0.0238787	total: 6.15s	remaining: 45.1s
6:	learn: 0.0158598	total: 7.07s	remaining: 43.5s
7:	learn: 0.0112826	total: 8.05s	remaining: 42.3s
8:	learn: 0.0086080	total: 8.93s	remaining: 40.7s
9:	learn: 0.0070502	total: 9.72s	remaining: 38.9s
10:	learn: 0.0061737	total: 10.6s	remaining: 37.6s
11:	learn: 0.0058169	total: 11.4s	remaining: 36.2s
12:	learn: 0.0054755	total: 12.4s	remaining: 35.3s
13:	learn: 0.0052865	total: 13.3s	remaining: 34.3s
14:	learn: 0.0051836	total: 14.5s	remaining: 33.8s
15:	learn: 0.0051456	total: 15.6s	remaining: 33.1s
16:	learn: 0.0050867	total: 17s	remaining: 32.9s
17:	learn: 0.0050395	total: 18.2s	remaining: 32.4s
18:	learn: 0.0050022	total: 19.1s	remaining: 31.1s
19:	learn: 0.00498

<catboost.core.CatBoostClassifier at 0x7fc4d80cc760>

In [49]:
for name, importance in zip(cb_model.feature_names_, np.round(cb_model.feature_importances_, 2)):
    print(f"{name}: {importance}")

score: 87.09
is_KHE: 1.98
is_KAT: 0.01
vip_status: 0.17
is_capital: 0.71
is_active: 3.88
work_expirience: 3.08
days_from_first_contract: 3.04
income: 0.05


In [16]:
upload_pickle_to_s3(cb_model, 'cb_model.pkl')

## Evaluation

In [17]:
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, accuracy_score

In [18]:
cb_model = download_pickle_from_s3('cb_model.pkl')

In [None]:
als_recommendations = als_recommendations.merge(train_matrix.rename(columns={'target': 'is_used'}), on=['user_id', 'item_id_enc'], how='left')
als_recommendations = als_recommendations[als_recommendations['is_used'] == 0] # оставим только те продукты, которые не покупались ранее

In [21]:
als_recommendations_test = als_recommendations.drop(columns=['target']).merge(test_matrix, on=['user_id', 'item_id_enc'], how='left')

In [22]:
predictions = cb_model.predict(als_recommendations_test.drop(columns=['target', 'user_id', 'item_id_enc']))
probas = cb_model.predict_proba(als_recommendations_test.drop(columns=['target', 'user_id', 'item_id_enc']))[:, 1]

In [23]:
als_recommendations_test['predictions'] = predictions
als_recommendations_test['probas'] = probas

upload_to_s3(als_recommendations_test, 'als_recommendations_test.parquet')

### Расчет метрик классификации

In [31]:
als_recommendations_test['target'].fillna(0, inplace=True)

In [32]:
roc_auc = roc_auc_score(als_recommendations_test['target'], probas)
print(f"ROC AUC: {roc_auc}")

ROC AUC: 0.8545509098765844


In [33]:
f1 = f1_score(als_recommendations_test['target'], predictions)
print(f"F1: {f1}")


F1: 0.0


### Расчет метрик рекомендаций

In [34]:
def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg

def compute_hit_rate(items: pd.Series, recommendations: pd.Series, k: int = 5):

    """ подсчёт hit rate
    items: истинные айтемы
    recommendations: рекоменуемые айтемы
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    hit_cnt = 0
    rate_cnt = 0
    
    for rec_item in recommendations:
        if rec_item in items:
            rate_cnt += 1
        hit_cnt += 1
        if hit_cnt >= k:
            break
    return rate_cnt / len(items)

def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    # events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common 

def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall 

In [37]:
users_coverage = len(als_recommendations_test['user_id'].unique()) / len(test_data['client_id'].unique())
items_coverage = len(als_recommendations_test['item_id_enc'].unique()) / len(target_cols)

print(f"Users coverage: {users_coverage}")
print(f"Items coverage: {items_coverage}")

Users coverage: 0.7514852922961274
Items coverage: 1.0


In [38]:
als_recommendations_test.head()

Unnamed: 0,item_id_enc,score,user_id,is_KHE,is_KAT,vip_status,is_capital,is_active,work_expirience,days_from_first_contract,income,is_used,target,predictions,probas
0,7,0.001877,15889,0.0,1.0,1,1,1,245,11187,326124.9,0,0.0,0,0.001427
1,13,0.001847,15889,0.0,1.0,1,1,1,245,11187,326124.9,0,0.0,0,0.001427
2,15,0.001567,15889,0.0,1.0,1,1,1,245,11187,326124.9,0,0.0,0,0.001272
3,3,0.001221,15889,0.0,1.0,1,1,1,245,11187,326124.9,0,0.0,0,0.000803
4,1,0.000954,15889,0.0,1.0,1,1,1,245,11187,326124.9,0,0.0,0,0.000988


In [None]:
rating_test_idx = ~(als_recommendations_test["target"] == 0)
ndcg_at_5_scores = als_recommendations_test[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["target"], x["probas"], k=5))
# ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating"], x["score"], k=5))

print(f"NDCG at 5: {ndcg_at_5_scores.mean()}")

NDCG at 5: 1.0


In [45]:
precision_5, recall_5 = compute_cls_metrics(process_events_recs_for_binary_metrics(
  None,
    test_matrix.rename(columns={"item_id_enc": "item_id"}), 
    als_recommendations_test.rename(columns={"item_id_enc": "item_id"}), 
    top_k=5) )


Common users: 697114


In [46]:
print(f"Precision at 5: {precision_5}, Recall at 5: {recall_5}")


Precision at 5: 1.0, Recall at 5: 0.20833333333333343


## Логирование в MlFlow

In [50]:
import mlflow

* 'schema_extra' has been renamed to 'json_schema_extra'


In [51]:
# константы для логирования в mlflow

EXPERIMENT_NAME = "final_project_bank_alexdem"


os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5020

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [52]:
RUN_NAME = "improved_cb_model_bank"
REGISTRY_MODEL_NAME = "improved_cb_model_bank_alexdem"

pip_requirements= "../requirements.txt"
signature = mlflow.models.infer_signature(als_recommendations.drop(columns=['target', 'user_id', 'item_id_enc']), predictions)
input_example = als_recommendations_test.drop(columns=['target', 'user_id', 'item_id_enc'])[:10]


experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    model_info = mlflow.sklearn.log_model( 
			      cb_model,
            artifact_path="models",
            registered_model_name=REGISTRY_MODEL_NAME,
            pip_requirements=pip_requirements,
            signature=signature,
            input_example=input_example,
            await_registration_for=60
		)
    mlflow.log_metrics({'precision_5': precision_5, 'recall_5': recall_5, 'ndcg_5': ndcg_at_5_scores.mean(), "users_coverage": users_coverage, "items_coverage": items_coverage})
    mlflow.log_artifact("model_improvement.ipynb")


  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'improved_cb_model_bank_alexdem'.
2025/09/05 22:54:51 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: improved_cb_model_bank_alexdem, version 1
Created version '1' of model 'improved_cb_model_bank_alexdem'.
