In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime as dt
import boto3
from io import BytesIO
import pickle
import os
from dotenv import load_dotenv
import scipy
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import MinMaxScaler

plt.style.use('ggplot')



In [2]:
load_dotenv()

S3_BUCKET_NAME=os.getenv("S3_BUCKET_NAME")
AWS_ACCESS_KEY_ID=os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY=os.getenv("AWS_SECRET_ACCESS_KEY")

In [6]:
# определение функций для работы с s3
def upload_to_s3(df, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    
    with BytesIO() as buffer:
        df.to_parquet(buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
    
    
def download_from_s3(file_name) -> pd.DataFrame:
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pd.read_parquet(buffer)
    
    
def upload_pickle_to_s3(object, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        pickle.dump(object, buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
        
def download_pickle_from_s3(file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pickle.load(buffer)

## Выгрузка данных

In [4]:
train_data = download_from_s3("train_data.parquet")
test_data = download_from_s3("test_data.parquet")


In [5]:
target_cols = [col for col in train_data.columns if col.startswith('acc_')]

## Baseline model  
Первый вариант модели с использованием только данных о взаимодействии клиентов с объектами

In [6]:
client_enc = LabelEncoder()

In [25]:
train_data['max_date'] = train_data['div_data'].max()
test_data['max_date'] = test_data['div_data'].max()

train_data['days_from_max_date'] = (train_data['max_date'] - train_data['div_data']).dt.days
test_data['days_from_max_date'] = (test_data['max_date'] - test_data['div_data']).dt.days

train_data['days_from_max_date'] = train_data['days_from_max_date'].astype(int)
test_data['days_from_max_date'] = test_data['days_from_max_date'].astype(int)

train_data['weight'] = train_data['days_from_max_date'] / train_data['days_from_max_date'].max()
test_data['weight'] = test_data['days_from_max_date'] / test_data['days_from_max_date'].max()


In [28]:
for target in target_cols:
    train_data[target] = train_data[target] * train_data['weight']
    test_data[target] = test_data[target] * test_data['weight']

In [30]:
interaction_matrix = train_data[['client_id'] + target_cols]
interaction_matrix['client_id'] = client_enc.fit_transform(interaction_matrix['client_id'])

interaction_matrix = interaction_matrix.groupby('client_id').sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_matrix['client_id'] = client_enc.fit_transform(interaction_matrix['client_id'])


In [32]:
interaction_matrix.shape

(927690, 25)

In [33]:
acc_to_id = {acc: id for id, acc in enumerate(target_cols)}
id_to_acc = {id: acc for id, acc in enumerate(target_cols)}

In [34]:
interaction_matrix.shape

(927690, 25)

In [35]:
interaction_matrix['accounts'] = [[targets] for targets in interaction_matrix[target_cols].values]
interaction_matrix['accounts'] = interaction_matrix['accounts'].apply(lambda x: x[0])
interaction_matrix['accounts_name'] = [list(id_to_acc.keys())] * interaction_matrix.shape[0]

In [None]:
interaction_matrix = interaction_matrix[['client_id', 'accounts', 'accounts_name']]
del train_data


In [37]:
interaction_matrix.shape

(927690, 3)

In [38]:
# interaction_matrix = interaction_matrix[['client_id', 'accounts', 'accounts_name']].explode(['accounts', 'accounts_name'])

interaction_matrix_exploded = list()
interaction_matrix = interaction_matrix.values

for row in interaction_matrix:
    accounts = row[1]
    accounts_name = row[2]
    client_id = row[0]
    for account, account_name in zip(accounts, accounts_name):
        interaction_matrix_exploded.append((client_id, account, account_name))

interaction_matrix_exploded = pd.DataFrame(interaction_matrix_exploded, columns=['client_id', 'account', 'account_name'])
interaction_matrix_exploded.head()

Unnamed: 0,client_id,account,account_name
0,0,0.0,0
1,0,0.0,1
2,0,6.526027,2
3,0,0.0,3
4,0,0.0,4


In [87]:
scaler = MinMaxScaler()

In [89]:
interaction_matrix_exploded['account'] = scaler.fit_transform(interaction_matrix_exploded[['account']])

In [90]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    interaction_matrix_exploded["account"],
    (interaction_matrix_exploded['client_id'], interaction_matrix_exploded['account_name'])),
    dtype=np.int8)

In [91]:
als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  0%|          | 0/50 [00:00<?, ?it/s]

In [92]:
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

with open(os.path.join(MODELS_DIR, 'als_model.pkl'), 'wb') as f:
    pickle.dump(als_model, f)

## Model evaluation

In [50]:
client_enc.classes_

array([  15889,   15890,   15891, ..., 1528594, 1528597, 1528598])

In [73]:
id_to_acc

{0: 'acc_savings',
 1: 'acc_garant',
 2: 'acc_current',
 3: 'acc_derivative',
 4: 'acc_salary',
 5: 'acc_child',
 6: 'acc_spec3',
 7: 'acc_spec1',
 8: 'acc_spec2',
 9: 'acc_short_deposit',
 10: 'acc_middle_deposit',
 11: 'acc_long_deposit',
 12: 'acc_digital',
 13: 'acc_cash',
 14: 'acc_mortgage',
 15: 'acc_pension',
 16: 'acc_credit',
 17: 'acc_tax',
 18: 'acc_credit_cart',
 19: 'acc_securities',
 20: 'acc_home',
 21: 'acc_salary_payment',
 22: 'acc_pension_loans',
 23: 'acc_debit'}

In [None]:
test_clients = [client for client in test_data['client_id'].unique() if client in client_enc.classes_]
test_clients_encoded = client_enc.transform(test_clients)
recommendations = als_model.recommend(test_clients_encoded, user_item_matrix_train[test_clients_encoded], filter_already_liked_items=False, N=24)

In [82]:
recommendations_df = pd.DataFrame(
    {
        "client_id": test_clients_encoded,
        "recommendations": [list(el) for el in recommendations[0]],
        "scores": [list(el) for el in recommendations[1]]
    }
)
recommendations_df.head()

Unnamed: 0,client_id,recommendations,scores
0,748902,"[5, 1, 0, 2, 18, 13, 19, 23, 10, 11, 14, 8, 20...","[32.418354, 4.405583, 1.3980354, 1.0063871, 0...."
1,748905,"[5, 1, 2, 0, 18, 13, 19, 23, 10, 11, 14, 8, 16...","[65.14697, 3.6002274, 1.0016865, 0.519425, 0.4..."
2,748901,"[1, 0, 2, 18, 13, 19, 23, 10, 11, 8, 14, 20, 1...","[6.3429184, 2.8707366, 1.0043808, 0.48903415, ..."
3,748906,"[5, 1, 2, 18, 13, 19, 23, 10, 11, 14, 8, 20, 1...","[44.160313, 3.9788911, 1.0003324, 0.48993433, ..."
4,748903,"[5, 1, 0, 2, 18, 13, 19, 23, 10, 11, 14, 8, 16...","[28.635057, 6.1044326, 2.0895166, 1.0013279, 0..."


In [None]:
interaction_matrix_exploded[interaction_matrix_exploded['account'] != 0]

Unnamed: 0,client_id,account,account_name
0,0,0.0,0
1,0,0.0,1
2,0,6.526027,2
3,0,0.0,3
4,0,0.0,4


In [81]:
recommendations[0].shape

(914597, 24)

In [67]:
map(set,recommendations.tolist())

<map at 0x7f0670fd7820>

## Model improvement