In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime as dt
import boto3
from io import BytesIO
import pickle
import os
from dotenv import load_dotenv
import scipy
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import MinMaxScaler

plt.style.use('ggplot')



In [2]:
load_dotenv()

S3_BUCKET_NAME=os.getenv("S3_BUCKET_NAME")
AWS_ACCESS_KEY_ID=os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY=os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
# определение функций для работы с s3
def upload_to_s3(df, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    
    with BytesIO() as buffer:
        df.to_parquet(buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
    
    
def download_from_s3(file_name) -> pd.DataFrame:
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pd.read_parquet(buffer)
    
    
def upload_pickle_to_s3(object, file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        pickle.dump(object, buffer)
        buffer.seek(0)
        s3_client.upload_fileobj(buffer, S3_BUCKET_NAME, file_name)
        
def download_pickle_from_s3(file_name):
    s3_client = boto3.client(
        "s3",
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    with BytesIO() as buffer:
        s3_client.download_fileobj(S3_BUCKET_NAME, file_name, buffer)
        buffer.seek(0)
        return pickle.load(buffer)

## Выгрузка данных

In [4]:
data = download_from_s3('bank_products_processed.parquet')

In [5]:
split_date = '2016-01-28'

train_data = data[data['div_data'] <= split_date]
test_data = data[data['div_data'] > split_date]


In [6]:
target_cols = [col for col in train_data.columns if col.startswith('acc_')]

## Baseline model  
Первый вариант модели с использованием только данных о взаимодействии клиентов с объектами

In [7]:
client_enc = LabelEncoder()

In [8]:
train_data['max_date'] = train_data['div_data'].max()
test_data['max_date'] = test_data['div_data'].max()

train_data['days_from_max_date'] = (train_data['max_date'] - train_data['div_data']).dt.days
test_data['days_from_max_date'] = (test_data['max_date'] - test_data['div_data']).dt.days

train_data['days_from_max_date'] = train_data['days_from_max_date'].astype(int)
test_data['days_from_max_date'] = test_data['days_from_max_date'].astype(int)

train_data['weight'] = train_data['days_from_max_date'] / train_data['days_from_max_date'].max()
test_data['weight'] = test_data['days_from_max_date'] / test_data['days_from_max_date'].max()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['max_date'] = train_data['div_data'].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['max_date'] = test_data['div_data'].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['days_from_max_date'] = (train_data['max_date'] - train_data['div_data']).dt.days
A valu

In [9]:
for target in target_cols:
    train_data[target] = train_data[target] * train_data['weight']
    test_data[target] = test_data[target] * test_data['weight']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[target] = train_data[target] * train_data['weight']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[target] = test_data[target] * test_data['weight']


In [10]:
interaction_matrix = train_data[['client_id'] + target_cols]
interaction_matrix['client_id'] = client_enc.fit_transform(interaction_matrix['client_id'])

interaction_matrix = interaction_matrix.groupby('client_id').sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_matrix['client_id'] = client_enc.fit_transform(interaction_matrix['client_id'])


In [11]:
interaction_matrix.shape

(927690, 25)

In [12]:
acc_to_id = {acc: id for id, acc in enumerate(target_cols)}
id_to_acc = {id: acc for id, acc in enumerate(target_cols)}

In [13]:
interaction_matrix.shape

(927690, 25)

In [14]:
interaction_matrix['accounts'] = [[targets] for targets in interaction_matrix[target_cols].values]
interaction_matrix['accounts'] = interaction_matrix['accounts'].apply(lambda x: x[0])
interaction_matrix['accounts_name'] = [list(id_to_acc.keys())] * interaction_matrix.shape[0]

In [15]:
interaction_matrix = interaction_matrix[['client_id', 'accounts', 'accounts_name']]
del train_data


In [16]:
interaction_matrix.shape

(927690, 3)

In [17]:
# interaction_matrix = interaction_matrix[['client_id', 'accounts', 'accounts_name']].explode(['accounts', 'accounts_name'])

interaction_matrix_exploded = list()
interaction_matrix = interaction_matrix.values

for row in interaction_matrix:
    accounts = row[1]
    accounts_name = row[2]
    client_id = row[0]
    for account, account_name in zip(accounts, accounts_name):
        interaction_matrix_exploded.append((client_id, account, account_name))

interaction_matrix_exploded = pd.DataFrame(interaction_matrix_exploded, columns=['client_id', 'account', 'account_name'])
interaction_matrix_exploded = interaction_matrix_exploded[interaction_matrix_exploded['account'] > 0]
interaction_matrix_exploded.head()

Unnamed: 0,client_id,account,account_name
2,0,6.860274,2
8,0,6.860274,8
18,0,2.846575,18
19,0,6.860274,19
28,1,6.860274,4


In [18]:
# interaction_matrix_exploded['account'] = scaler.fit_transform(interaction_matrix_exploded[['account']])

In [19]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    interaction_matrix_exploded["account"],
    (interaction_matrix_exploded['client_id'], interaction_matrix_exploded['account_name'])),
    dtype=np.float32)

In [20]:
als_model = AlternatingLeastSquares(factors=50, iterations=25, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  check_blas_config()


  0%|          | 0/25 [00:00<?, ?it/s]

In [21]:
upload_pickle_to_s3(als_model, 'als_model.pkl')

## Model evaluation

In [22]:
als_model = download_pickle_from_s3('als_model.pkl')

In [50]:
client_enc.classes_

array([  15889,   15890,   15891, ..., 1528594, 1528597, 1528598])

In [24]:
test_clients = [client for client in test_data['client_id'].unique() if client in client_enc.classes_]
test_clients_encoded = client_enc.transform(test_clients)
recommendations = als_model.recommend(test_clients_encoded, user_item_matrix_train[test_clients_encoded], filter_already_liked_items=True, N=24)

In [33]:
test_clients_encoded

array([748902, 748905, 748901, ..., 592149, 593074, 587986])

In [35]:
user_item_matrix_train

<927690x24 sparse matrix of type '<class 'numpy.int8'>'
	with 22264560 stored elements in Compressed Sparse Row format>

In [30]:
test_data[target_cols + ['client_id']].sample(10)

Unnamed: 0,acc_savings,acc_garant,acc_current,acc_derivative,acc_salary,acc_child,acc_spec3,acc_spec1,acc_spec2,acc_short_deposit,...,acc_pension,acc_credit,acc_tax,acc_credit_cart,acc_securities,acc_home,acc_salary_payment,acc_pension_loans,acc_debit,client_id
14211719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1419836
12819568,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,745242
13833938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,191870
11371919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,237000
11522536,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1157116
12164721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.677778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,812730
11384690,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1058972
11763291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1512707
12006280,0.0,0.0,0.677778,0.0,0.0,0.0,0.0,0.677778,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29946
12162837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,816101


In [45]:
client_enc.inverse_transform([748902])[0]

1319846

In [31]:
client_enc.transform([29946])

array([6871])

In [50]:
train_data.shape

(10779253, 48)

In [32]:
als_model.recommend(6871, user_item_matrix_train[6871], filter_already_liked_items=True, N=24)

(array([ 0, 21, 22,  4, 20,  1, 11, 10, 18, 19, 13, 15,  5, 17, 16, 14, 23,
         6,  9, 12,  8,  3,  7,  2], dtype=int32),
 array([ 1.6334779e-03,  1.1728741e-03,  1.0560155e-03,  9.1965683e-04,
         7.7000633e-04,  6.7028869e-04,  4.9938075e-04,  4.0197000e-04,
         3.8554519e-04,  3.0330755e-04,  1.3864134e-04,  1.9760802e-05,
        -1.3917685e-05, -2.3150817e-04, -2.4284050e-04, -3.6528986e-04,
        -4.0830672e-04, -5.0593168e-04, -6.7150220e-04, -6.8253092e-04,
        -7.0172921e-04, -2.0806175e-03, -3.4028235e+38, -3.4028235e+38],
       dtype=float32))

In [28]:

als_model.recommend(748901, user_item_matrix_train[748901], filter_already_liked_items=True, N=24)

(array([21,  4, 22, 23, 18, 12,  3, 17,  6, 14, 11,  5, 16,  9, 15, 19,  8,
        20, 10, 13,  1,  0,  7,  2], dtype=int32),
 array([ 2.48042867e-03,  1.98100135e-03,  1.77556463e-03,  1.65462308e-03,
         8.83758068e-04,  6.52389601e-04,  3.30764800e-04,  2.34103296e-04,
         1.39936805e-04,  9.44328494e-05,  7.65947625e-05,  4.49977815e-05,
         6.88806176e-06, -1.46999955e-05, -8.02073628e-05, -8.70133517e-05,
        -9.16812569e-05, -1.14681199e-04, -1.63521618e-04, -2.12425366e-04,
        -2.80889974e-04, -3.28164781e-04, -5.95470890e-04, -3.40282347e+38],
       dtype=float32))

In [25]:
recommendations_df = pd.DataFrame(
    {
        "client_id": test_clients_encoded,
        "recommendations": [list(el) for el in recommendations[0]],
        "scores": [list(el) for el in recommendations[1]]
    }
)
recommendations_df.head()

Unnamed: 0,client_id,recommendations,scores
0,748902,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
1,748905,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
2,748901,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
3,748906,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
4,748903,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."


In [26]:
recommendations_df.sample(10)

Unnamed: 0,client_id,recommendations,scores
440030,48902,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
676672,637583,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
753506,381427,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
101944,717325,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
485078,6845,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
622700,557410,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
138437,881690,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
422588,274712,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
815494,360279,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."
554601,98704,"[23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 1...","[-3.4028235e+38, -3.4028235e+38, -3.4028235e+3..."


In [None]:
interaction_matrix_exploded[interaction_matrix_exploded['account'] != 0]

Unnamed: 0,client_id,account,account_name
0,0,0.0,0
1,0,0.0,1
2,0,6.526027,2
3,0,0.0,3
4,0,0.0,4


In [81]:
recommendations[0].shape

(914597, 24)

In [67]:
map(set,recommendations.tolist())

<map at 0x7f0670fd7820>

## Model improvement