In [1]:
from typing import Tuple
from pathlib import Path
import pandas as pd
import numpy as np

# Обучение ALS

## Предобработка данных

In [2]:
def read_data(dir_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    als_train = pd.read_parquet(Path(dir_path) / 'als_train.parquet')
    als_test = pd.read_parquet(Path(dir_path) / 'als_test.parquet')
    ratings = pd.read_parquet(Path(dir_path) / 'preprocess_ratings.parquet')
    meta = pd.read_parquet(Path(dir_path) / 'preprocess_meta.parquet')

    return als_train, als_test, ratings, meta 

In [3]:
als_train, als_test, ratings, meta = read_data('als_data')

In [4]:
als_train.shape

(6565610, 7)

### Аппроксимация рейтингов пользователей

In [5]:
def score_transaction(t):
    score = 1
    if t['duration'] > 0:
        if t['type'] == 0:
            if t['watched_ratio'] > 0.3:
                score = int(t['watched_ratio'] * 9) + 1
        else:
            if t['watched_ratio'] > 1:
                score = int(t['watched_ratio'] / 2) + 1
    return min(score, 10)

def trainsactions_ratings_approximation(trainsactions: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    trainsactions_with_meta = (
        trainsactions
        .merge(meta[[
            'element_uid', 
            'duration',
            'type',
        ]], on='element_uid', how='left')
    )
    trainsactions_with_meta['watched_ratio'] = (
        trainsactions_with_meta['watched_time'] / 
        trainsactions_with_meta['duration']
    )
    print("score trainsaction")
    trainsactions_with_meta['score'] = trainsactions_with_meta.apply(score_transaction, axis=1)
    
    return trainsactions_with_meta    

In [6]:
als_train_score = trainsactions_ratings_approximation(als_train, meta)

score trainsaction


### Приведение матрицы als_train к виду user_uid, element_uid, score

In [7]:
def score_matrix(als_train_score: pd.DataFrame) -> pd.DataFrame:
    return als_train_score[['element_uid', 'user_uid', 'score']]

In [8]:
als_train_score = score_matrix(als_train_score)

### Соединение матриц als_train_score и ratings

In [9]:
def merge_trainsactions_ratings(trainsactions_ratings: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
    trainsactions_ratings = (
        trainsactions_ratings
        .merge(
            ratings[['user_uid', 'element_uid', 'rating']],
            on=['user_uid', 'element_uid'],
            how='outer'
        )
    )

    trainsactions_ratings['score'].fillna(
        trainsactions_ratings['rating'], inplace=True)
    
    return trainsactions_ratings

In [10]:
als_train_score_rating = merge_trainsactions_ratings(als_train_score, ratings)

### Изменение типа данных матрицы als_train_score_rating

In [11]:
als_train_score_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6636716 entries, 0 to 6636715
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   element_uid  int64  
 1   user_uid     int64  
 2   score        float64
 3   rating       float64
dtypes: float64(2), int64(2)
memory usage: 253.2 MB


In [12]:
def change_als_train_dtype(als_train: pd.DataFrame) -> pd.DataFrame:
    als_train['score'] = als_train['score'].astype(np.int8)
    als_train.drop(columns='rating', inplace=True)
    return als_train

In [13]:
als_train_score_rating = change_als_train_dtype(als_train_score_rating)

In [14]:
als_train_score_rating.shape

(6636716, 3)

### Применение TF_IDF encoder для score

In [15]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

def encode_tfidf_coo(als_train: pd.DataFrame) -> pd.DataFrame:
    score_sum_per_user = (
        als_train
        .groupby('user_uid')['score']
        .transform('sum')
    )
    user_count_per_element = (
        als_train
        .groupby('element_uid')['user_uid']
        .transform('size')
    )
    tf = als_train['score'].values / score_sum_per_user.values
    idf = np.log(len(score_sum_per_user) / user_count_per_element.values)
    
    tfidf = als_train[['user_uid', 'element_uid']].copy()
    tfidf['value'] = tf * idf

    return tfidf

def encode_tfidf(als_train: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    tfidf = encode_tfidf_coo(als_train)
    
    n_users = tfidf['user_uid'].nunique()
    n_elements = tfidf['element_uid'].nunique()

    user_encoder = LabelEncoder()
    element_encoder = LabelEncoder()
    user_index = user_encoder.fit_transform(als_train['user_uid'].values)
    element_index = element_encoder.fit_transform(als_train['element_uid'].values)

    tfidf_csr = csr_matrix(
        (
            tfidf['value'].astype('float32').values,
            (user_index, element_index)
        ),
        shape=(n_users, n_elements)
    )

    return user_encoder, element_encoder, tfidf_csr


In [16]:
user_encoder, element_encoder, als_train_csr = encode_tfidf(als_train_score_rating)

In [17]:
als_train_csr.shape

(254849, 8545)

In [18]:
print(f'sparsity = {(1 - als_train_csr.nnz / np.prod(als_train_csr.shape)) * 100:.4f}%') 

sparsity = 99.6952%


## Обучение ALS и получение рекомендаций

### Обучение

In [19]:
from implicit.als import AlternatingLeastSquares

als = AlternatingLeastSquares(factors=128, iterations=30, calculate_training_loss=True)
als.fit(als_train_csr)

  0%|          | 0/30 [00:00<?, ?it/s]

### Получение рекомендаций

In [28]:
recommendations_matrix, recommendations_scores = als.recommend(
    np.arange(0, als_train_csr.shape[0]), 
    als_train_csr, 
    N=50, 
    filter_already_liked_items=True
)

In [29]:
recommendations_matrix

array([[1375,  506, 4726, ..., 6473, 5062, 6925],
       [2236, 1305, 5200, ..., 4150, 1521, 6474],
       [5112, 1105, 4789, ..., 6001, 3774, 3838],
       ...,
       [2094, 3255, 8115, ..., 3931, 2785, 4629],
       [  55, 1140, 2374, ..., 2036, 7464, 5277],
       [3639, 6780, 2571, ...,  608, 4596, 1340]], dtype=int32)

In [30]:
recommendations_scores.shape

(254849, 50)

In [31]:
def als_recommendations_to_df(
    recommendations_matrix: np.ndarray, 
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder, 
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    recommendations_indices = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    user_mapping = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })

    item_mapping = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    recommendations = (
        recommendations_indices
        .merge(
            user_mapping,
            on='user_index',
            how='left',
        )
        .drop(columns=['user_index'])
        .explode(['item_index', 'score'], ignore_index=True)
        .merge(
            item_mapping,
            on='item_index',
            how='left',
        )
        .drop(columns=['item_index'])
    )

    return recommendations

In [32]:
recommendations = als_recommendations_to_df(
    recommendations_matrix,
    recommendations_scores,
    user_encoder,
    element_encoder,
    user_key='user_uid',
    item_key='element_uid',
)

In [33]:
recommendations.shape

(12742450, 3)

In [34]:
recommendations.to_parquet('cb_data/recommendations.parquet')