In [1]:
import json
from typing import Tuple
from pathlib import Path
import pandas as pd
import numpy as np

# Предобработка данных

## Чтение данных

In [2]:
def read_raw_data(dir_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict]:

    ratings = pd.read_csv(Path(dir_path) / 'train_ratings.csv')
    trainsactions = pd.read_csv(Path(dir_path) / 'train_transactions.csv')
    bookmarks = pd.read_csv(Path(dir_path) / 'train_bookmarks.csv')

    with open(Path(dir_path) / 'catalogue.json', 'r') as f:
        meta_data = json.load(f)

    return trainsactions, ratings, bookmarks, meta_data  

# convert meta dict representation to pd.DataFrame with 
# parsing of attribute and availability value list
def create_availability_columns(meta: pd.DataFrame) -> pd.DataFrame:
    for availability in ['purchase', 'rent', 'subscription']:
        bynary_list = []
        for i in range(len(meta)):
            if availability in meta['availability'][i]:
                bynary_list.append(1)
            else:
                bynary_list.append(0)
        meta[availability] = bynary_list
    meta = meta.drop(columns='availability')
    return meta 

def create_attributes_columns(meta: pd.DataFrame) -> pd.DataFrame:
    max_length_attributes = 0
    for i in meta['attributes']:
        if max_length_attributes < len(i):
            max_length_attributes = len(i)
    
    for i in range(1, max_length_attributes + 1):
        attribute_list = []
        for j in meta['attributes']:
            if i <= len(j):
                attribute_list.append(j[i - 1])
            else:
                attribute_list.append(0)
        meta[f'attribute{i}'] = attribute_list
    meta = meta.drop(columns=['attributes'])
    return meta

def meta_to_df(meta: dict) -> pd.DataFrame:
    meta = pd.DataFrame.from_dict(meta).T
    meta = meta.reset_index(names='element_uid')
    meta = create_availability_columns(meta)
    meta = create_attributes_columns(meta)
    return meta 

In [3]:
trainsactions, ratings, bookmarks, meta = read_raw_data(dir_path='data')
meta = meta_to_df(meta)

## Исследование на пропущенные значения

In [4]:
print(f'Количество пропущенных значений матрицы trainsactions: {trainsactions.isna().sum().sum()}')
print(f'Количество пропущенных значений матрицы ratings: {ratings.isna().sum().sum()}')
print(f'Количество пропущенных значений матрицы bookmarks: {bookmarks.isna().sum().sum()}')
print(f'Количество пропущенных значений матрицы meta: {meta.isna().sum().sum()}')

Количество пропущенных значений матрицы trainsactions: 0
Количество пропущенных значений матрицы ratings: 0
Количество пропущенных значений матрицы bookmarks: 0
Количество пропущенных значений матрицы meta: 0


## Изменение типов данных матриц trainsactions, ratings, meta

### trainsactions

In [5]:
trainsactions.head()

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer
0,2570,408484,S,44304830.0,0,5,90
1,8522,408484,S,44304810.0,0,5,90
2,7642,428798,S,44304800.0,35,5,90
3,8330,428798,S,44304800.0,6350,5,90
4,8546,408484,S,44304770.0,0,5,90


In [6]:
trainsactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7075308 entries, 0 to 7075307
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   element_uid          int64  
 1   user_uid             int64  
 2   consumption_mode     object 
 3   ts                   float64
 4   watched_time         int64  
 5   device_type          int64  
 6   device_manufacturer  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 377.9+ MB


In [7]:
from sklearn.preprocessing import LabelEncoder

def consumption_mode_encoder(trainsactions: pd.DataFrame) -> pd.DataFrame:
    encoder = LabelEncoder()
    numeric_consumption_mode = encoder.fit_transform(trainsactions['consumption_mode'].values)
    trainsactions['consumption_mode'] = numeric_consumption_mode.astype(np.int8)
    return trainsactions

def change_trainsaction_dtype(trainsactions: pd.DataFrame) -> pd.DataFrame:
    trainsactions = consumption_mode_encoder(trainsactions)
    trainsactions['ts'] = trainsactions['ts'].astype(np.float32)
    trainsactions['watched_time'] = trainsactions['watched_time'].astype(np.int32)
    trainsactions[['device_type', 'device_manufacturer']] = trainsactions[['device_type', 'device_manufacturer']].astype(np.int8)
    return trainsactions

In [8]:
trainsactions = change_trainsaction_dtype(trainsactions)

### ratings

In [9]:
ratings.head()

Unnamed: 0,user_uid,element_uid,rating,ts
0,571252,3783,10,44080890.0
1,571252,5616,10,44012640.0
2,571252,2639,10,44010990.0
3,63140,2693,10,44218300.0
4,63140,9999,10,44066480.0


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319245 entries, 0 to 319244
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_uid     319245 non-null  int64  
 1   element_uid  319245 non-null  int64  
 2   rating       319245 non-null  int64  
 3   ts           319245 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 9.7 MB


In [11]:
def change_ratings_dtype(ratings: pd.DataFrame) -> pd.DataFrame:
    ratings['rating'] = ratings['rating'].astype(np.int8)
    ratings['ts'] = ratings['ts'].astype(np.float32)
    return ratings

In [12]:
ratings = change_ratings_dtype(ratings)

### meta

In [13]:
meta.head()

Unnamed: 0,element_uid,type,duration,feature_1,feature_2,feature_3,feature_4,feature_5,purchase,rent,...,attribute48,attribute49,attribute50,attribute51,attribute52,attribute53,attribute54,attribute55,attribute56,attribute57
0,1983,movie,140,1657223.396513,0.75361,39,1.119409,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,3783,movie,110,35565207.694893,0.766254,41,1.138604,0.654707,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5208,movie,90,13270676.52431,0.765425,27,1.131807,0.592716,1,1,...,0,0,0,0,0,0,0,0,0,0
3,9744,movie,120,21749917.409823,0.757874,26,1.133525,0.654707,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1912,movie,110,9212963.985682,0.759566,7,1.110127,0.654707,1,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10200 entries, 0 to 10199
Data columns (total 68 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   element_uid   10200 non-null  object
 1   type          10200 non-null  object
 2   duration      10200 non-null  object
 3   feature_1     10200 non-null  object
 4   feature_2     10200 non-null  object
 5   feature_3     10200 non-null  object
 6   feature_4     10200 non-null  object
 7   feature_5     10200 non-null  object
 8   purchase      10200 non-null  int64 
 9   rent          10200 non-null  int64 
 10  subscription  10200 non-null  int64 
 11  attribute1    10200 non-null  int64 
 12  attribute2    10200 non-null  int64 
 13  attribute3    10200 non-null  int64 
 14  attribute4    10200 non-null  int64 
 15  attribute5    10200 non-null  int64 
 16  attribute6    10200 non-null  int64 
 17  attribute7    10200 non-null  int64 
 18  attribute8    10200 non-null  int64 
 19  attr

In [15]:
def meta_type_encoder(meta: pd.DataFrame) -> pd.DataFrame:
    encoder = LabelEncoder()
    numeric_type = encoder.fit_transform(meta['type'].values)
    meta['type'] = numeric_type.astype(np.int8)
    return meta

def change_meta_dtype(meta: pd.DataFrame) -> pd.DataFrame:
    meta['element_uid'] = meta['element_uid'].astype(np.int64)
    meta = meta_type_encoder(meta)
    meta['duration'] = meta['duration'].astype(np.int16)
    meta[['feature_1', 'feature_2', 'feature_4', 'feature_5']] \
         = meta[['feature_1', 'feature_2', 'feature_4', 'feature_5']].astype(np.float32)
    meta['feature_3'] = meta['feature_3'].astype(np.int8)
    meta[['purchase', 'rent', 'subscription']] = meta[['purchase', 'rent', 'subscription']].astype(np.int8)
    meta.loc[:, 'attribute1':] = meta.loc[:, 'attribute1':].astype(np.int32)
    return meta

In [16]:
meta = change_meta_dtype(meta)

  meta.loc[:, 'attribute1':] = meta.loc[:, 'attribute1':].astype(np.int32)


In [17]:
trainsactions.to_parquet('preprocess_data/preprocess_trainsactions.parquet')
meta.to_parquet('preprocess_data/preprocess_meta.parquet')
ratings.to_parquet('preprocess_data/preprocess_ratings.parquet')

In [18]:
ratings.to_parquet('als_data/preprocess_ratings.parquet')
meta.to_parquet('als_data/preprocess_meta.parquet')

## Создание матриц item_features, user_features, trainsactions_train, trainsaction_test

### item_features

In [19]:
def add_meta(meta: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    item_features = (
        item_features
        .merge(
            meta,
            how='inner',
            on='element_uid'
        )
    )
    return item_features

def add_content_popularity(trainsactions: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    item_features = (
        item_features
        .merge(
            trainsactions.groupby('element_uid').size().reset_index(name='element_occurences'),
            how='left',
            on='element_uid'
        )
    )
    item_features['popularity'] = (
        (item_features['element_occurences'] / item_features['element_uid'].nunique()).astype(np.float32)
    )
    item_features = item_features.drop(columns=['element_occurences'])
    
    return item_features

def add_count_content_bookmark(bookmarks: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    bookmarks_per_item = (
        bookmarks
        .groupby('element_uid')
        .size()
        .reset_index(name='element_bookmark_count')
    )
    item_features = (
        item_features
        .merge(
            bookmarks_per_item[['element_uid', 'element_bookmark_count']],
            on='element_uid',
            how='left'
        )
    )
    item_features['element_bookmark_count'].fillna(0, inplace=True)
    item_features['element_bookmark_count'] = item_features['element_bookmark_count'].astype(np.int32)

    return item_features

def create_item_features(meta: pd.DataFrame, trainsactions: pd.DataFrame, bookmarks: pd.DataFrame) -> pd.DataFrame:
    item_features = pd.DataFrame(trainsactions['element_uid'].unique(), columns=['element_uid'])
    print("add meta")
    item_features = add_meta(meta, item_features)
    print("add content popularity")
    item_features = add_content_popularity(trainsactions, item_features)
    print("add count content bookmark")
    item_features = add_count_content_bookmark(bookmarks, item_features)
    return item_features


In [20]:
item_features = create_item_features(meta, trainsactions, bookmarks)

add meta
add content popularity
add count content bookmark


In [21]:
item_features.shape

(8254, 70)

In [22]:
item_features.to_parquet('cb_data/item_features.parquet')

### user_features

In [23]:
def add_favorite_device_type(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    favorite_device_type_per_user = (
        trainsactions
        .groupby(['user_uid', 'device_type'])
        .size()
        .groupby('user_uid')
        .idxmax()
    )
    favorite_device_type_per_user_df = pd.DataFrame([[i[0], i[1]] for i in favorite_device_type_per_user], 
                                                    columns=['user_uid', 'favorite_device_type'])
    user_features = (
        user_features
        .merge(
            favorite_device_type_per_user_df,
            how='left',
            on='user_uid'
        )
    )

    return user_features

def add_favorite_consumption_mode(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    favorite_consumption_mode_per_user = (
        trainsactions
        .groupby(['user_uid', 'consumption_mode'])
        .size()
        .groupby('user_uid')
        .idxmax()
    )
    favorite_consumption_mode_per_user_df = pd.DataFrame([[i[0], i[1]] for i in favorite_consumption_mode_per_user], 
                                                        columns=['user_uid', 'favorite_consumption_mode'])
    
    user_features = (
        user_features
        .merge(
            favorite_consumption_mode_per_user_df,
            how='left',
            on='user_uid'
        )
    )

    return user_features

def add_amount_watched_item(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    user_features = (
        user_features
        .merge(
            trainsactions.groupby('user_uid').size().reset_index(name='user_watch_count'),
            on='user_uid',
            how='left'
        )
    )

    return user_features

def add_mean_watched_time(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame: 
    user_watch_time_mean = (
        trainsactions
        .groupby('user_uid')['watched_time']
        .mean()
        .reset_index(name='user_watch_time_mean')
    )

    user_features = (
        user_features
        .merge(
            user_watch_time_mean,
            on='user_uid',
            how='left'
        )
    )

    return user_features

def change_dtype(user_features: pd.DataFrame) -> pd.DataFrame:
    user_features[['favorite_device_type', 'favorite_consumption_mode']] = \
        user_features[['favorite_device_type', 'favorite_consumption_mode']].astype(np.int8)
    user_features['user_watch_count'] = user_features['user_watch_count'].astype(np.int32)
    user_features['user_watch_time_mean'] = user_features['user_watch_time_mean'].astype(np.float32)

    return user_features
    

def create_user_features(trainsactions: pd.DataFrame) -> pd.DataFrame:
    user_features = pd.DataFrame(trainsactions['user_uid'].unique(), columns=['user_uid'])
    print("add favorite device type")
    user_features = add_favorite_device_type(trainsactions, user_features)
    print("add favorite consumption mode")
    user_features = add_favorite_consumption_mode(trainsactions, user_features)
    print("add amount watched item")
    user_features = add_amount_watched_item(trainsactions, user_features)
    print("add mean watched time")
    user_features = add_mean_watched_time(trainsactions, user_features)
    print("change dtype")
    user_features = change_dtype(user_features)

    return user_features

In [24]:
user_features = create_user_features(trainsactions)

add favorite device type
add favorite consumption mode
add amount watched item
add mean watched time
change dtype


In [25]:
user_features.shape

(254849, 5)

In [26]:
user_features.to_parquet('cb_data/user_features.parquet')

### trainsactions_train и trainsactions_test

In [27]:
trainsactions.shape

(7075308, 7)

In [28]:
def train_test_split(frame: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    frame['user_rank'] = (
        frame
        .groupby('user_uid')['ts']
        .rank('first', ascending=False)
        .astype('int32')
    )
    train = (
        frame
        .query('user_rank > 2')
        .drop(columns=['user_rank'])
    )
    test = (
        frame
        .query('user_rank <= 2')
        .drop(columns=['user_rank'])
    )
    frame.drop(columns=['user_rank'], inplace=True)
    return train, test

In [29]:
train_trainsactions, test_trainsactions = train_test_split(trainsactions)

In [30]:
print(f"train shapes: {train_trainsactions.shape}")
print(f"test shapes: {test_trainsactions.shape}")

train shapes: (6565610, 7)
test shapes: (509698, 7)


In [31]:
train_trainsactions.to_parquet('als_data/als_train.parquet')
test_trainsactions.to_parquet('als_data/als_test.parquet')