In [1]:
import os
import pandas as pd
import numpy as np

from src.config import Config

In [2]:
def interactions_object_listing(interactions: pd.Series, objects: pd.Series) -> str:
    """Summarise how object ids overlap between an interaction log and a feature table.

    Args:
        interactions: Object ids as they appear in the interactions (values may repeat).
        objects: Object ids that have a row in the feature table.

    Returns:
        A four-line report giving the count and percentage of ids found in both
        sources, only in the interactions, and only in the feature table.
        When there are no ids at all, every share is reported as 0.00% instead
        of raising ``ZeroDivisionError``.
    """
    interactions_unique = interactions.unique()
    common_objects = len(np.intersect1d(interactions_unique, objects))
    objects_only_in_interactions = len(np.setdiff1d(interactions_unique, objects))
    objects_only_with_features = len(np.setdiff1d(objects, interactions_unique))
    # The three groups are disjoint, so their sum is the size of the union of ids.
    total_objects = common_objects + objects_only_in_interactions + objects_only_with_features

    def share(count: int) -> float:
        # Guard the empty case: with zero ids there is nothing to divide by.
        return count / total_objects * 100 if total_objects else 0.0

    return (
        f"Total: {total_objects} (100%)"
        f"\nObjects with features and interactions: {common_objects} ({share(common_objects):.2f}%)"
        f"\nObjects with interactions only: {objects_only_in_interactions} ({share(objects_only_in_interactions):.2f}%)"
        f"\nOnly with features only: {objects_only_with_features} ({share(objects_only_with_features):.2f}%)"
    )

In [3]:
# Read the three raw CSV tables (items, users, interactions) from the configured paths.
items_df, users_df, interactions_df = (
    pd.read_csv(raw_path)
    for raw_path in (
        Config.RAW_ITEMS_PATH,
        Config.RAW_USERS_PATH,
        Config.RAW_INTERACTIONS_PATH,
    )
)

# Interactions preprocessing

In [4]:
# Column dtypes and non-null counts of the raw interactions, before any cleaning.
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1533078 entries, 0 to 1533077
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   user_id     1533078 non-null  int64  
 1   item_id     1533078 non-null  int64  
 2   progress    1533078 non-null  int64  
 3   rating      285356 non-null   float64
 4   start_date  1533078 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 58.5+ MB


In [5]:
# Row count of the raw interactions, as a baseline before duplicates are handled.
interactions_df.shape

(1533078, 5)

In [6]:
# start_date is read from CSV as text; parse it to datetime so it can be
# sorted and aggregated chronologically.
interactions_df = interactions_df.assign(
    start_date=pd.to_datetime(interactions_df["start_date"])
)

In [7]:
# Mark every row whose (user_id, item_id) pair occurs more than once;
# keep=False flags all copies of a pair, not just the repeats.
duplicates = interactions_df.duplicated(subset=['user_id', 'item_id'], keep=False)
# Set the flagged rows aside, ordered per user by start date ...
df_duplicates = interactions_df.loc[duplicates].sort_values(by=['user_id', 'start_date'])
# ... and keep only the pairs that were unique to begin with.
interactions_df = interactions_df.loc[~duplicates]

In [8]:
# Row count after every row of a duplicated (user_id, item_id) pair has been set aside.
interactions_df.shape

(1532918, 5)

In [9]:
# Collapse each duplicated (user_id, item_id) pair into a single row: keep the
# highest progress, the highest rating (missing ratings are skipped by `max`)
# and the earliest start date.
df_duplicates = df_duplicates.groupby(['user_id', 'item_id']).agg({
    'progress': 'max',
    'rating': 'max',
    'start_date': 'min'
})
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent and yields the same frame.
interactions_df = pd.concat(
    [interactions_df, df_duplicates.reset_index()],
    ignore_index=True,
)
# Re-running this cell would add the merged rows a second time; fail loudly
# instead of silently re-introducing duplicates.
assert not interactions_df.duplicated(subset=['user_id', 'item_id']).any(), \
    "duplicate (user_id, item_id) pairs present - restart the kernel and run all cells in order"

  interactions_df = interactions_df.append(df_duplicates.reset_index(), ignore_index=True)


In [10]:
# Shrink the frame: progress goes to int8 (assumes values fit in -128..127,
# e.g. a 0-100 percentage - TODO confirm), and the mostly-missing rating column
# becomes sparse float32 with NaN as the fill value.
interactions_df = interactions_df.astype({
    'progress': np.int8,
    'rating': pd.SparseDtype(np.float32, np.nan),
})

In [11]:
# Re-check dtypes and memory footprint after merging duplicates and downcasting columns.
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532998 entries, 0 to 1532997
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype               
---  ------      --------------    -----               
 0   user_id     1532998 non-null  int64               
 1   item_id     1532998 non-null  int64               
 2   progress    1532998 non-null  int8                
 3   rating      285355 non-null   Sparse[float32, nan]
 4   start_date  1532998 non-null  datetime64[ns]      
dtypes: Sparse[float32, nan](1), datetime64[ns](1), int64(2), int8(1)
memory usage: 38.7 MB


In [12]:
# Save the cleaned interactions to the configured path in pickle format.
interactions_df.to_pickle(Config.PREPROCESSED_INTERACTIONS_PATH)

# User preprocessing

In [13]:
# Peek at the first rows of the raw users table.
users_df.head()

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0


In [14]:
# Column dtypes and non-null counts of the raw users table.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142888 entries, 0 to 142887
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  142888 non-null  int64  
 1   age      142742 non-null  object 
 2   sex      136626 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.3+ MB


In [15]:
# Distinct values per column (missing values are not counted by default);
# low counts point to columns worth storing as categorical/sparse.
users_df.nunique()

user_id    142888
age             6
sex             2
dtype: int64

In [16]:
# Compact the user features: `age` holds only a few distinct bucket labels, so
# a categorical is enough; `sex` becomes sparse float32 with NaN as the fill value.
users_df = users_df.astype({
    'age': 'category',
    'sex': pd.SparseDtype(np.float32, np.nan),
})

In [17]:
# Confirm the new dtypes and the resulting memory footprint.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142888 entries, 0 to 142887
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype               
---  ------   --------------   -----               
 0   user_id  142888 non-null  int64               
 1   age      142742 non-null  category            
 2   sex      136626 non-null  Sparse[float32, nan]
dtypes: Sparse[float32, nan](1), category(1), int64(1)
memory usage: 2.3 MB


In [18]:
# Report how many user ids appear in both the interactions and the users table,
# and how many appear in only one of them. print() is needed so the newlines
# in the returned string are rendered.
print(interactions_object_listing(interactions_df["user_id"], users_df["user_id"]))

Total: 158811 (100%)
Objects with features and interactions: 135677 (85.43%)
Objects with interactions only: 15923 (10.03%)
Only with features only: 7211 (4.54%)


In [19]:
# Save the typed users table to the configured path in pickle format.
users_df.to_pickle(Config.PREPROCESSED_USERS_PATH)

# Items preprocessing

In [20]:
# Peek at the first rows of the raw items table.
items_df.head()

Unnamed: 0,id,title,genres,authors,year
0,128115,Ворон-челобитчик,"Зарубежные детские книги,Сказки,Зарубежная кла...",Михаил Салтыков-Щедрин,1886
1,210979,Скрипка Ротшильда,"Классическая проза,Литература 19 века,Русская ...",Антон Чехов,1894
2,95632,Испорченные дети,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1869
3,247906,Странный человек,"Пьесы и драматургия,Литература 19 века",Михаил Лермонтов,1831
4,294280,Господа ташкентцы,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1873


In [21]:
# Column dtypes and non-null counts of the raw items table.
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59599 entries, 0 to 59598
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       59599 non-null  int64 
 1   title    59599 non-null  object
 2   genres   59568 non-null  object
 3   authors  52714 non-null  object
 4   year     46720 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.3+ MB


In [22]:
def num_bytes_format(num_bytes, float_prec=4):
    """Render a byte count as a human-readable string using decimal (x1000) steps.

    Args:
        num_bytes: Size in bytes; may be negative.
        float_prec: Number of digits printed after the decimal point.

    Returns:
        The scaled value followed by its unit, e.g. ``'28.1660 Mb'``.
    """
    units = ['bytes', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb', 'Eb']
    for unit in units[:-1]:
        # Stop at the first unit in which the magnitude drops below 1000.
        if abs(num_bytes) < 1000:
            return f'{num_bytes:.{float_prec}f} {unit}'
        num_bytes /= 1000
    # Still >= 1000 after the last division: report in the largest unit,
    # honouring float_prec (previously hard-coded to 4 digits here).
    return f'{num_bytes:.{float_prec}f} {units[-1]}'

In [23]:
# Memory footprint of the raw items table; deep=True also counts the contents
# of object (string) columns rather than just the pointers.
num_bytes = items_df.memory_usage(deep=True).sum()
num_bytes_format(num_bytes)

'28.1660 Mb'

In [24]:
# Distinct values per column (missing values are not counted by default).
items_df.nunique()

id         59599
title      57358
genres     10769
authors    17265
year        1053
dtype: int64

In [25]:
# Frequency of each distinct `year` string (the column is loaded as object dtype, not as a number).
items_df['year'].value_counts()

2018                   5046
2017                   4505
2019                   4504
2016                   3704
2015                   3419
                       ... 
1957, 1966, 1970          1
1932, 1976                1
1987, 1989                1
1929, 1931                1
1965,1966,1967,1968       1
Name: year, Length: 1053, dtype: int64

In [26]:
# Store the repeated text columns as categoricals to cut memory usage.
items_df = items_df.astype(dict.fromkeys(['genres', 'authors', 'year'], 'category'))

In [27]:
# Memory footprint again, after the category conversion, for comparison with the raw table.
num_bytes = items_df.memory_usage(deep=True).sum()
num_bytes_format(num_bytes)

'17.7194 Mb'

In [28]:
# Report how many item ids appear in both the interactions and the items table,
# and how many appear in only one of them.
print(interactions_object_listing(interactions_df["item_id"], items_df["id"]))

Total: 59599 (100%)
Objects with features and interactions: 59599 (100.00%)
Objects with interactions only: 0 (0.00%)
Only with features only: 0 (0.00%)


In [29]:
# Save the typed items table to the configured path in pickle format.
items_df.to_pickle(Config.PREPROCESSED_ITEMS_PATH)