In [3]:
import os
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from tqdm.auto import tqdm
from datetime import datetime

### class TimeRangeSplit()

In [4]:
class TimeRangeSplit():
    """
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html
    """
    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='D', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 closed=None, 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be non-zero, not both at the same time.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        self.date_range = pd.date_range(
            start=start_date, 
            end=end_date, 
            freq=freq, 
            periods=periods, 
            tz=tz, 
            normalize=normalize, 
            closed=closed)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.") 

    def split(self, 
              df, 
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        for start, end in pairwise(date_range):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                new = np.setdiff1d(
                    df.loc[test_idx, user_column].unique(), 
                    df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                # test_mask = rd.df.index.isin(test_idx)
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

### class PopularRecommender()

In [5]:
class PopularRecommender():
    def __init__(self, max_K=100, days=10, item_column='item_id', dt_column='date'):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        self.recommendations = []
        
    def fit(self, df, ):
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values
    
    def recommend(self, users=None, N=10):
        recs = self.recommendations[:N]
        if users is None:
            return recs
        else:
            return list(islice(cycle([recs]), len(users)))

In [6]:
data_inter = pd.read_csv('input/interactions_preprocessed.csv')
data_users = pd.read_csv('input/users_preprocessed.csv')
data_items = pd.read_csv('input/items_preprocessed.csv')
data_inter['start_date'] = pd.to_datetime(data_inter['start_date'])

In [7]:
data_inter

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,90133,82910,100,,2018-01-01
1,159130,331068,70,,2018-01-01
2,80061,26540,69,4.0,2018-01-01
3,12811,301895,16,,2018-01-01
4,5778,127872,100,,2018-01-01
...,...,...,...,...,...
1562500,160045,291585,85,,2018-02-25
1562501,161224,7819,58,,2018-04-25
1562502,163719,27040,99,,2018-05-11
1562503,165754,303933,42,0.0,2018-08-09


In [11]:
data_test2 = pd.read_csv('sample_submission2.csv')
data_test2

Unnamed: 0,user_id,item_id
0,10001,313 253 245 205 187 168 155 149 146 128 128 12...
1,10002,313 253 245 205 187 168 155 149 146 128 128 12...
2,100152,313 253 245 205 187 168 155 149 146 128 128 12...
3,100197,313 253 245 205 187 168 155 149 146 128 128 12...
4,100284,313 253 245 205 187 168 155 149 146 128 128 12...
...,...,...
3069,99734,313 253 245 205 187 168 155 149 146 128 128 12...
3070,99757,313 253 245 205 187 168 155 149 146 128 128 12...
3071,99772,313 253 245 205 187 168 155 149 146 128 128 12...
3072,99827,313 253 245 205 187 168 155 149 146 128 128 12...


In [12]:
pop_model = PopularRecommender(dt_column='start_date')# days=7,    
top_N = 10
pop_model.fit(data_inter)
recs = pd.DataFrame({'Id': data_test2['user_id'].unique()})    
recs['Predicted'] = pop_model.recommend(recs['Id'], N=top_N)
recs

Unnamed: 0,Id,Predicted
0,10001,"[283713, 276903, 184549, 168963, 55466, 143175..."
1,10002,"[283713, 276903, 184549, 168963, 55466, 143175..."
2,100152,"[283713, 276903, 184549, 168963, 55466, 143175..."
3,100197,"[283713, 276903, 184549, 168963, 55466, 143175..."
4,100284,"[283713, 276903, 184549, 168963, 55466, 143175..."
...,...,...
3069,99734,"[283713, 276903, 184549, 168963, 55466, 143175..."
3070,99757,"[283713, 276903, 184549, 168963, 55466, 143175..."
3071,99772,"[283713, 276903, 184549, 168963, 55466, 143175..."
3072,99827,"[283713, 276903, 184549, 168963, 55466, 143175..."


In [67]:
big_data = data_inter.merge(data_users) # Соединяем таблицы в одну по столбцу user_id

In [68]:
data_test2 = pd.read_csv('sample_submission2.csv')
big_data_test = data_test2.merge(data_users)

# Тренировка модели

In [69]:
def recomend1(age):    
    pop_model = PopularRecommender(dt_column='start_date')# days=7,    
    top_N = 10
    pop_model.fit(big_data[big_data['age'] == age])
    recs = pd.DataFrame({'Id': big_data_test['user_id'][big_data['age'] == age].unique()})    
    recs['Predicted'] = pop_model.recommend(recs['Id'], N=top_N)
    return recs

### Age == '55_64', '35_44', '25_34', '45_54', '18_24', '65_inf'

In [70]:
all_recomend = pd.DataFrame()
s = 0
for ages in ['55_64', '35_44', '25_34', '45_54', '18_24', '65_inf']: #, big_data['age'][86356]       
    new_recs = recomend1(ages)
    all_recomend = all_recomend.append(new_recs)
all_recomend

Unnamed: 0,Id,Predicted
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
2,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,10031,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
53,5563,"[283713, 184549, 143175, 55466, 160349, 374648..."
54,55783,"[283713, 184549, 143175, 55466, 160349, 374648..."
55,55787,"[283713, 184549, 143175, 55466, 160349, 374648..."
56,55816,"[283713, 184549, 143175, 55466, 160349, 374648..."


### Age == None

In [71]:
first_data = list(data_test2['user_id'])
for x in list(all_recomend['Id']):
    first_data.remove(x)

#### Мы присваиваем им самое популярное значение среди всех пользователей

In [72]:
test_dates = data_inter['start_date'].unique()[-7:]
test_dates = list(pairwise(test_dates))
split_dates = test_dates[0]
train = data_inter[data_inter['start_date'] < split_dates[0]]
test = data_inter[(data_inter['start_date'] >= split_dates[0]) & (data_inter['start_date'] < split_dates[1])]
test = test[(test['rating'] >= 4) | (test['rating'].isnull())]
split_dates, train.shape, test.shape

((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1547806, 5),
 (1783, 5))

In [73]:
pop_model = PopularRecommender(days=7, dt_column='start_date')
pop_model.fit(train)

In [74]:
new_recs = pd.DataFrame({'Id': first_data})
top_N = 10
new_recs['Predicted'] = pop_model.recommend(new_recs['Id'], N=top_N)
all_recomend = all_recomend.append(new_recs)
all_recomend

Unnamed: 0,Id,Predicted
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
2,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,10031,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
542,97203,"[283713, 276903, 184549, 168963, 357309, 38528..."
543,97700,"[283713, 276903, 184549, 168963, 357309, 38528..."
544,97875,"[283713, 276903, 184549, 168963, 357309, 38528..."
545,98659,"[283713, 276903, 184549, 168963, 357309, 38528..."


In [75]:
data_test3 = pd.DataFrame({'Id': data_test2['user_id']})
last_recs = data_test3.merge(all_recomend)
last_recs

Unnamed: 0,Id,Predicted
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,10002,"[283713, 276903, 184549, 168963, 357309, 38528..."
2,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
3069,99734,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3070,99757,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3071,99772,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3072,99827,"[80003, 283713, 184549, 385281, 287060, 276903..."


In [76]:
last_recs.to_csv('baseline6.csv', index=False)

In [13]:
recs.to_csv('baseline_days10.csv', index=False)