# Библиотеки

In [2]:
import os
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from tqdm.auto import tqdm
from datetime import datetime

### class TimeRangeSplit()

In [3]:
class TimeRangeSplit():
    """
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html
    """
    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='D', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 closed=None, 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be non-zero, not both at the same time.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        self.date_range = pd.date_range(
            start=start_date, 
            end=end_date, 
            freq=freq, 
            periods=periods, 
            tz=tz, 
            normalize=normalize, 
            closed=closed)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.") 

    def split(self, 
              df, 
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        for start, end in pairwise(date_range):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                new = np.setdiff1d(
                    df.loc[test_idx, user_column].unique(), 
                    df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                # test_mask = rd.df.index.isin(test_idx)
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

### class PopularRecommender()

In [4]:
class PopularRecommender():
    def __init__(self, max_K=100, days=30, item_column='item_id', dt_column='date'):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        self.recommendations = []
        
    def fit(self, df, ):
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values
    
    def recommend(self, users=None, N=10):
        recs = self.recommendations[:N]
        if users is None:
            return recs
        else:
            return list(islice(cycle([recs]), len(users)))

In [5]:
data_inter = pd.read_csv('input/interactions_preprocessed.csv')
data_users = pd.read_csv('input/users_preprocessed.csv')
data_items = pd.read_csv('input/items_preprocessed.csv')
data_inter['start_date'] = pd.to_datetime(data_inter['start_date'])

In [8]:
big_data = data_inter.merge(data_users) # Соединяем таблицы в одну по столбцу user_id

In [9]:
data_test2 = pd.read_csv('sample_submission2.csv')
big_data_test = data_test2.merge(data_users)

# Тренировка модели 

In [10]:
def recomend1(age):    
    pop_model = PopularRecommender(dt_column='start_date')# days=7,    
    top_N = 10
    pop_model.fit(big_data[big_data['age'] == age])
    recs = pd.DataFrame({'Id': big_data_test['user_id'][big_data['age'] == age].unique()})    
    recs['Predict'] = pop_model.recommend(recs['Id'], N=top_N)
    return recs

In [11]:
all_recomend = pd.DataFrame()
s = 0
for ages in ['55_64', '35_44', '25_34', '45_54', '18_24', '65_inf']: #, big_data['age'][86356]       
    new_recs = recomend1(ages)
    all_recomend = all_recomend.append(new_recs)
all_recomend

Unnamed: 0,Id,Predict
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
2,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,10031,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
53,5563,"[283713, 184549, 143175, 55466, 160349, 374648..."
54,55783,"[283713, 184549, 143175, 55466, 160349, 374648..."
55,55787,"[283713, 184549, 143175, 55466, 160349, 374648..."
56,55816,"[283713, 184549, 143175, 55466, 160349, 374648..."


In [12]:
first_data = list(data_test2['user_id'])
for x in list(all_recomend['Id']):
    first_data.remove(x)

In [13]:
test_dates = data_inter['start_date'].unique()[-7:]
test_dates = list(pairwise(test_dates))
split_dates = test_dates[0]
train = data_inter[data_inter['start_date'] < split_dates[0]]
test = data_inter[(data_inter['start_date'] >= split_dates[0]) & (data_inter['start_date'] < split_dates[1])]
test = test[(test['rating'] >= 4) | (test['rating'].isnull())]
split_dates, train.shape, test.shape

((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1547806, 5),
 (1783, 5))

In [14]:
pop_model = PopularRecommender(days=7, dt_column='start_date')
pop_model.fit(train)

In [15]:
new_recs = pd.DataFrame({'Id': first_data})
top_N = 10
new_recs['Predict'] = pop_model.recommend(new_recs['Id'], N=top_N)
all_recomend = all_recomend.append(new_recs)
all_recomend

Unnamed: 0,Id,Predict
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
2,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,10031,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
542,97203,"[283713, 276903, 184549, 168963, 357309, 38528..."
543,97700,"[283713, 276903, 184549, 168963, 357309, 38528..."
544,97875,"[283713, 276903, 184549, 168963, 357309, 38528..."
545,98659,"[283713, 276903, 184549, 168963, 357309, 38528..."


In [16]:
data_test3 = pd.DataFrame({'Id': data_test2['user_id']})
last_recs = data_test3.merge(all_recomend)
last_recs

Unnamed: 0,Id,Predict
0,10001,"[283713, 184549, 143175, 55466, 168963, 276903..."
1,10002,"[283713, 276903, 184549, 168963, 357309, 38528..."
2,100152,"[283713, 184549, 143175, 55466, 168963, 276903..."
3,100197,"[283713, 184549, 143175, 55466, 168963, 276903..."
4,100284,"[283713, 184549, 143175, 55466, 168963, 276903..."
...,...,...
3069,99734,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3070,99757,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3071,99772,"[80003, 357309, 385281, 184549, 56877, 276903,..."
3072,99827,"[80003, 283713, 184549, 385281, 287060, 276903..."


## Sex 1.0

In [17]:
pop_model_sex1 = PopularRecommender( dt_column='start_date')# days=7,
pop_model_sex1.fit(big_data[big_data['sex'] == 1.0])

In [18]:
top10_recs_sex1 = pop_model_sex1.recommend()
top10_recs_sex1

array([283713, 184549, 276903, 357309,  55466, 385281, 143175, 352049,
       168963, 112869], dtype=int64)

In [19]:
recs = pd.DataFrame({'Id': big_data_test['user_id'][big_data_test['sex'] == 1.0].unique()})
top_N = 10
recs['Predict'] = pop_model_sex1.recommend(recs['Id'], N=top_N)
recs

Unnamed: 0,Id,Predict
0,100197,"[283713, 184549, 276903, 357309, 55466, 385281..."
1,100284,"[283713, 184549, 276903, 357309, 55466, 385281..."
2,10031,"[283713, 184549, 276903, 357309, 55466, 385281..."
3,100412,"[283713, 184549, 276903, 357309, 55466, 385281..."
4,100562,"[283713, 184549, 276903, 357309, 55466, 385281..."
...,...,...
643,98220,"[283713, 184549, 276903, 357309, 55466, 385281..."
644,98593,"[283713, 184549, 276903, 357309, 55466, 385281..."
645,98891,"[283713, 184549, 276903, 357309, 55466, 385281..."
646,98906,"[283713, 184549, 276903, 357309, 55466, 385281..."


## Sex 0.0

In [20]:
pop_model_sex2 = PopularRecommender(days=7, dt_column='start_date')
pop_model_sex2.fit(big_data[big_data['sex'] == 0.0])

In [21]:
top10_recs_sex2 = pop_model_sex2.recommend()
top10_recs_sex2

array([283713,  89130, 184549, 344047, 276903, 168963,  80003, 143175,
        56877, 385281], dtype=int64)

In [23]:
recs2 = pd.DataFrame({'Id': big_data_test['user_id'][big_data_test['sex'] == 0.0].unique()})
top_N = 10
recs2['Predict'] = pop_model_sex2.recommend(recs2['Id'], N=top_N)
recs2.head()

Unnamed: 0,Id,Predict
0,10001,"[283713, 89130, 184549, 344047, 276903, 168963..."
1,100152,"[283713, 89130, 184549, 344047, 276903, 168963..."
2,100428,"[283713, 89130, 184549, 344047, 276903, 168963..."
3,100450,"[283713, 89130, 184549, 344047, 276903, 168963..."
4,100735,"[283713, 89130, 184549, 344047, 276903, 168963..."


## Sex = NaN

In [24]:
first_data = list(data_test2['user_id'])
for x in list(recs.append(recs2)['Id']):
    first_data.remove(x)

## Валидация

In [25]:
test_dates = data_inter['start_date'].unique()[-7:]
test_dates = list(pairwise(test_dates))
split_dates = test_dates[0]
train = data_inter[data_inter['start_date'] < split_dates[0]]
test = data_inter[(data_inter['start_date'] >= split_dates[0]) & (data_inter['start_date'] < split_dates[1])]
test = test[(test['rating'] >= 4) | (test['rating'].isnull())]
split_dates, train.shape, test.shape

((numpy.datetime64('2019-12-25T00:00:00.000000000'),
  numpy.datetime64('2019-12-26T00:00:00.000000000')),
 (1547806, 5),
 (1783, 5))

In [26]:
pop_model = PopularRecommender(days=7, dt_column='start_date')
pop_model.fit(train)

In [27]:
top10_recs = pop_model.recommend()
top10_recs
#283713, 276903, 184549, 168963, 357309, 385281,  50718, 112869,242176,  93751] на train
#[283713, 276903, 184549, 168963,  55466, 385281, 357309, 352049,143175, 267817] на всей

array([283713, 276903, 184549, 168963, 357309, 385281,  50718, 112869,
       242176,  93751], dtype=int64)

In [28]:
recs3 = pd.DataFrame({'Id': first_data})
top_N = 10
recs3['Predict'] = pop_model.recommend(recs3['Id'], N=top_N)
recs3.head()

Unnamed: 0,Id,Predict
0,10002,"[283713, 276903, 184549, 168963, 357309, 38528..."
1,100959,"[283713, 276903, 184549, 168963, 357309, 38528..."
2,101968,"[283713, 276903, 184549, 168963, 357309, 38528..."
3,10197,"[283713, 276903, 184549, 168963, 357309, 38528..."
4,102088,"[283713, 276903, 184549, 168963, 357309, 38528..."


In [29]:
data_test3 = pd.DataFrame({'Id': data_test2['user_id']})
data_test3.head()

Unnamed: 0,Id
0,10001
1,10002
2,100152
3,100197
4,100284


# Соединяем всё в одну таблицу

In [30]:
last_recs = data_test3.merge(recs.append(recs2.append(recs3)))
last_recs.head()

Unnamed: 0,Id,Predict
0,10001,"[283713, 89130, 184549, 344047, 276903, 168963..."
1,10002,"[283713, 276903, 184549, 168963, 357309, 38528..."
2,100152,"[283713, 89130, 184549, 344047, 276903, 168963..."
3,100197,"[283713, 184549, 276903, 357309, 55466, 385281..."
4,100284,"[283713, 184549, 276903, 357309, 55466, 385281..."


In [111]:
last_recs.to_csv('baseline4.csv', index=False)