In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
from tqdm.contrib import tzip
from sklearn.model_selection import train_test_split
from catboost import CatBoostRanker,Pool,cv,CatBoostRegressor, CatBoostClassifier
import gc

In [2]:
train_df = pd.read_csv('./train.csv',parse_dates=['timestamp'])
user_data = pd.read_csv('user.csv')
video_data = pd.read_csv('video.csv')
owner_data = pd.read_csv('owner.csv')

In [None]:
val_predicts_tf = pd.read_parquet("TFIDF_100_preds_val.parquet")
val_predicts_als = pd.read_parquet("ALS_100_preds_val.parquet")

In [3]:
def chrono_split(
        df: pd.DataFrame, 
        split_by_column: str = 'user_id', 
        ratio: float = 0.7, 
        col_timestamp: str = 'timestamp'):

    df = df.sort_values([split_by_column, col_timestamp])
    groups = df.groupby(split_by_column)

    df["count"] = groups[split_by_column].transform("count")
    df["rank_s"] = groups.cumcount() + 1

    ratio = [ratio, 1 - ratio]
    splits = []
    prev_threshold = None
    for threshold in np.cumsum(ratio):
        condition = df["rank_s"] <= round(threshold * df["count"])
        if prev_threshold is not None:
            condition &= df["rank_s"] > round(prev_threshold * df["count"])
        splits.append(df[condition].drop(["rank_s", "count"], axis=1))
        prev_threshold = threshold

    return splits

def train_val_split(
        train_df: pd.DataFrame, 
        val_users_n: int = 200_000):
    
    user_ids = train_df['user_id'].unique()
    user_ids_val = random.sample(list(user_ids), val_users_n)
    condition = train_df['user_id'].isin(user_ids_val)

    val = train_df[condition]
    val_no_targets, val_targets = chrono_split(val, ratio=0.7)

    train = pd.concat([train_df[~condition], val_no_targets]).sort_values('timestamp')
    return train, val_no_targets, val_targets

In [4]:
random.seed(56)
train, val_no_targets, val_targets = train_val_split(train_df,val_users_n=100_000)

In [6]:
val_group = val_targets.groupby('user_id')
act = val_group['video_id'].agg(lambda x:x.tolist()).tolist()
val_users = val_group.agg(lambda x:x.tolist()).index.tolist()
val_predicts_tf['user_id'] = val_users
val_predicts_tf['rank_i'] = val_predicts_tf['preds'].map(lambda x: list(range(len(x))))
val_predicts_als['user_id'] = val_users
val_predicts_als['rank_i'] = val_predicts_als['preds'].map(lambda x: list(range(len(x))))

In [5]:
def create_data(predicts):
    ranks = []
    video_id = []
    user_id = []
    rank_id = []
    for idx in tqdm(predicts.index):
        id = predicts.loc[idx]['user_id']
        for pred,rank,r_id in zip(predicts['preds'][idx],predicts['scores'][idx],predicts['rank_i'][idx]):
            ranks.append(rank)
            video_id.append(pred)
            user_id.append(id)
            rank_id.append(r_id)
    
    df = pd.DataFrame()
    df['user_id'] = user_id
    df['ranks'] = ranks
    df['rank_id'] = rank_id
    df['video_id'] = video_id
    
    return df

In [8]:
df_tf = create_data(val_predicts_tf)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [9]:
df_als = create_data(val_predicts_als)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [10]:
df_tf = df_tf.rename({'ranks':'ranks_tf','rank_id':'rank_id_tf'},axis=1)
df_als = df_als.rename({'ranks':'ranks_als','rank_id':'rank_id_als'},axis=1)

In [11]:
tqdm.pandas()
df_tf['user_id_per_video_id'] = df_tf.progress_apply(lambda x: str(x.video_id) + '_' + str(x.user_id),axis=1)
df_als['user_id_per_video_id'] = df_als.progress_apply(lambda x: str(x.video_id) + '_' + str(x.user_id),axis=1)

  0%|          | 0/9995539 [00:00<?, ?it/s]

  0%|          | 0/10000000 [00:00<?, ?it/s]

In [12]:
df_als = df_als.set_index('user_id_per_video_id')
df_tf = df_tf.set_index('user_id_per_video_id')

In [13]:
df_tf

Unnamed: 0_level_0,user_id,ranks_tf,rank_id_tf,video_id
user_id_per_video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
180158.0_0.0,0,2.503461,0,180158
25005.0_0.0,0,2.279144,1,25005
234577.0_0.0,0,1.912787,2,234577
223373.0_0.0,0,1.843318,3,223373
216068.0_0.0,0,1.643954,4,216068
...,...,...,...,...
98504.0_152909.0,152909,1.058487,95,98504
179093.0_152909.0,152909,1.050762,96,179093
166381.0_152909.0,152909,1.044082,97,166381
83663.0_152909.0,152909,1.013861,98,83663


In [15]:
df_all = pd.concat([df_als,df_tf],axis=1)

In [26]:
df_all['user_id'] = df_all['user_id'].fillna(-1)
users_id = np.max(df_all['user_id'].values,axis=-1)

In [34]:
df_all['video_id'] = df_all['video_id'].fillna(-1)
video_id = np.max(df_all['video_id'].values,axis=-1)

In [36]:
video_id.shape

(15760373,)

In [37]:
df_all = df_all.drop(['user_id','video_id'],axis=1)
df_all['video_id'] = video_id

NameError: name 'user_id' is not defined

In [38]:
df_all['user_id'] = users_id

In [39]:
df_all['rank_id_als'] = df_all['rank_id_als'].fillna(500)
df_all['rank_id_tf'] = df_all['rank_id_tf'].fillna(500)
df_all['ranks_als'] = df_all['ranks_als'].fillna(-1)
df_all['ranks_tf'] = df_all['ranks_tf'].fillna(-1)

In [61]:
del df_als
del df_tf

In [42]:
df = df_all

In [43]:
val_targets['video_id_liked'] = val_targets.apply(lambda x: x.video_id if x.interaction_type == 'like' else -1,axis=1)

In [44]:
val_targets_base = val_targets.groupby('user_id')['video_id'].agg(lambda x:x.tolist())
val_targets_like = val_targets.groupby('user_id')['video_id_liked'].agg(lambda x:x.tolist())

In [45]:
val_targets_like = val_targets_like.map(set)
val_targets_base = val_targets_base.map(set)

In [46]:
def set_label(df,val_targets_like,val_targets_base):
    labels = []
    for user,item in tzip(df['user_id'],df['video_id']):
        if item in val_targets_like[user]:
            labels += [1]
        elif item in val_targets_base[user]:
            labels += [0.1]
        else:
            labels += [0]
    df['label'] = labels
    return df

In [47]:
df = set_label(df,val_targets_like,val_targets_base)

  0%|          | 0/15760373 [00:09<?, ?it/s]

In [49]:
df = df.reset_index()

In [51]:
df.to_parquet('DF.parquet.gzip',compression='gzip')

In [7]:
df = pd.read_parquet('DF.parquet.gzip')

In [5]:
def get_video_like_watching_features(df):
    df['is_like'] = df['interaction_type'].map(lambda x:1 if x == 'like' else 0)
    features = df.groupby('video_id')['is_like'].agg(['sum','mean','count'])
    #features['video_mean_age'] = df.groupby('video_id')['age'].agg(['mean'])
    #features['video_std_age'] = df.groupby('video_id')['age'].agg(['std'])
    features['_'] = range(features.shape[0])
    features['video_id'] = features.index
    features = features.set_index('_')
    return features

video_like_faeatures = get_video_like_watching_features(train)

In [6]:
video_data = video_data.merge(owner_data)

In [7]:
video_data = video_data.merge(video_like_faeatures)

In [8]:
def generate_video_owner_features(video_data):
    video_data['owner_count'] = video_data.groupby('owner_id')['video_id'].transform('count')
    video_data['owner_mean_duration'] = video_data.groupby('owner_id')['duration'].transform('mean')
    video_data['owner_min_duration'] = video_data.groupby('owner_id')['duration'].transform('min')
    video_data['owner_max_duration'] = video_data.groupby('owner_id')['duration'].transform('max')
    video_data['owner_std_duration'] = video_data.groupby('owner_id')['duration'].transform('std')
    video_data['video_hour_after_create'] = (video_data['upload_timestamp'] - video_data['create_date']) / 3600
    video_data['video_hour_after_last_active'] = (video_data['upload_timestamp'] - video_data['last_active_date']) / 3600
    
    video_data['owner_like_sum'] = video_data.groupby('owner_id')['sum'].transform('sum')
    video_data['owner_like_mean'] = video_data.groupby('owner_id')['sum'].transform('mean')
    video_data['owner_like_max'] = video_data.groupby('owner_id')['sum'].transform('max')
    video_data['owner_like_min'] = video_data.groupby('owner_id')['sum'].transform('min')
    video_data['owner_like_std'] = video_data.groupby('owner_id')['sum'].transform('std')

    video_data['owner_count_sum'] = video_data.groupby('owner_id')['count'].transform('sum')
    video_data['owner_count_mean'] = video_data.groupby('owner_id')['count'].transform('mean')
    video_data['owner_count_max'] = video_data.groupby('owner_id')['count'].transform('max')
    video_data['owner_count_min'] = video_data.groupby('owner_id')['count'].transform('min')
    video_data['owner_count_std'] = video_data.groupby('owner_id')['count'].transform('std')
    
    video_data['ow_duration'] = video_data['duration'] / video_data['owner_mean_duration']
    return video_data

In [9]:
def generate_user_features(user_data):
    user_data['count_city_age'] = user_data.groupby('city_id')['age'].transform('count')
    user_data['mean_city_age'] = user_data.groupby('city_id')['age'].transform('mean')
    user_data['otn_city_age_user_age'] = user_data['age'] / user_data['mean_city_age']
    return user_data
user_data = generate_user_features(user_data)

In [10]:
user_data = user_data.fillna(-1)

In [11]:
user_data

Unnamed: 0,user_id,gender,age,language,city_id,birth_city_id,create_date,count_city_age,mean_city_age,otn_city_age_user_age
0,938,F,57.0,ru,5f7ca800fcb9368f78e3740cb68a4c4ebc62b005cd15cf...,-1,2011-03-11 21:00:00,3680.0,59.280707,0.961527
1,57571,F,68.0,ru,-1,-1,2011-03-11 21:00:00,-1.0,-1.000000,-1.000000
2,50873,F,66.0,ru,a26a3a5b73942ee4af156df272ba2e722ddf4eb50ea396...,-1,2011-03-11 21:00:00,291.0,63.896907,1.032914
3,4335,F,66.0,ru,5f7ca800fcb9368f78e3740cb68a4c4ebc62b005cd15cf...,-1,2011-03-11 21:00:00,3680.0,59.280707,1.113347
4,42138,F,66.0,ru,e1baf026d2d5c938c8ca66bac84c655345b92333a74937...,6b415eabf81e3cc85adac7d323a989a92198e76fe3f053...,2011-03-11 21:00:00,653.0,62.369066,1.058217
...,...,...,...,...,...,...,...,...,...,...
152906,79591,M,17.0,ru,-1,-1,2022-10-10 21:00:00,-1.0,-1.000000,-1.000000
152907,93648,F,101.0,ru,-1,-1,2022-10-10 21:00:00,-1.0,-1.000000,-1.000000
152908,42337,M,37.0,ru,-1,-1,2022-10-10 21:00:00,-1.0,-1.000000,-1.000000
152909,4694,F,78.0,ru,cc15ba5ee579964ee4d5b88d1b7d6e31ca5b3ddda47366...,cc15ba5ee579964ee4d5b88d1b7d6e31ca5b3ddda47366...,2022-10-10 21:00:00,1.0,78.000000,1.000000


In [12]:
video_data = generate_video_owner_features(video_data)

In [13]:
video_data.columns

Index(['video_id', 'owner_id', 'duration', 'upload_timestamp',
       'subscribers_count', 'last_active_date', 'city_id', 'create_date',
       'sum', 'mean', 'count', 'owner_count', 'owner_mean_duration',
       'owner_min_duration', 'owner_max_duration', 'owner_std_duration',
       'video_hour_after_create', 'video_hour_after_last_active',
       'owner_like_sum', 'owner_like_mean', 'owner_like_max', 'owner_like_min',
       'owner_like_std', 'owner_count_sum', 'owner_count_mean',
       'owner_count_max', 'owner_count_min', 'owner_count_std', 'ow_duration'],
      dtype='object')

In [17]:
df = df.merge(user_data)

In [18]:
df = df.merge(video_data,on='video_id')

In [19]:
df.columns

Index(['user_id_per_video_id', 'ranks_als', 'rank_id_als', 'ranks_tf',
       'rank_id_tf', 'video_id', 'user_id', 'label', 'gender', 'age',
       'language', 'city_id_x', 'birth_city_id', 'create_date_x',
       'count_city_age', 'mean_city_age', 'otn_city_age_user_age', 'owner_id',
       'duration', 'upload_timestamp', 'subscribers_count', 'last_active_date',
       'city_id_y', 'create_date_y', 'sum', 'mean', 'count', 'owner_count',
       'owner_mean_duration', 'owner_min_duration', 'owner_max_duration',
       'owner_std_duration', 'video_hour_after_create',
       'video_hour_after_last_active', 'owner_like_sum', 'owner_like_mean',
       'owner_like_max', 'owner_like_min', 'owner_like_std', 'owner_count_sum',
       'owner_count_mean', 'owner_count_max', 'owner_count_min',
       'owner_count_std', 'ow_duration'],
      dtype='object')

In [20]:
def get_df_features(df): 
    df['mean_age_video'] = df.groupby('video_id')['age'].transform('mean')
    df['std_age_video'] = df.groupby('video_id')['age'].transform('std')
    df['birth_city_not_none'] = df['birth_city_id'].apply(lambda x: int(x == -1))
    df['city_not_none'] = df['city_id_x'].apply(lambda x: int(x == -1))
    df['city_rav1'] = df.progress_apply(lambda x: int(x.city_id_x == x.city_id_y and x.city_id_x != -1),axis=1)
    df['city_rav2'] = df.progress_apply(lambda x: int(x.birth_city_id == x.city_id_y and x.city_id_x != -1),axis=1)
    df['city_rav3'] = df.progress_apply(lambda x: int(x.city_id_x == x.birth_city_id and x.city_id_x != -1),axis=1)
    return df

In [75]:
#tqdm.pandas()
#df = get_df_features(df)

In [21]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=0.20, n_splits=2, random_state=56)
split = splitter.split(df, groups=df['user_id'])
train_inds, test_inds = next(split)

train_df = df.iloc[train_inds]
val_df = df.iloc[test_inds]

In [22]:
del df
del val_no_targets
del train
del train_inds
del test_inds

In [23]:
val_df = val_df.sort_values(by='user_id')
train_df = train_df.sort_values(by='user_id')

In [24]:
train_df['user_id'] = train_df['user_id'].astype(int)
val_df['user_id'] = val_df['user_id'].astype(int)

In [25]:
train_df['user_id'] 

0                0
1560413          0
510111           0
2936030          0
2963784          0
             ...  
10699509    152906
1180623     152906
10667633    152906
2298079     152906
11177641    152906
Name: user_id, Length: 12607913, dtype: int64

In [26]:
cat_features = ['gender','language']

train_pool = Pool(data=train_df.drop(['label','user_id_per_video_id','city_id_x','city_id_y','birth_city_id','create_date_x'],axis=1),
                  label=train_df['label'],
                  group_id=train_df['user_id'],
                  cat_features=cat_features)

eval_pool = Pool(data=val_df.drop(['label','user_id_per_video_id','city_id_x','city_id_y','birth_city_id','create_date_x'],axis=1),
                  label=val_df['label'],
                  group_id=val_df['user_id'],
                  cat_features=cat_features)

In [27]:
del train_df
del val_df

In [31]:
params = {'iterations':1000,
          'learning_rate':0.1,
          'loss_function':'YetiRank', #'YetiRank'
          'eval_metric':'AUC', #'AUC'
          'max_depth':5,
          'task_type':'GPU',
          'random_seed':56}

cbm = CatBoostRanker(**params)

In [32]:
cbm.fit(train_pool,eval_set=eval_pool,verbose=50)

Groupwise loss function. OneHotMaxSize set to 10


Default metric period is 5 because AUC, PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7164899	best: 0.7164899 (0)	total: 623ms	remaining: 10m 22s
50:	test: 0.7479761	best: 0.7479761 (50)	total: 17.2s	remaining: 5m 20s
100:	test: 0.7586238	best: 0.7586238 (100)	total: 33.7s	remaining: 5m
150:	test: 0.7657383	best: 0.7657383 (150)	total: 50.2s	remaining: 4m 42s
200:	test: 0.7703261	best: 0.7703261 (200)	total: 1m 6s	remaining: 4m 24s
250:	test: 0.7736883	best: 0.7736883 (250)	total: 1m 22s	remaining: 4m 7s
300:	test: 0.7763650	best: 0.7763650 (300)	total: 1m 39s	remaining: 3m 50s
350:	test: 0.7782614	best: 0.7782614 (350)	total: 1m 55s	remaining: 3m 33s
400:	test: 0.7798646	best: 0.7798646 (400)	total: 2m 11s	remaining: 3m 17s
450:	test: 0.7810817	best: 0.7810817 (450)	total: 2m 28s	remaining: 3m
500:	test: 0.7823409	best: 0.7823409 (500)	total: 2m 44s	remaining: 2m 43s
550:	test: 0.7833691	best: 0.7833691 (550)	total: 3m 1s	remaining: 2m 27s
600:	test: 0.7844497	best: 0.7844497 (600)	total: 3m 17s	remaining: 2m 10s
650:	test: 0.7852430	best: 0.7852430 (650)	to

<catboost.core.CatBoostRanker at 0x7f402014af10>

In [33]:
cbm.save_model('ranker_ensemble.cbm')

In [34]:
### ==|SCORE|==|UPDATE|==
### ==0.7420603408==BASE
###==0.7666423==VideoOwnerFeatures
## ==

In [48]:
scores = cbm.predict(eval_pool)
val_df['scores'] = scores

In [49]:
group_val = val_no_targets.groupby('user_id')['video_id'].agg(lambda x:x.tolist())
group_val = group_val.loc[val_df['user_id'].unique()].sort_index()

In [50]:
group_val

user_id
8         [11769, 14774, 172302, 242673, 220615, 239932,...
20        [148572, 143968, 88959, 151690, 129636, 181198...
27        [217308, 51440, 166825, 221646, 200650, 181713...
44        [207669, 68417, 89139, 113977, 82160, 201397, ...
50        [222069, 246720, 150238, 96686, 50288, 45012, ...
                                ...                        
152869    [2364, 2364, 174749, 174749, 166282, 166282, 2...
152876    [60830, 60830, 53988, 51296, 130049, 219290, 7...
152880    [133327, 116973, 220739, 6053, 164736, 245914,...
152905    [3701, 188589, 188589, 31412, 977, 65611, 2048...
152909    [69651, 201126, 242469, 113564, 44054, 146298,...
Name: video_id, Length: 20000, dtype: object

In [51]:
#group_val = group_val.tolist()
steps = val_df.groupby('user_id')['ranks'].agg('count').sort_index()

In [52]:
val_df

Unnamed: 0,user_id,ranks,video_id,label,gender,age,language,duration,_,sum,mean,count,scores
1593955,8,2.986083,191191,0.0,M,68.0,ru,181,152650,1546,0.161176,9592,-0.985647
6195652,8,4.364005,239707,0.0,M,68.0,ru,348,191436,288,0.138929,2073,-0.461509
6202351,8,4.274849,105431,0.0,M,68.0,ru,11,84201,0,0.000000,612,-0.831370
1226942,8,3.644911,63158,0.0,M,68.0,ru,55,50539,199,0.028303,7031,-1.709086
6205021,8,4.068806,51225,0.0,M,68.0,ru,358,41016,104,0.109705,948,-0.557722
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9413518,152909,1.681226,27060,0.0,M,77.0,ru,232,21682,7,0.189189,37,-1.580587
3367290,152909,1.892947,53929,0.0,M,77.0,ru,32,43163,1189,0.191219,6218,-1.462537
7211464,152909,2.062833,89801,0.0,M,77.0,ru,177,71779,636,0.231105,2752,-0.899935
1038959,152909,2.501467,56158,0.0,M,77.0,ru,182,44949,1988,0.248562,7998,-0.662481


In [53]:
group_val.loc[8]

[11769,
 14774,
 172302,
 242673,
 220615,
 239932,
 94185,
 183606,
 6,
 222701,
 141954,
 108333,
 72103,
 147926,
 189012,
 102474,
 166825,
 251684,
 103809,
 207377,
 93407,
 240571,
 242190]

In [55]:
model_ranks,als_ranks = [],[]
start = 0
for step in tqdm(steps):
    df_ = val_df.iloc[start:start+step]
    start += step
    als_ranks += [df_.sort_values(by='ranks')['video_id'][::-1][:10].tolist()]
    model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]

  0%|          | 0/20000 [00:00<?, ?it/s]

  als_ranks += [df_.sort_values(by='ranks')['video_id'][::-1][:10].tolist()]
  model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]


In [56]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])


In [57]:
mapk(group_val,model_ranks,k=10)

0.0002357142857142857

In [58]:
#### INFERENSE

In [15]:
test_predicts_tf = pd.read_parquet("TFIDF_100_preds_test.parquet")
test_predicts_als = pd.read_parquet("ALS_100_preds_test.parquet")

In [16]:
sample_sub = pd.read_csv('TFIDFV2.csv')['user_id']
test_predicts_tf['user_id'] = sample_sub.values
test_predicts_tf['rank_i'] = test_predicts_tf['preds'].map(lambda x: list(range(len(x))))
test_predicts_als['user_id'] = sample_sub.values
test_predicts_als['rank_i'] = test_predicts_als['preds'].map(lambda x: list(range(len(x))))

In [17]:
df_tf = create_data(test_predicts_tf)

  0%|          | 0/152911 [00:00<?, ?it/s]

In [18]:
df_als = create_data(test_predicts_als)

  0%|          | 0/152911 [00:00<?, ?it/s]

In [19]:
df_tf = df_tf.rename({'ranks':'ranks_tf','rank_id':'rank_id_tf'},axis=1)
df_als = df_als.rename({'ranks':'ranks_als','rank_id':'rank_id_als'},axis=1)

In [26]:
tqdm.pandas()
df_tf['user_id_per_video_id'] = df_tf.progress_apply(lambda x: str(x.video_id) + '_' + str(x.user_id),axis=1)
df_als['user_id_per_video_id'] = df_als.progress_apply(lambda x: str(x.video_id) + '_' + str(x.user_id),axis=1)

  0%|          | 0/15287689 [00:00<?, ?it/s]

  0%|          | 0/15291100 [00:00<?, ?it/s]

In [27]:
df_als = df_als.set_index('user_id_per_video_id')
df_tf = df_tf.set_index('user_id_per_video_id')

In [28]:
df_all = pd.concat([df_als,df_tf],axis=1)

In [29]:
df_all

Unnamed: 0_level_0,user_id,ranks_als,rank_id_als,video_id,user_id,ranks_tf,rank_id_tf,video_id
user_id_per_video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18999.0_938.0,938.0,0.136298,0.0,18999.0,938.0,6.027133,1.0,18999.0
6.0_938.0,938.0,0.127246,1.0,6.0,938.0,6.432595,0.0,6.0
43275.0_938.0,938.0,0.116085,2.0,43275.0,938.0,4.984624,8.0,43275.0
207669.0_938.0,938.0,0.103372,3.0,207669.0,938.0,5.237797,6.0,207669.0
127197.0_938.0,938.0,0.101942,4.0,127197.0,938.0,5.299776,4.0,127197.0
...,...,...,...,...,...,...,...,...
103565.0_25414.0,,,,,25414.0,2.387932,86.0,103565.0
52665.0_25414.0,,,,,25414.0,2.267152,89.0,52665.0
80920.0_25414.0,,,,,25414.0,2.221922,92.0,80920.0
165997.0_25414.0,,,,,25414.0,2.120616,97.0,165997.0


In [30]:
df_all['video_id'] = df_all['video_id'].fillna(-1)
video_id = np.max(df_all['video_id'].values,axis=-1)

In [31]:
df_all['user_id'] = df_all['user_id'].fillna(-1)
users_id = np.max(df_all['user_id'].values,axis=-1)

In [32]:
df_all = df_all.drop(['user_id','video_id'],axis=1)
df_all['video_id'] = video_id
df_all['user_id'] = users_id

In [33]:
df = df_all

In [35]:
df['rank_id_als'] = df['rank_id_als'].fillna(500)
df['rank_id_tf'] = df['rank_id_tf'].fillna(500)
df['ranks_als'] = df['ranks_als'].fillna(-1)
df['ranks_tf'] = df['ranks_tf'].fillna(-1)

In [36]:
df.to_parquet('DF_TEST.parquet.gzip',compression='gzip')

In [14]:
df = pd.read_parquet('DF_TEST.parquet.gzip')

In [15]:
df = df.merge(user_data)
df = df.merge(video_data,on='video_id')

In [16]:
df = df.sort_values(by='user_id')

In [17]:
df['user_id'] = df['user_id'].astype(int)

In [18]:
cat_features = ['gender','language']

predict_pool = Pool(data=df.drop(['city_id_x','city_id_y','birth_city_id','create_date_x'],axis=1),
                    group_id=df['user_id'],
                    cat_features=cat_features)

In [19]:
cbm = CatBoostRanker().load_model('ranker_ensemble.cbm')

In [20]:
df['scores'] = cbm.predict(predict_pool)

In [21]:
steps = df.groupby('user_id')['ranks_tf'].agg('count').sort_index()

In [56]:
df 

Unnamed: 0,user_id,ranks,video_id,gender,age,language,owner_id,duration,upload_timestamp,owner_count,...,owner_like_max,owner_like_min,owner_like_std,owner_count_sum,owner_count_mean,owner_count_max,owner_count_min,owner_count_std,ow_duration,scores
14912622,0,2.013695,12631,F,69.0,ru,1419,40,1697070638033,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.539649,-1.872947
14539730,0,2.006510,100123,F,69.0,ru,958,44,1691955973737,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,1.073780,-1.808733
14539404,0,1.819828,76688,F,69.0,ru,958,29,1687094139758,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.707719,-2.082201
14539318,0,2.314240,216068,F,69.0,ru,958,15,1697307412775,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.366061,-0.586644
14913163,0,1.926944,246406,F,69.0,ru,1419,29,1698580510370,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.391246,-1.669668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445559,152910,8.227917,34204,F,66.0,ru,1918,103,1694439379819,72,...,4146,0,488.645136,16547,229.819444,15272,1,1801.303240,1.665020,0.507385
10492759,152910,4.813787,201337,F,66.0,ru,7409,55,1695717010802,149,...,227,0,21.549617,10724,71.973154,2993,1,289.629468,0.552112,-1.121911
6733552,152910,5.735305,226671,F,66.0,ru,2871,15,1696522616923,138,...,736,0,104.690196,45401,328.992754,7895,1,984.763201,0.536408,-0.706107
1788719,152910,6.400785,166381,F,66.0,ru,4852,51,1695892816939,23,...,199,0,48.888556,32367,1407.260870,7893,1,2427.233895,0.278490,-1.670684


In [22]:
steps = steps.tolist()

In [23]:
model_ranks,als_ranks = [],[]
start = 0
for step in tqdm(steps):
    df_ = df.iloc[start:start+step]
    start += step
    als_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]
    model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]

  0%|          | 0/152911 [00:00<?, ?it/s]

  als_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]
  model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]


In [24]:
sample_sub = pd.read_csv('TFIDFV2.csv')

In [25]:
sample_sub = sample_sub.sort_values(by='user_id') 

In [35]:
predicted_cols=[' '.join(map(str, [int(x) for x in i])) for i in model_ranks]

In [36]:
sample_sub['recommendation'] = predicted_cols

In [39]:
sample_sub['count_tr'] = sample_sub['recommendation'].map(lambda x: len(x.split()))

In [40]:
sample_sub

Unnamed: 0,user_id,recommendation,count_tr
55869,0,136879 216068 57681 112870 131005 45512 169519...,10
40655,1,131005 63190 34204 169519 182599 24671 166282 ...,10
51407,2,179468 101082 111331 148896 131005 81866 49114...,10
149226,3,57234 98911 252647 131005 169519 233353 240476...,10
19366,4,238658 249740 76442 4266 202971 19081 51821 42...,10
...,...,...,...
13880,152906,63190 26721 131005 24671 166282 89776 182599 4...,10
83396,152907,210711 234752 57234 50148 131005 60094 169519 ...,10
34792,152908,160838 13740 115913 91466 149902 145482 249196...,10
108083,152909,99967 127736 240476 98911 201813 235932 210711...,10


In [65]:
recs = sample_sub['recommendation']
recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[48959] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/p

In [66]:
sample_sub['recommendation'] = recs

In [41]:
sample_sub['count_tr'].mean()

10.0

In [61]:
sample_socres = pd.read_csv('Simple_catboost.csv')
sample_socres

Unnamed: 0,user_id,recommendation
0,0,180158 223373 187951 25005 185565 46747 78973 ...
1,1,131005 34204 169519 166282 54812 137447 51799 ...
2,2,121660 51358 189727 111331 131005 132060 24524...
3,3,123953 137447 207669 38943 169519 166825 24547...
4,4,34204 169519 6986 13009 89274 238556 133062 18...
...,...,...
152906,152906,131005 34204 166282 217384 241470 169519 54812...
152907,152907,131005 169519 137447 34204 188589 166825 77671...
152908,152908,134995 124778 69105 5380 28917 16658 145482 69...
152909,152909,235932 166825 188589 144081 38943 34204 131005...


In [42]:
sample_sub[['user_id','recommendation']].to_csv('Simple_catboost_ensV2.csv',index=False)

In [32]:
sample_sub

Unnamed: 0,user_id,recommendation,count_tr
55869,0,136879.0 216068.0 57681.0 112870.0 131005.0 45...,10
40655,1,131005.0 63190.0 34204.0 169519.0 182599.0 246...,10
51407,2,179468.0 101082.0 111331.0 148896.0 131005.0 8...,10
149226,3,57234.0 98911.0 252647.0 131005.0 169519.0 233...,10
19366,4,238658.0 249740.0 76442.0 4266.0 202971.0 1908...,10
...,...,...,...
13880,152906,63190.0 26721.0 131005.0 24671.0 166282.0 8977...,10
83396,152907,210711.0 234752.0 57234.0 50148.0 131005.0 600...,10
34792,152908,160838.0 13740.0 115913.0 91466.0 149902.0 145...,10
108083,152909,99967.0 127736.0 240476.0 98911.0 201813.0 235...,10
