In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
from tqdm.contrib import tzip
from sklearn.model_selection import train_test_split
from catboost import CatBoostRanker,Pool,cv,CatBoostRegressor, CatBoostClassifier
import gc

In [2]:
train_df = pd.read_csv('./train.csv',parse_dates=['timestamp'])
user_data = pd.read_csv('user.csv')
video_data = pd.read_csv('video.csv')
owner_data = pd.read_csv('owner.csv')

In [3]:
val_predicts = pd.read_parquet("TFIDF_preds_val.parquet")

In [4]:
def chrono_split(
        df: pd.DataFrame, 
        split_by_column: str = 'user_id', 
        ratio: float = 0.7, 
        col_timestamp: str = 'timestamp'):

    df = df.sort_values([split_by_column, col_timestamp])
    groups = df.groupby(split_by_column)

    df["count"] = groups[split_by_column].transform("count")
    df["rank_s"] = groups.cumcount() + 1

    ratio = [ratio, 1 - ratio]
    splits = []
    prev_threshold = None
    for threshold in np.cumsum(ratio):
        condition = df["rank_s"] <= round(threshold * df["count"])
        if prev_threshold is not None:
            condition &= df["rank_s"] > round(prev_threshold * df["count"])
        splits.append(df[condition].drop(["rank_s", "count"], axis=1))
        prev_threshold = threshold

    return splits

def train_val_split(
        train_df: pd.DataFrame, 
        val_users_n: int = 200_000):
    
    user_ids = train_df['user_id'].unique()
    user_ids_val = random.sample(list(user_ids), val_users_n)
    condition = train_df['user_id'].isin(user_ids_val)

    val = train_df[condition]
    val_no_targets, val_targets = chrono_split(val, ratio=0.7)

    train = pd.concat([train_df[~condition], val_no_targets]).sort_values('timestamp')
    return train, val_no_targets, val_targets

In [5]:
random.seed(56)
train, val_no_targets, val_targets = train_val_split(train_df,val_users_n=100_000)

In [6]:
val_group = val_targets.groupby('user_id')
act = val_group['video_id'].agg(lambda x:x.tolist()).tolist()
val_users = val_group.agg(lambda x:x.tolist()).index.tolist()
val_predicts['user_id'] = val_users
val_predicts['rank_i'] = val_predicts['preds'].map(lambda x: list(range(len(x))))

In [7]:
def create_data(predicts):
    ranks = []
    video_id = []
    user_id = []
    rank_id = []
    for idx in tqdm(predicts.index):
        id = predicts.loc[idx]['user_id']
        for pred,rank in zip(predicts['preds'][idx],predicts['scores'][idx]):
            ranks.append(rank)
            video_id.append(pred)
            user_id.append(id)
    
    df = pd.DataFrame()
    df['user_id'] = user_id
    df['ranks'] = ranks
    df['video_id'] = video_id
    
    return df

In [8]:
df = create_data(val_predicts)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [9]:
val_targets['video_id_liked'] = val_targets.apply(lambda x: x.video_id if x.interaction_type == 'like' else -1,axis=1)

In [10]:
val_targets_base = val_targets.groupby('user_id')['video_id'].agg(lambda x:x.tolist())
val_targets_like = val_targets.groupby('user_id')['video_id_liked'].agg(lambda x:x.tolist())

In [11]:
val_targets_like = val_targets_like.map(set)
val_targets_base = val_targets_base.map(set)

In [12]:
def set_label(df,val_targets_like,val_targets_base):
    labels = []
    for user,item in tzip(df['user_id'],df['video_id']):
        if item in val_targets_like[user]:
            labels += [1]
        elif item in val_targets_base[user]:
            labels += [0.1]
        else:
            labels += [0]
    df['label'] = labels
    return df

In [None]:
df = set_label(df,val_targets_like,val_targets_base)

  0%|          | 0/9995539 [00:00<?, ?it/s]

In [None]:
def get_video_like_watching_features(df):
    df['is_like'] = df['interaction_type'].map(lambda x:1 if x == 'like' else 0)
    features = df.groupby('video_id')['is_like'].agg(['sum','mean','count'])
    features['_'] = range(features.shape[0])
    features['video_id'] = features.index
    features = features.set_index('_')
    return features

video_like_faeatures = get_video_like_watching_features(train)

In [None]:
video_data = video_data.merge(owner_data)

In [None]:
video_data = video_data.merge(video_like_faeatures)

In [None]:
def generate_video_owner_features(video_data):
    video_data['owner_count'] = video_data.groupby('owner_id')['video_id'].transform('count')
    video_data['owner_mean_duration'] = video_data.groupby('owner_id')['duration'].transform('mean')
    video_data['owner_min_duration'] = video_data.groupby('owner_id')['duration'].transform('min')
    video_data['owner_max_duration'] = video_data.groupby('owner_id')['duration'].transform('max')
    video_data['owner_std_duration'] = video_data.groupby('owner_id')['duration'].transform('std')
    video_data['video_hour_after_create'] = (video_data['upload_timestamp'] - video_data['create_date']) / 3600
    video_data['video_hour_after_last_active'] = (video_data['upload_timestamp'] - video_data['last_active_date']) / 3600
    
    video_data['owner_like_sum'] = video_data.groupby('owner_id')['sum'].transform('sum')
    video_data['owner_like_mean'] = video_data.groupby('owner_id')['sum'].transform('mean')
    video_data['owner_like_max'] = video_data.groupby('owner_id')['sum'].transform('max')
    video_data['owner_like_min'] = video_data.groupby('owner_id')['sum'].transform('min')
    video_data['owner_like_std'] = video_data.groupby('owner_id')['sum'].transform('std')

    video_data['owner_count_sum'] = video_data.groupby('owner_id')['count'].transform('sum')
    video_data['owner_count_mean'] = video_data.groupby('owner_id')['count'].transform('mean')
    video_data['owner_count_max'] = video_data.groupby('owner_id')['count'].transform('max')
    video_data['owner_count_min'] = video_data.groupby('owner_id')['count'].transform('min')
    video_data['owner_count_std'] = video_data.groupby('owner_id')['count'].transform('std')
    
    video_data['ow_duration'] = video_data['duration'] / video_data['owner_mean_duration']
    return video_data

In [None]:
video_data = generate_video_owner_features(video_data)

In [None]:
video_data.columns

In [None]:
df = df.merge(user_data[['user_id','gender','age','language']])

In [None]:
df = df.merge(video_data)

In [None]:
#df = df.merge(video_like_faeatures.reset_index())

In [None]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=0.20, n_splits=2, random_state=56)
split = splitter.split(df, groups=df['user_id'])
train_inds, test_inds = next(split)

train_df = df.iloc[train_inds]
val_df = df.iloc[test_inds]

In [None]:
val_df = val_df.sort_values(by='user_id')
train_df = train_df.sort_values(by='user_id')

In [None]:
cat_features = ['gender','language']

train_pool = Pool(data=train_df.drop(['label','city_id'],axis=1),
                  label=train_df['label'],
                  group_id=train_df['user_id'],
                  cat_features=cat_features)

eval_pool = Pool(data=val_df.drop(['label','city_id'],axis=1),
                  label=val_df['label'],
                  group_id=val_df['user_id'],
                  cat_features=cat_features)

In [41]:
params = {'iterations':1000,
          'learning_rate':0.1,
          'loss_function':'RMSE', #'YetiRank'
          'eval_metric':'RMSE', #'AUC'
          'max_depth':6,
          'task_type':'GPU',
          'random_seed':56}

cbm = CatBoostRegressor(**params)

In [42]:
cbm.fit(train_pool,eval_set=eval_pool,verbose=50)

Groupwise loss function. OneHotMaxSize set to 10


Default metric period is 5 because AUC, PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6997108	best: 0.6997108 (0)	total: 345ms	remaining: 5m 44s
50:	test: 0.7401503	best: 0.7401503 (50)	total: 10.8s	remaining: 3m 21s
100:	test: 0.7526741	best: 0.7526741 (100)	total: 21.3s	remaining: 3m 9s
150:	test: 0.7593498	best: 0.7593498 (150)	total: 31.8s	remaining: 2m 58s
200:	test: 0.7631979	best: 0.7631979 (200)	total: 42.3s	remaining: 2m 48s
250:	test: 0.7651100	best: 0.7651100 (250)	total: 52.8s	remaining: 2m 37s
300:	test: 0.7659756	best: 0.7659756 (300)	total: 1m 3s	remaining: 2m 27s
350:	test: 0.7663346	best: 0.7663346 (350)	total: 1m 13s	remaining: 2m 16s
400:	test: 0.7665125	best: 0.7665125 (400)	total: 1m 24s	remaining: 2m 5s
450:	test: 0.7666344	best: 0.7666415 (445)	total: 1m 34s	remaining: 1m 55s
500:	test: 0.7665552	best: 0.7666423 (480)	total: 1m 45s	remaining: 1m 44s
550:	test: 0.7665145	best: 0.7666423 (480)	total: 1m 55s	remaining: 1m 34s
600:	test: 0.7664960	best: 0.7666423 (480)	total: 2m 6s	remaining: 1m 23s
650:	test: 0.7664590	best: 0.7666423 (480

<catboost.core.CatBoostRanker at 0x7f104d74bd30>

In [None]:
### ==|SCORE|==|UPDATE|==
### ==0.7420603408==BASE
###==0.7666423==VideoOwnerFeatures

In [48]:
scores = cbm.predict(eval_pool)
val_df['scores'] = scores

In [49]:
group_val = val_no_targets.groupby('user_id')['video_id'].agg(lambda x:x.tolist())
group_val = group_val.loc[val_df['user_id'].unique()].sort_index()

In [50]:
group_val

user_id
8         [11769, 14774, 172302, 242673, 220615, 239932,...
20        [148572, 143968, 88959, 151690, 129636, 181198...
27        [217308, 51440, 166825, 221646, 200650, 181713...
44        [207669, 68417, 89139, 113977, 82160, 201397, ...
50        [222069, 246720, 150238, 96686, 50288, 45012, ...
                                ...                        
152869    [2364, 2364, 174749, 174749, 166282, 166282, 2...
152876    [60830, 60830, 53988, 51296, 130049, 219290, 7...
152880    [133327, 116973, 220739, 6053, 164736, 245914,...
152905    [3701, 188589, 188589, 31412, 977, 65611, 2048...
152909    [69651, 201126, 242469, 113564, 44054, 146298,...
Name: video_id, Length: 20000, dtype: object

In [51]:
#group_val = group_val.tolist()
steps = val_df.groupby('user_id')['ranks'].agg('count').sort_index()

In [52]:
val_df

Unnamed: 0,user_id,ranks,video_id,label,gender,age,language,duration,_,sum,mean,count,scores
1593955,8,2.986083,191191,0.0,M,68.0,ru,181,152650,1546,0.161176,9592,-0.985647
6195652,8,4.364005,239707,0.0,M,68.0,ru,348,191436,288,0.138929,2073,-0.461509
6202351,8,4.274849,105431,0.0,M,68.0,ru,11,84201,0,0.000000,612,-0.831370
1226942,8,3.644911,63158,0.0,M,68.0,ru,55,50539,199,0.028303,7031,-1.709086
6205021,8,4.068806,51225,0.0,M,68.0,ru,358,41016,104,0.109705,948,-0.557722
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9413518,152909,1.681226,27060,0.0,M,77.0,ru,232,21682,7,0.189189,37,-1.580587
3367290,152909,1.892947,53929,0.0,M,77.0,ru,32,43163,1189,0.191219,6218,-1.462537
7211464,152909,2.062833,89801,0.0,M,77.0,ru,177,71779,636,0.231105,2752,-0.899935
1038959,152909,2.501467,56158,0.0,M,77.0,ru,182,44949,1988,0.248562,7998,-0.662481


In [53]:
group_val.loc[8]

[11769,
 14774,
 172302,
 242673,
 220615,
 239932,
 94185,
 183606,
 6,
 222701,
 141954,
 108333,
 72103,
 147926,
 189012,
 102474,
 166825,
 251684,
 103809,
 207377,
 93407,
 240571,
 242190]

In [55]:
model_ranks,als_ranks = [],[]
start = 0
for step in tqdm(steps):
    df_ = val_df.iloc[start:start+step]
    start += step
    als_ranks += [df_.sort_values(by='ranks')['video_id'][::-1][:10].tolist()]
    model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]

  0%|          | 0/20000 [00:00<?, ?it/s]

  als_ranks += [df_.sort_values(by='ranks')['video_id'][::-1][:10].tolist()]
  model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]


In [56]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])


In [57]:
mapk(group_val,model_ranks,k=10)

0.0002357142857142857

In [58]:
#### INFERENSE

In [43]:
test_predicts = pd.read_parquet("TFIDF_preds_test.parquet")

In [44]:
sample_sub = pd.read_csv('TFIDFV2.csv')['user_id']
test_predicts['user_id'] = sample_sub.tolist()

In [45]:
df = create_data(test_predicts)

  0%|          | 0/152911 [00:00<?, ?it/s]

In [49]:
df

Unnamed: 0,user_id,ranks,video_id
0,938,7.127657,38943
1,938,6.951231,207669
2,938,6.607752,6
3,938,6.366234,18999
4,938,6.137554,137366
...,...,...,...
15287684,25414,5.495050,57234
15287685,25414,5.478948,117032
15287686,25414,5.473680,91451
15287687,25414,5.441230,176399


In [50]:
df = df.merge(user_data[['user_id','gender','age','language']])
df = df.merge(video_data)

In [51]:
df = df.sort_values(by='user_id')

In [52]:
predict_pool = Pool(data=df.drop(['city_id'],axis=1),
                    group_id=df['user_id'],
                    cat_features=cat_features)

In [53]:
df['scores'] = cbm.predict(predict_pool)

In [54]:
df

Unnamed: 0,user_id,ranks,video_id,gender,age,language,owner_id,duration,upload_timestamp,owner_count,...,owner_like_max,owner_like_min,owner_like_std,owner_count_sum,owner_count_mean,owner_count_max,owner_count_min,owner_count_std,ow_duration,scores
14912622,0,2.013695,12631,F,69.0,ru,1419,40,1697070638033,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.539649,-1.872947
14539730,0,2.006510,100123,F,69.0,ru,958,44,1691955973737,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,1.073780,-1.808733
14539404,0,1.819828,76688,F,69.0,ru,958,29,1687094139758,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.707719,-2.082201
14539318,0,2.314240,216068,F,69.0,ru,958,15,1697307412775,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.366061,-0.586644
14913163,0,1.926944,246406,F,69.0,ru,1419,29,1698580510370,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.391246,-1.669668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445559,152910,8.227917,34204,F,66.0,ru,1918,103,1694439379819,72,...,4146,0,488.645136,16547,229.819444,15272,1,1801.303240,1.665020,0.507385
10492759,152910,4.813787,201337,F,66.0,ru,7409,55,1695717010802,149,...,227,0,21.549617,10724,71.973154,2993,1,289.629468,0.552112,-1.121911
6733552,152910,5.735305,226671,F,66.0,ru,2871,15,1696522616923,138,...,736,0,104.690196,45401,328.992754,7895,1,984.763201,0.536408,-0.706107
1788719,152910,6.400785,166381,F,66.0,ru,4852,51,1695892816939,23,...,199,0,48.888556,32367,1407.260870,7893,1,2427.233895,0.278490,-1.670684


In [55]:
steps = df.groupby('user_id')['ranks'].agg('count').sort_index()

In [56]:
df

Unnamed: 0,user_id,ranks,video_id,gender,age,language,owner_id,duration,upload_timestamp,owner_count,...,owner_like_max,owner_like_min,owner_like_std,owner_count_sum,owner_count_mean,owner_count_max,owner_count_min,owner_count_std,ow_duration,scores
14912622,0,2.013695,12631,F,69.0,ru,1419,40,1697070638033,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.539649,-1.872947
14539730,0,2.006510,100123,F,69.0,ru,958,44,1691955973737,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,1.073780,-1.808733
14539404,0,1.819828,76688,F,69.0,ru,958,29,1687094139758,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.707719,-2.082201
14539318,0,2.314240,216068,F,69.0,ru,958,15,1697307412775,86,...,33,0,5.572210,5540,64.418605,674,1,140.054062,0.366061,-0.586644
14913163,0,1.926944,246406,F,69.0,ru,1419,29,1698580510370,900,...,43,0,3.225457,13607,15.118889,260,1,26.669157,0.391246,-1.669668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445559,152910,8.227917,34204,F,66.0,ru,1918,103,1694439379819,72,...,4146,0,488.645136,16547,229.819444,15272,1,1801.303240,1.665020,0.507385
10492759,152910,4.813787,201337,F,66.0,ru,7409,55,1695717010802,149,...,227,0,21.549617,10724,71.973154,2993,1,289.629468,0.552112,-1.121911
6733552,152910,5.735305,226671,F,66.0,ru,2871,15,1696522616923,138,...,736,0,104.690196,45401,328.992754,7895,1,984.763201,0.536408,-0.706107
1788719,152910,6.400785,166381,F,66.0,ru,4852,51,1695892816939,23,...,199,0,48.888556,32367,1407.260870,7893,1,2427.233895,0.278490,-1.670684


In [57]:
steps = steps.tolist()

In [58]:
model_ranks,als_ranks = [],[]
start = 0
for step in tqdm(steps):
    df_ = df.iloc[start:start+step]
    start += step
    als_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]
    model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]

  0%|          | 0/152911 [00:00<?, ?it/s]

  als_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]
  model_ranks += [df_.sort_values(by='scores')['video_id'][::-1][:10].tolist()]


In [59]:
sample_sub = pd.read_csv('TFIDFV2.csv')

In [60]:
sample_sub = sample_sub.sort_values(by='user_id')

In [61]:
predicted_cols=[' '.join(map(str, i)) for i in model_ranks]

In [62]:
sample_sub['recommendation'] = predicted_cols

In [68]:
sample_sub['count_tr'] = sample_sub['recommendation'].map(lambda x: len(x.split()))

In [64]:
sample_sub[sample_sub['count_tr'] != 10]

Unnamed: 0,user_id,recommendation,count_tr
48959,20954,229566 35632 239035 103098 242750 169983 195428,7
98527,22025,252568 222751 16701 74601 180173 143427,6
144672,55148,149457 137219 79672 47517 141781 251129 24127,7
98927,78973,29784 120362,2


In [65]:
recs = sample_sub['recommendation']
recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[48959] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/p

In [66]:
sample_sub['recommendation'] = recs

In [69]:
sample_sub['count_tr'].mean()

10.0

In [61]:
sample_socres = pd.read_csv('Simple_catboost.csv')
sample_socres

Unnamed: 0,user_id,recommendation
0,0,180158 223373 187951 25005 185565 46747 78973 ...
1,1,131005 34204 169519 166282 54812 137447 51799 ...
2,2,121660 51358 189727 111331 131005 132060 24524...
3,3,123953 137447 207669 38943 169519 166825 24547...
4,4,34204 169519 6986 13009 89274 238556 133062 18...
...,...,...
152906,152906,131005 34204 166282 217384 241470 169519 54812...
152907,152907,131005 169519 137447 34204 188589 166825 77671...
152908,152908,134995 124778 69105 5380 28917 16658 145482 69...
152909,152909,235932 166825 188589 144081 38943 34204 131005...


In [71]:
sample_sub[['user_id','recommendation']].to_csv('Simple_catboostV4.csv',index=False)

In [70]:
sample_sub

Unnamed: 0,user_id,recommendation,count_tr
55869,0,112870 17524 136525 139933 54741 50973 131005 ...,10
40655,1,131005 63190 24671 57234 26721 34204 166282 16...,10
51407,2,101082 179468 148896 81866 111331 131005 57234...,10
149226,3,57234 240476 131005 169519 34204 137366 252647...,10
19366,4,8687 202971 76442 249740 238658 6986 86252 246...,10
...,...,...,...
13880,152906,57234 63190 26721 24671 131005 91466 166282 18...,10
83396,152907,210711 101082 50148 57234 240476 706 131005 60...,10
34792,152908,160838 91466 13740 115913 149902 249196 145482...,10
108083,152909,127736 99967 210711 240476 101082 235932 50148...,10
