In [3]:
import numpy as np
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
import lightgbm as lgb
from tqdm.auto import tqdm
from catboost.text_processing import Tokenizer
from scipy.stats import ttest_ind, levene, shapiro
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from functools import partial 
from sklearn.base import BaseEstimator
from tqdm.auto import trange
import gc
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [4]:
train_data = pd.read_parquet("/kaggle/input/erutube/train.parquet")
test_data = pd.read_parquet("/kaggle/input/erutube/test.parquet")
video_stat = pd.read_parquet("/kaggle/input/erutube/video_stat.parquet")
sample_sub = pd.read_csv('/kaggle/input/erutube/sample.csv')

In [5]:
train_data = train_data.merge(video_stat,on='video_id')
test_data = test_data.merge(video_stat,on='video_id')

In [6]:
train_data['event_timestamp'] = pd.to_datetime(train_data['event_timestamp']).dt.tz_localize(None)
test_data['event_timestamp'] = pd.to_datetime(test_data['event_timestamp']).dt.tz_localize(None)

train_data['v_pub_datetime'] = pd.to_datetime(train_data['v_pub_datetime']).dt.tz_localize(None)
test_data['v_pub_datetime'] = pd.to_datetime(test_data['v_pub_datetime']).dt.tz_localize(None)

In [7]:
train_data['user_cnt'] = train_data[['user_id','v_pub_datetime']].groupby('user_id')['v_pub_datetime'].transform('count')
train_data = train_data.sort_values(by=['user_id','event_timestamp'])

In [8]:
train_data['index'] = range(len(train_data))
train_data['index'] -= train_data[['user_id','index']].groupby('user_id')['index'].transform('min')
train_idxes = (train_data['index'] >= 9)  & (train_data['user_cnt'] > 10) & (train_data['user_cnt'] < 20)

In [9]:
feature_data = train_data[~train_idxes]
feature_data['target'] = -100
feature_data.loc[(feature_data['v_duration'] <= 300),'target'] = (feature_data['watchtime'] > 30).astype(int)
feature_data.loc[(feature_data['v_duration'] > 300),'target'] = (feature_data['watchtime'] > feature_data['v_duration'] * 0.25).astype(int)

video_stat = feature_data[['video_id','target']].groupby('video_id')['target'].agg('mean').to_dict()
author_stat = feature_data[['author_id','target']].groupby('author_id')['target'].agg('mean').to_dict()
user_stat = feature_data[['user_id','target']].groupby('user_id')['target'].agg('mean').to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['target'] = -100


In [10]:
video_stat_dur = feature_data[['video_id','watchtime']].groupby('video_id')['watchtime'].agg('mean').to_dict()
author_stat_dur = feature_data[['author_id','watchtime']].groupby('author_id')['watchtime'].agg('mean').to_dict()
user_stat_dur = feature_data[['user_id','watchtime']].groupby('user_id')['watchtime'].agg('mean').to_dict()

In [11]:
author_mean_features = feature_data[
    ['author_id','v_cr_click_long_view_7_days',
     'v_cr_click_long_view_30_days']
].groupby('author_id').agg(['mean','std']).reset_index()

In [12]:
train_data['user_author'] = train_data['user_id'] + '@' + train_data['author_id']
feature_data['user_author'] = feature_data['user_id'] + '@' + feature_data['author_id']
user_author_stat = feature_data[['user_author','target']].groupby('user_author')['target'].agg('mean')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_data['user_author'] = feature_data['user_id'] + '@' + feature_data['author_id']


In [13]:
train_data = train_data.drop(['index'],axis=1)
train_data = train_data[train_idxes]

In [14]:
train_data['labels'] = -100
train_data.loc[(train_data['v_duration'] <= 300),'labels'] = (train_data['watchtime'] > 30).astype(int)
train_data.loc[(train_data['v_duration'] > 300),'labels'] = (train_data['watchtime'] > train_data['v_duration'] * 0.25).astype(int)
test_data['labels'] = -100

all_data = pd.concat([train_data, test_data],axis=0,ignore_index=True)
del train_data, test_data
gc.collect()

0

In [15]:
all_data['mean_video'] = all_data['video_id'].map(video_stat)
all_data['mean_user'] = all_data['user_id'].map(user_stat)
all_data['mean_author'] = all_data['author_id'].map(author_stat)

all_data['mean_video_dur'] = all_data['video_id'].map(video_stat_dur)
all_data['mean_user_dur'] = all_data['user_id'].map(user_stat_dur)
all_data['mean_author_dur'] = all_data['author_id'].map(author_stat_dur)

In [16]:
all_data['user_author_mean'] = all_data['user_author'].map(user_author_stat)

In [17]:
all_data = all_data.drop(['user_author'],axis=1)

In [18]:
all_data['user_id'] = all_data['user_id'].astype('category').cat.codes
all_data['region'] = all_data['region'].astype('category').cat.codes
all_data['city'] = all_data['city'].astype('category').cat.codes
all_data['video_id'] = all_data['video_id'].astype('category').cat.codes
all_data['author_id'] = all_data['author_id'].astype('category').cat.codes
gc.collect()

0

In [21]:
all_data = all_data.sort_values(by=['user_id','event_timestamp'])

In [24]:
all_data['index'] = range(len(all_data))
all_data['index'] -= all_data[['user_id','index']].groupby('user_id')['index'].transform('min')

In [57]:
all_data['shift_time'] = all_data['event_timestamp'].copy()
all_data['shift_time'] = (all_data['event_timestamp'].shift(-1) - all_data['shift_time']).dt.seconds
last_idxes = all_data[['user_id','index']].groupby('user_id').transform('max')
all_data.loc[all_data['index'] == last_idxes['index'], 'shift_time'] = None

In [63]:
all_data['video_dur_per_shift'] = all_data['v_duration'] / all_data['shift_time'] 
all_data['video_dur_del_shift'] = all_data['shift_time'] - all_data['v_duration'

In [72]:
def get_time_features(data):
    data['hour'] = data['event_timestamp'].dt.hour
    data['delta_time'] = (data['event_timestamp'] - data['v_pub_datetime']).dt.days
    data['time_session'] = (data['event_timestamp'] - data[['event_timestamp','user_id']].groupby('user_id')['event_timestamp'].transform('min')).dt.seconds
    return data

all_data = get_time_features(all_data)

In [73]:
def get_city_features(data):
    data['city_cnt'] = data[['city','event_timestamp']].groupby('city').transform('count')['event_timestamp']
    data['reg_cnt'] = data[['region','event_timestamp']].groupby('region').transform('count')['event_timestamp']
    gc.collect()
    return data

all_data = get_city_features(all_data)

In [74]:
def get_user_video_features(data):
    data['user_cnt'] = data[['user_id','event_timestamp']].groupby('user_id').transform('count')['event_timestamp']
    data['video_cnt'] = data[['video_id','event_timestamp']].groupby('video_id').transform('count')['event_timestamp']
    data['user_video_cnt'] = data[['user_id','video_id','event_timestamp']].groupby(['video_id','user_id']).transform('count')['event_timestamp']
    data['user_author_cnt'] = data[['user_id','author_id','event_timestamp']].groupby(['user_id','author_id']).transform('count')['event_timestamp']
    data['author_cnt'] = data[['author_id','event_timestamp']].groupby('author_id').transform('count')['event_timestamp']
    data['otn_user_author_cnt'] = data['user_author_cnt'] / data['user_cnt']
    gc.collect()
    return data

all_data = get_user_video_features(all_data)

In [75]:
def get_text_features(data):
    tqdm.pandas()
    data['title_len'] = data['title'].progress_apply(lambda x: len(x.split()))
    data['is_serial'] = data['title'].progress_apply(lambda x: x.lower().count('серия'))
    data['is_tv'] = data['title'].progress_apply(lambda x: x.lower().count('прямой эфир'))
    data['is_digit'] = data['title'].progress_apply(lambda x: any(x.count(i) for i in '0123456789'))
    data['description_len'] = data['description'].progress_apply(lambda x: len(x.split()))
    return data

all_data = get_text_features(all_data)

  0%|          | 0/1820212 [00:00<?, ?it/s]

  0%|          | 0/1820212 [00:00<?, ?it/s]

  0%|          | 0/1820212 [00:00<?, ?it/s]

  0%|          | 0/1820212 [00:00<?, ?it/s]

  0%|          | 0/1820212 [00:00<?, ?it/s]

In [76]:
params = {
    'iterations': 3000,
    'learning_rate': 0.05,
    'loss_function': 'Logloss',
    'max_depth': 5,
    'eval_metric':'F1',
    
    #'dictionaries': [
    #        'Word:min_token_occurrence=5',
    #        'BiGram:gram_order=2'
    #],
    'text_processing': [
        'NaiveBayes+Word|BoW+Word,BiGram|BM25+Word',
    ],
    #'auto_class_weights': 'Balanced',
    'task_type': 'GPU',
    'random_seed': 56
}

target_col = 'labels'#'watchtime'
drop_cols = [
    'event_timestamp',
    'user_id',
    'region',
    'city',
    'video_id',
    'v_pub_datetime',
    'title',
    'description',
    'author_id',
    #'labels'
    'watchtime'
]

cat_features = [
    'category_id'
]

text_features = None

In [77]:
train_df, test_df = train_test_split(all_data[all_data['labels'] != -100],test_size=0.1,random_state=5656,stratify=all_data[all_data['labels'] != -100]['labels'])

In [78]:
train_pool = Pool(
    data = train_df.drop(drop_cols+[target_col],axis=1),
    label = train_df['labels'],
    cat_features = cat_features,
    text_features=text_features
)

eval_pool = Pool(
    data = test_df.drop(drop_cols+[target_col],axis=1),
    label = test_df['labels'],
    cat_features = cat_features,
    text_features=text_features
)

In [79]:
cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.8711495	test: 0.8698864	best: 0.8698864 (0)	total: 7.53s	remaining: 6h 16m 33s
100:	learn: 0.8765284	test: 0.8752321	best: 0.8752986 (95)	total: 9.62s	remaining: 4m 36s
200:	learn: 0.8781354	test: 0.8766650	best: 0.8767663 (198)	total: 11.7s	remaining: 2m 43s
300:	learn: 0.8789053	test: 0.8769944	best: 0.8769944 (300)	total: 13.8s	remaining: 2m 3s
400:	learn: 0.8797949	test: 0.8775005	best: 0.8775351 (397)	total: 15.9s	remaining: 1m 42s
500:	learn: 0.8803789	test: 0.8777098	best: 0.8778375 (487)	total: 17.9s	remaining: 1m 29s
600:	learn: 0.8807665	test: 0.8780361	best: 0.8781274 (598)	total: 20s	remaining: 1m 19s
700:	learn: 0.8811084	test: 0.8780158	best: 0.8782120 (663)	total: 22s	remaining: 1m 12s
800:	learn: 0.8814731	test: 0.8781486	best: 0.8783212 (724)	total: 24.1s	remaining: 1m 6s
900:	learn: 0.8818864	test: 0.8780759	best: 0.8783212 (724)	total: 26.1s	remaining: 1m
1000:	learn: 0.8822304	test: 0.8778524	best: 0.8783212 (724)	total: 28.2s	remaining: 56.2s
1100:	lear

<catboost.core.CatBoostClassifier at 0x7ac95d98fcd0>

In [80]:
cbm.get_feature_importance(prettified=True)[:50]

Unnamed: 0,Feature Id,Importances
0,video_dur_per_shift,49.042013
1,video_dur_del_shift,9.313396
2,shift_time,7.506966
3,mean_video,6.385268
4,v_duration,3.819842
5,mean_user,3.080424
6,mean_user_dur,1.841142
7,user_video_cnt,1.779305
8,video_cnt,1.764867
9,v_cr_click_long_view_7_days,1.7123


In [81]:
test_pool = Pool(
    data = all_data[all_data['labels'] == -100].drop(drop_cols,axis=1),
    cat_features = cat_features,
    text_features = text_features
)

In [82]:
preds_pr = cbm.predict_proba(test_pool)[:,1]

In [83]:
sample_sub['target'] = [int(x >= 0.5) for x in preds_pr]

In [84]:
sample_sub.to_csv('vseros_subv32.csv',index=False)

In [85]:
all_data

Unnamed: 0,event_timestamp,user_id,region,city,video_id,watchtime,v_pub_datetime,v_total_comments,v_year_views,v_month_views,...,video_cnt,user_video_cnt,user_author_cnt,author_cnt,otn_user_author_cnt,title_len,is_serial,is_tv,is_digit,description_len
486080,2024-08-10 11:45:52,0,394,1120,71905,,2022-12-30 12:36:32,23,55123,3822,...,378,1,10,55477,1.0,10,1,0,True,43
486081,2024-08-10 11:53:05,0,394,1120,124062,,2022-12-30 12:36:32,6,40791,3636,...,342,1,10,55477,1.0,8,1,0,True,37
486082,2024-08-10 12:00:16,0,394,1120,36605,,2022-12-30 12:36:32,1,46662,8114,...,427,1,10,55477,1.0,11,1,0,True,64
486083,2024-08-10 12:07:24,0,394,1120,37436,,2022-12-30 12:36:32,2,44122,3548,...,428,1,10,55477,1.0,6,1,0,True,45
486084,2024-08-10 12:14:35,0,394,1120,75591,,2022-12-30 12:36:32,1,39655,2601,...,392,1,10,55477,1.0,7,1,0,True,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486075,2024-08-10 21:29:27,240505,228,1109,112889,974.0,2022-08-25 10:54:50,8,10162,3104,...,289,1,7,10297,1.0,9,0,0,False,0
486076,2024-08-10 21:49:51,240505,228,1109,162011,350.0,2022-08-25 10:53:13,9,8384,4348,...,292,2,7,10297,1.0,12,0,0,False,0
486077,2024-08-10 21:55:54,240505,228,1109,82160,481.0,2022-08-25 10:54:50,8,21185,13981,...,1021,2,7,10297,1.0,7,0,0,False,0
486078,2024-08-10 22:05:14,240505,228,1109,35788,190.0,2022-08-25 10:54:50,8,8319,2548,...,225,1,7,10297,1.0,8,0,0,False,0
