In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
### Забираем данные из таблиц

user_data = pd.read_sql(
    """SELECT * FROM public.user_data""",
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
)

post_text_df = pd.read_sql(
    """SELECT * FROM public.post_text_df""",
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
)

feed_data = pd.read_sql(
    """SELECT * FROM public.feed_data LIMIT 1000000""",
    con="postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
)

In [3]:
### Подготовим колонки к слиянию

user_data.rename(columns = {'id':'user_id'}, inplace = True)
post_text_df.rename(columns = {'id':'post_id'}, inplace = True)

In [4]:
### Мержим все таблицы

df = pd.merge(
    feed_data,
    user_data,
    on='user_id',
    how='left'
)

df = pd.merge(
    df,
    post_text_df,
    on='post_id',
    how='left'
)

In [5]:
### Избавимся от записей о лайках, поставленных после просмотра поста

df = df[df['action']=='view']
df = df.drop('action', axis=1)

In [6]:
### Применим One-Hot encoding для колонок с количеством категорий < 10

object_cols = list(df.drop('text', axis=1).select_dtypes(include='object').columns)

def one_hot_encoder(X):

    X = X.copy()

    for col in object_cols:
            if X[col].nunique() < 10:
                X = pd.concat([X, pd.get_dummies(X[col], drop_first=True)], axis=1)
                X = X.drop([col], axis=1)
                object_cols.remove(col)

    return X

df = one_hot_encoder(df)

In [7]:
df

Unnamed: 0,timestamp,user_id,post_id,target,gender,age,country,city,exp_group,source,text,iOS,covid,entertainment,movie,politics,sport,tech
0,2021-12-20 10:32:26,159995,3845,1,0,14,Ukraine,Makiivka,1,organic,Its airborne transmission.\n..I knew it all al...,False,True,False,False,False,False,False
2,2021-12-20 10:33:16,159995,5151,0,0,14,Ukraine,Makiivka,1,organic,Yeah i saw the rough cuts. The unedited sex sc...,False,False,False,True,False,False,False
3,2021-12-20 10:36:15,159995,4196,0,0,14,Ukraine,Makiivka,1,organic,So the government puts a bill before the house...,False,True,False,False,False,False,False
4,2021-12-20 10:37:39,159995,4829,1,0,14,Ukraine,Makiivka,1,organic,Actor Paxton made his directorial debut with t...,False,False,False,True,False,False,False
6,2021-12-26 22:18:48,159995,1551,1,0,14,Ukraine,Makiivka,1,organic,Wasps 31-37 Leicester\n\nLeicester withstood a...,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999993,2021-11-04 23:12:42,62698,1416,0,1,23,Russia,Torzhok,0,ads,Solskjaer raises hopes of return\n\nManchester...,True,False,False,False,False,True,False
999994,2021-11-04 23:14:04,62698,1296,1,1,23,Russia,Torzhok,0,ads,Lib Dems bold election policy\n\nCharles Kenne...,True,False,False,False,True,False,False
999996,2021-11-04 23:16:05,62698,1360,0,1,23,Russia,Torzhok,0,ads,MPs quiz aides over royal income\n\nSenior off...,True,False,False,False,True,False,False
999997,2021-11-04 23:17:14,62698,1473,0,1,23,Russia,Torzhok,0,ads,Henry tipped for Fifa award\n\nFifa president ...,True,False,False,False,False,True,False


In [8]:
### Обработаем некоторые признаки

def datetime_features(X):
    
    X = X.copy()
    X = X.sort_values('timestamp')

    ### Извлечем признаки из timestamp

    X['timestamp'] = pd.to_datetime(X['timestamp'])
    X['day_of_week'] = X.timestamp.dt.dayofweek
    X['hour'] = X.timestamp.dt.hour
    X = X.drop('timestamp', axis=1)

    return X

df = datetime_features(df)

In [9]:
df['text_size'] = df['text'].apply(lambda x: len(x))

In [10]:
### Разделим выборку на train-test

split = - df.shape[0] // 5

train = df.iloc[:split].copy()
test = df.iloc[split:].copy()

In [12]:
train

Unnamed: 0,user_id,post_id,target,gender,age,country,city,exp_group,source,text,iOS,covid,entertainment,movie,politics,sport,tech,day_of_week,hour,text_size
26383,140692,7124,0,1,14,Russia,Novomoskovsk,2,organic,"I saw this movie a few years ago, and man I ne...",True,False,False,True,False,False,False,4,6,779
826954,160124,6747,0,1,48,Russia,Barnaul,3,organic,"I grew up in Baltimore, so I was exposed to th...",False,False,False,True,False,False,False,4,6,914
107478,153321,4165,0,1,21,Russia,Razdol’noye,4,organic,People should be allowed to deal with this pan...,False,True,False,False,False,False,False,4,6,140
993272,50080,875,1,0,21,Russia,Uchaly,1,ads,Dutch watch Van Goghs last film\n\nThe last fi...,False,False,True,False,False,False,False,4,6,1489
26384,140692,4652,1,1,14,Russia,Novomoskovsk,2,organic,"1st watched 2/9/2008, 4 out of 10(Dir-J.S. Car...",True,False,False,True,False,False,False,4,6,1894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816781,127245,4284,0,0,17,Russia,Yekaterinburg,1,organic,David Chases The Sopranos is perhaps the most ...,True,False,False,True,False,False,False,6,9,3087
747362,140846,1915,0,0,25,Russia,Kizel,3,organic,Melzer shocks Agassi in San Jose\n\nSecond see...,False,False,False,False,False,True,False,6,9,1130
37845,140698,6359,0,0,26,Russia,Khabarovsk,4,organic,This isnt one of those reviews about poor spec...,True,False,False,True,False,False,False,6,9,1083
217237,112639,187,0,1,35,Russia,Orenburg,4,organic,Pernod takeover talk lifts Domecq\n\nShares in...,True,False,False,False,False,False,False,6,9,1562


In [11]:
### Mean-Target encoding с шумом для оставшихся категориальных колонок, посчитанный на трейне

def mean_target_encoder(X):
    
    X = X.copy()
    
    dict_of_means = dict()

    for col in object_cols:
        y_mean = X.groupby(col)['target'].mean()
        noise = 0.006 * np.random.rand(y_mean.shape[0])
        dict_of_means[col] = y_mean + noise

    return dict_of_means

dict_of_means = mean_target_encoder(train)

# for col in object_cols:
#     train[col] = train[col].map(dict_of_means[col])
#     test[col] = test[col].map(dict_of_means[col])

In [13]:
### Target encoder with prior smoothing.
### Источник: https://towardsdatascience.com/dealing-with-categorical-variables-by-using-target-encoder-a0f1733a4c69

def mean_encoding_smooth(df, col, target='target'):
    
    stats = df['target'].groupby(df[col]).agg(['count', 'mean'])

    smoothing_factor = 1.0 # The f of the smoothing factor equation 
    min_samples_leaf = 1 # The k of the smoothing factor equation
    prior = df['target'].mean()
    smoove = 1 / (1 + np.exp(-(stats['count'] - min_samples_leaf) / smoothing_factor))
    smoothing = prior * (1 - smoove) + stats['mean'] * smoove
    encoded = pd.Series(smoothing, name = f'{col}_encoded_complete')

    return encoded

for col in object_cols:
    encoded = mean_encoding_smooth(train, col, target='target')

    train[col] = train[col].map(encoded)
    test[col] = test[col].map(encoded)

In [15]:
train

Unnamed: 0,user_id,post_id,target,gender,age,country,city,exp_group,source,text,iOS,covid,entertainment,movie,politics,sport,tech,day_of_week,hour,text_size
26383,140692,7124,0,1,14,0.107236,0.046252,2,0.114241,"I saw this movie a few years ago, and man I ne...",True,False,False,True,False,False,False,4,6,779
826954,160124,6747,0,1,48,0.107236,0.138969,3,0.114241,"I grew up in Baltimore, so I was exposed to th...",False,False,False,True,False,False,False,4,6,914
107478,153321,4165,0,1,21,0.107236,0.049020,4,0.114241,People should be allowed to deal with this pan...,False,True,False,False,False,False,False,4,6,140
993272,50080,875,1,0,21,0.107236,0.145600,1,0.115372,Dutch watch Van Goghs last film\n\nThe last fi...,False,False,True,False,False,False,False,4,6,1489
26384,140692,4652,1,1,14,0.107236,0.046252,2,0.114241,"1st watched 2/9/2008, 4 out of 10(Dir-J.S. Car...",True,False,False,True,False,False,False,4,6,1894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816781,127245,4284,0,0,17,0.107236,0.088866,1,0.114241,David Chases The Sopranos is perhaps the most ...,True,False,False,True,False,False,False,6,9,3087
747362,140846,1915,0,0,25,0.107236,0.121896,3,0.114241,Melzer shocks Agassi in San Jose\n\nSecond see...,False,False,False,False,False,True,False,6,9,1130
37845,140698,6359,0,0,26,0.107236,0.154683,4,0.114241,This isnt one of those reviews about poor spec...,True,False,False,True,False,False,False,6,9,1083
217237,112639,187,0,1,35,0.107236,0.122285,4,0.114241,Pernod takeover talk lifts Domecq\n\nShares in...,True,False,False,False,False,False,False,6,9,1562


In [14]:
df

Unnamed: 0,user_id,post_id,target,gender,age,country,city,exp_group,source,text,iOS,covid,entertainment,movie,politics,sport,tech,day_of_week,hour,text_size
26383,140692,7124,0,1,14,Russia,Novomoskovsk,2,organic,"I saw this movie a few years ago, and man I ne...",True,False,False,True,False,False,False,4,6,779
826954,160124,6747,0,1,48,Russia,Barnaul,3,organic,"I grew up in Baltimore, so I was exposed to th...",False,False,False,True,False,False,False,4,6,914
107478,153321,4165,0,1,21,Russia,Razdol’noye,4,organic,People should be allowed to deal with this pan...,False,True,False,False,False,False,False,4,6,140
993272,50080,875,1,0,21,Russia,Uchaly,1,ads,Dutch watch Van Goghs last film\n\nThe last fi...,False,False,True,False,False,False,False,4,6,1489
26384,140692,4652,1,1,14,Russia,Novomoskovsk,2,organic,"1st watched 2/9/2008, 4 out of 10(Dir-J.S. Car...",True,False,False,True,False,False,False,4,6,1894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866973,140879,3923,0,1,36,Russia,Oktyabr’skiy,1,organic,@Alyssa_Milano You know what can stop fascism?...,True,True,False,False,False,False,False,2,23,140
268147,134078,5647,0,1,38,Russia,Kemerovo,4,organic,"This movie is a pathetic attempt, apparently, ...",True,False,False,True,False,False,False,2,23,1100
866974,140879,1264,0,1,36,Russia,Oktyabr’skiy,1,organic,Terror detainees win Lords appeal\n\nDetaining...,True,False,False,False,True,False,False,2,23,3531
268148,134078,3927,0,1,38,Russia,Kemerovo,4,organic,Handwashing continues to play a huge role in s...,True,True,False,False,False,False,False,2,23,139


In [724]:
### Заполним возможные пропуски в категориях популярным значением

test = test.fillna(train[object_cols].mode().iloc[0])

In [571]:
### Закодируем текстовые колонки  

# tfidf = TfidfVectorizer()
# tfidf.fit(train['text']) 

# def text_encoder(X):

#     X = X.copy()
    
#     text = pd.DataFrame(tfidf.transform(X['text']).T.todense(),
#                         index=tfidf.vocabulary_.keys())
#     X['text'] = text.mean().values

#     return X

# train = text_encoder(train)
# test = text_encoder(test)

### Съедает много памяти при больших размерах таблиц

train = train.drop('text', axis=1)
test = test.drop('text', axis=1)

In [736]:
X_train, y_train = train.drop('target', axis=1), train['target']
X_test, y_test = test.drop('target', axis=1), test['target']

In [737]:
from catboost import CatBoostClassifier

model = CatBoostClassifier()

model.fit(X_train, y_train)

Learning rate set to 0.170469
0:	learn: 0.5684457	total: 101ms	remaining: 1m 41s
1:	learn: 0.4886434	total: 181ms	remaining: 1m 30s
2:	learn: 0.4389208	total: 272ms	remaining: 1m 30s
3:	learn: 0.4072028	total: 362ms	remaining: 1m 30s
4:	learn: 0.3856849	total: 447ms	remaining: 1m 28s
5:	learn: 0.3717225	total: 527ms	remaining: 1m 27s
6:	learn: 0.3624868	total: 610ms	remaining: 1m 26s
7:	learn: 0.3567080	total: 689ms	remaining: 1m 25s
8:	learn: 0.3523991	total: 769ms	remaining: 1m 24s
9:	learn: 0.3494457	total: 858ms	remaining: 1m 24s
10:	learn: 0.3472780	total: 941ms	remaining: 1m 24s
11:	learn: 0.3459392	total: 1.04s	remaining: 1m 25s
12:	learn: 0.3438523	total: 1.13s	remaining: 1m 25s
13:	learn: 0.3432417	total: 1.21s	remaining: 1m 25s
14:	learn: 0.3422273	total: 1.3s	remaining: 1m 25s
15:	learn: 0.3414070	total: 1.38s	remaining: 1m 24s
16:	learn: 0.3410748	total: 1.46s	remaining: 1m 24s
17:	learn: 0.3407452	total: 1.54s	remaining: 1m 24s
18:	learn: 0.3404500	total: 1.63s	remaining: 

<catboost.core.CatBoostClassifier at 0x171cd8f56a0>

In [738]:
### Склеим матрицы с предсказанными вероятностями для дальнейшей оценки

train2 = train.copy()
test2 = test.copy()

train2['proba'] = model.predict_proba(X_train)[:, 1]
test2['proba'] = model.predict_proba(X_test)[:, 1]

In [739]:
### HitRate@k для одного пользователя (функция заимствована из ответа саппорта в дискорде)

def hit_rate_at_k(actual, predicted,k):
    if len(actual) == 0:
        return np.nan
    hits = 0
    for i in range(min(k, len(predicted))):
        if predicted[i] in actual:
            hits += 1
    
    return hits / len(actual)

In [740]:
### Среднее HitRate@5 по всем пользователям

def HitRate5(X):

    hit_rate_list = []

    for user in X['user_id'].unique():
        actual = X[(X['user_id']==user) & (X['target']==1)]['post_id'].values
        predicted = X[(X['user_id']==user)].sort_values(by='proba', ascending=False).head(5)['post_id'].values
        if len(actual) > 0:
            hit_rate_list.append(hit_rate_at_k(actual, predicted, 5))

    return np.mean(hit_rate_list)

In [741]:
HitRate5(train2)

0.070128867427394

In [742]:
HitRate5(test2)

0.105770306768307