In [1]:
import pandas as pd
import catboost as cb

In [2]:
# Чтение feed_data из csv файла
feed_data = pd.read_csv(r'C:\Users\Alex\Desktop\data_for_training.csv')
print(f"Feed data shape: {feed_data.shape}")

Feed data shape: (15378483, 51)


In [3]:
feed_data.columns

Index(['user_id', 'post_id_x', 'action', 'target', 'gender', 'age', 'country',
       'city', 'exp_group', 'os', 'source', 'topic', 'word_count',
       'sentence_count', 'avg_word_length', 'punctuation_count', 'month',
       'day_of_week', 'hour_of_day', 'time_since_last_action', 'day_of_month',
       'year', 'user_views', 'user_likes', 'post_views', 'post_likes',
       '0_exp_group_likes', '0_exp_group_views', '1_exp_group_likes',
       '1_exp_group_views', '2_exp_group_likes', '2_exp_group_views',
       '3_exp_group_likes', '3_exp_group_views', '4_exp_group_likes',
       '4_exp_group_views', '5_exp_group_likes', '5_exp_group_views',
       '6_exp_group_likes', '6_exp_group_views', 'component_1', 'component_2',
       'component_3', 'component_4', 'component_5', 'component_6',
       'component_7', 'component_8', 'component_9', 'component_10',
       'post_id_y'],
      dtype='object')

In [4]:
cat_features  = ['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source', 'topic']

In [9]:
def sample_10_percent(group):
    frac = 0.5
    return group.sample(frac=frac)

feed_data = feed_data.groupby('user_id', group_keys=False).apply(sample_10_percent)

In [10]:
# split the data into train and test
from sklearn.model_selection import train_test_split

# data with selected features, top k with mutual information, without data leakage, timestamp, 'action' and 'text'
X = feed_data.drop(['action', 'target'], axis=1)
y = feed_data['target']

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Create a group ID based on the 'user_id' column
unique_user_ids = X_train['user_id'].unique()
group_id_dict = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
X_train['group_id'] = X_train['user_id'].map(group_id_dict)
X_test['group_id'] = X_test['user_id'].map(group_id_dict)

# Sort the train and test sets by 'group_id'
X_train = X_train.sort_values(by='group_id')
y_train = y_train.loc[X_train.index]

X_test = X_test.sort_values(by='group_id')
y_test = y_test.loc[X_test.index]

# Create train and test Pool objects with the 'group_id' column
from catboost import Pool

train_pool = Pool(X_train.drop(columns=['user_id']), y_train, group_id=X_train['group_id'])
test_pool = Pool(X_test.drop(columns=['user_id']), y_test, group_id=X_test['group_id'])

In [12]:
# Train the CatBoost model using PrecisionAt:top=5 evaluation metric
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           custom_metric='PrecisionAt:top=5',
                           eval_metric='PrecisionAt:top=5',
                           random_seed=42,
                           verbose=100)

model.fit(train_pool, eval_set=test_pool)

0:	learn: 0.1175552	test: 0.1034550	best: 0.1034550 (0)	total: 669ms	remaining: 11m 8s
100:	learn: 0.1697975	test: 0.1353045	best: 0.1353095 (99)	total: 45.1s	remaining: 6m 41s
200:	learn: 0.1708416	test: 0.1355491	best: 0.1355897 (197)	total: 1m 28s	remaining: 5m 51s
300:	learn: 0.1721442	test: 0.1356892	best: 0.1357224 (250)	total: 2m 11s	remaining: 5m 6s
400:	learn: 0.1735314	test: 0.1357740	best: 0.1357777 (398)	total: 2m 55s	remaining: 4m 22s
500:	learn: 0.1746147	test: 0.1356843	best: 0.1358281 (431)	total: 3m 40s	remaining: 3m 39s
600:	learn: 0.1760130	test: 0.1357814	best: 0.1358281 (431)	total: 4m 25s	remaining: 2m 56s
700:	learn: 0.1774541	test: 0.1357310	best: 0.1358367 (661)	total: 5m 10s	remaining: 2m 12s
800:	learn: 0.1787592	test: 0.1357482	best: 0.1358367 (661)	total: 5m 54s	remaining: 1m 28s
900:	learn: 0.1798928	test: 0.1357298	best: 0.1358367 (661)	total: 6m 39s	remaining: 43.9s
999:	learn: 0.1809405	test: 0.1357187	best: 0.1358367 (661)	total: 7m 24s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1ef65cf7820>

In [15]:
y_pred_proba = model.predict_proba(test_pool)[:, 1]
# Add the prediction probabilities to the test dataset
X_test['pred_proba'] = y_pred_proba

# Group by 'user_id' and find the top 5 predicted 'post_id' for each user
top_5_posts = X_test.groupby('user_id').apply(lambda x: x.nlargest(5, 'pred_proba')['post_id_x'])

In [16]:
top_5_posts

user_id          
200      63          6919
         45          4438
         0           5057
         41          1864
         31          1150
                     ... 
168551   15378364    1390
         15378350    2831
         15378343    6978
168552   15378468    3205
         15378439    5246
Name: post_id_x, Length: 763361, dtype: int64

In [18]:
def hitrate_at_k(top_k_posts, k=5):
    hits = 0
    user_data = top_k_posts.groupby(level=0).apply(list).to_dict()

    for user_id, top_k_posts in user_data.items():
        true_post_ids = X_test[(X_test['user_id'] == user_id) & (y_test == 1)]['post_id_x']
        hits += len(set(top_k_posts) & set(true_post_ids))

    return hits / len(y_test[y_test == 1])

hitrate_at_5 = hitrate_at_k(top_5_posts, k=5)
print("HitRate@5:", hitrate_at_5)


HitRate@5: 0.6447881156963307


In [None]:
# Save the model to a file
model.save_model("catboost_model.cbm")