In [1]:
import pandas as pd
import catboost as cb

In [2]:
# Чтение feed_data из csv файла
feed_data = pd.read_csv(r'C:\Users\Alex\Desktop\data_for_training.csv')
print(f"Feed data shape: {feed_data.shape}")

Feed data shape: (15378483, 51)


In [3]:
# rename post_id_x to post_id 
feed_data = feed_data.rename(columns={'post_id_x': 'post_id'})

# drop post_id_y
feed_data = feed_data.drop('post_id_y', axis=1)

# drop action
feed_data = feed_data.drop('action', axis=1)

In [4]:
columns_to_remove = ['user_views', 'user_likes',
                     'component_2', '4_exp_group_views', '0_exp_group_likes', '6_exp_group_likes', 
                     '1_exp_group_views', '2_exp_group_views', 'exp_group', '0_exp_group_views', 
                     'component_4', '5_exp_group_views', '6_exp_group_views', 'component_10', 
                     'component_5', 'avg_word_length', 'component_1', 'component_3', 'component_6', 
                     'component_7', 'component_8', 'word_count', 'component_9', 'day_of_week', 
                     'sentence_count', 'punctuation_count', 'day_of_month', 'os', 
                     'source', 'year','1_exp_group_likes', '2_exp_group_likes',
                     '3_exp_group_likes', '3_exp_group_views', '4_exp_group_likes',
                     '5_exp_group_likes' , 'time_since_last_action']

feed_data = feed_data.drop(columns=columns_to_remove)

In [5]:
def sample_10_percent(group):
    frac = 0.2
    return group.sample(frac=frac)

feed_data = feed_data.groupby('user_id', group_keys=False).apply(sample_10_percent)

In [6]:
feed_data.columns

Index(['user_id', 'post_id', 'target', 'gender', 'age', 'country', 'city',
       'topic', 'month', 'hour_of_day', 'post_views', 'post_likes'],
      dtype='object')

In [7]:
cat_features  = ['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source', 'topic']

In [8]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

# Get a list of unique users
unique_users = feed_data['user_id'].unique()

# Split unique users into training and testing sets
train_users, test_users = train_test_split(unique_users, test_size=0.5, random_state=42)

X_train = feed_data[feed_data['user_id'].isin(train_users)]
X_train = X_train.drop('target', axis=1)

X_test = feed_data[feed_data['user_id'].isin(test_users)]
X_test = X_test.drop('target', axis=1)

y_train = feed_data[feed_data['user_id'].isin(train_users)]['target']
y_test = feed_data[feed_data['user_id'].isin(test_users)]['target']

# Create a group ID based on the 'user_id' column in the training set
group_id_dict_train = {user_id: idx for idx, user_id in enumerate(train_users)}
X_train['group_id'] = X_train['user_id'].map(group_id_dict_train)

# Create a group ID based on the 'user_id' column in the test set
group_id_dict_test = {user_id: idx for idx, user_id in enumerate(test_users)}
X_test['group_id'] = X_test['user_id'].map(group_id_dict_test)

# Sort the train and test sets by 'group_id'
X_train = X_train.sort_values(by='group_id')
y_train = y_train.loc[X_train.index]

X_test = X_test.sort_values(by='group_id')
y_test = y_test.loc[X_test.index]

# Create train and test Pool objects with the 'group_id' column
train_pool = Pool(X_train.drop(columns=['user_id']), y_train, group_id=X_train['group_id'])
test_pool = Pool(X_test.drop(columns=['user_id']), y_test, group_id=X_test['group_id'])


In [9]:
# Specify the training parameters 
params = {'loss_function': 'Logloss', # suitable for binary classification
          'custom_metric': ['AUC', 'NDCG'], # evaluation metrics
          'thread_count': 16,
          'verbose': 100,
          'random_seed': 42,
          'iterations': 1000,
          'learning_rate': 0.01,
          }

# Initialize CatBoostClassifier
model = CatBoostClassifier(**params)

# Fit model
model.fit(train_pool, eval_set=test_pool)

0:	learn: 0.6842021	test: 0.6842544	best: 0.6842544 (0)	total: 576ms	remaining: 9m 35s
100:	learn: 0.3627769	test: 0.3627011	best: 0.3627011 (100)	total: 41.9s	remaining: 6m 12s
200:	learn: 0.3303082	test: 0.3302401	best: 0.3302401 (200)	total: 1m 22s	remaining: 5m 27s
300:	learn: 0.3251785	test: 0.3251125	best: 0.3251125 (300)	total: 2m 3s	remaining: 4m 47s
400:	learn: 0.3236567	test: 0.3235858	best: 0.3235858 (400)	total: 2m 44s	remaining: 4m 5s
500:	learn: 0.3229098	test: 0.3228395	best: 0.3228395 (500)	total: 3m 24s	remaining: 3m 23s
600:	learn: 0.3224383	test: 0.3223708	best: 0.3223708 (600)	total: 4m 4s	remaining: 2m 42s
700:	learn: 0.3220757	test: 0.3220118	best: 0.3220118 (700)	total: 4m 44s	remaining: 2m 1s
800:	learn: 0.3217645	test: 0.3217019	best: 0.3217019 (800)	total: 5m 24s	remaining: 1m 20s
900:	learn: 0.3215188	test: 0.3214573	best: 0.3214573 (900)	total: 6m 4s	remaining: 40.1s
999:	learn: 0.3212975	test: 0.3212390	best: 0.3212390 (999)	total: 6m 44s	remaining: 0us

be

<catboost.core.CatBoostClassifier at 0x16d7f367c10>

In [10]:
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.drop(columns=['user_id']).columns

for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{name}: {score}')

age: 37.73608477896087
post_views: 17.37096487032175
post_likes: 14.827768820765382
city: 10.76735901325584
month: 6.931971983839355
country: 6.195213257891395
hour_of_day: 3.077188188443453
gender: 1.3535355344578608
post_id: 0.9229312087558498
topic: 0.81206534004215
group_id: 0.0049170032660802426


In [11]:
y_pred_proba = model.predict_proba(test_pool)[:, 1]
# Add the prediction probabilities to the test dataset
X_test['pred_proba'] = y_pred_proba

# Group by 'user_id' and find the top 5 predicted 'post_id' for each user
top_5_posts = X_test.groupby('user_id').apply(lambda x: x.nlargest(5, 'pred_proba')['post_id'])

In [12]:
top_5_posts

user_id          
200      39          6697
         37          6404
         25          6934
         30          6661
         23          5989
                     ... 
168552   15378472    3130
         15378473    5400
         15378474    6024
         15378466    6323
         15378471      49
Name: post_id, Length: 408015, dtype: int64

### HitRate@5 Evaluation Metric for Recommender System

This Python function calculates the HitRate@5 for a recommender system. HitRate@5 is a commonly used metric for evaluating the performance of a recommender system. It specifically measures the proportion of instances where at least one of the top 5 recommended items was liked by the user.

In [15]:
def hitrate_at_k(X_test, y_test, top_k_posts, k=5):
    hits = 0
    total_checks = 0

    # Get unique user_ids from the test set
    user_ids = X_test['user_id'].unique()

    for user_id in user_ids:
        # Get the list of top 5 predicted post_ids for this user
        top_k_pred = list(top_k_posts.loc[user_id])

        # Get the actual liked post_ids for this user
        true_post_ids = list(X_test[(X_test['user_id'] == user_id) & (y_test == 1)]['post_id'])

        # Increase the count of total checks
        total_checks += 1

        # If at least one of the top 5 recommended posts is liked by the user
        if set(top_k_pred) & set(true_post_ids):
            hits += 1

    # Calculate the hit rate as the proportion of checks where at least one of the top 5 recommended posts was liked
    hitrate_at_k = hits / total_checks if total_checks > 0 else 0

    return hitrate_at_k

# Use the function to calculate the HitRate@5
hitrate_at_5 = hitrate_at_k(X_test, y_test, top_5_posts, k=5)
print("HitRate@5:", hitrate_at_5)


HitRate@5: 0.6300993836010931


In [16]:
# Save the model to a file
model.save_model("catboost_model_1.cbm")

## Upload the data to SQL Server

In [7]:
inference_data = feed_data.drop('target', axis=1)

In [8]:
inference_data.to_csv(r'C:\Users\Alex\Desktop\data_for_inference2.csv', index=False)

In [9]:
from tqdm import tqdm
import math
import pandas as pd
from sqlalchemy import create_engine
import time

def upload_dataframe_in_chunks(data, table_name, engine, chunksize=10000):
    total_chunks = math.ceil(len(data) / chunksize)
    for i in tqdm(range(total_chunks), desc=f"Uploading to {table_name}"):
        chunk = data[i * chunksize : (i + 1) * chunksize]
        if_exists = "replace" if i == 0 else "append"
        chunk.to_sql(table_name, con=engine, if_exists=if_exists, index=False, method="multi")


engine = create_engine(
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
    "postgres.lab.karpov.courses:6432/startml"
)
chunksize = 100000
upload_dataframe_in_chunks(inference_data, "a-efimik_features_lesson_22_4", engine, chunksize=chunksize)

Uploading to a-efimik_features_lesson_22_4: 100%|██████████| 31/31 [26:31<00:00, 51.33s/it]
