Тут мы реализуем сервис записи фичей в новую таблицу для того чтобы не загружать основной алгоритм обработкой данных

# Загрузка фичей из базы данных

In [3]:
import pandas as pd
from sqlalchemy import create_engine


def load_and_merge_data(engine, chunksize=200000):
    # Чтение данных таблицы user_data
    query = "SELECT * FROM user_data"
    user_data = pd.read_sql(query, engine)
    print(f"User data shape: {user_data.shape}")

    # Чтение данных таблицы post_text_df
    query = "SELECT * FROM post_text_df"
    post_text_df = pd.read_sql(query, engine)
    print(f"Post text data shape: {post_text_df.shape}")

    # Чтение ограниченного количества данных таблицы feed_data
    query = f"SELECT * FROM feed_data"
    feed_data = batch_load_sql_timed(engine, query, chunksize)
    print(f"Feed data shape: {feed_data.shape}")

    # Переименование столбцов идентификаторов
    user_data = user_data.rename(columns={'id': 'user_id'})
    post_text_df = post_text_df.rename(columns={'id': 'post_id'})

    # Объединение таблиц
    data = feed_data.merge(user_data, on='user_id', how='left')
    data = data.merge(post_text_df, on='post_id', how='left')

    print(f"Data shape after load_and_merge_data: {data.shape}")

    return data


def batch_load_sql(engine, query: str, chunksize: int) -> pd.DataFrame:
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=chunksize):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)

import time

def batch_load_sql_timed(engine, query: str, chunksize: int) -> pd.DataFrame:
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    row_count = 0
    start_time = time.time()

    for chunk_dataframe in pd.read_sql(query, conn, chunksize=chunksize):
        chunks.append(chunk_dataframe)
        row_count += len(chunk_dataframe)
        print(f"Loaded {row_count} rows, elapsed time: {time.time() - start_time:.2f} seconds")

    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [4]:
import pandas as pd
from sqlalchemy import create_engine
import time

In [5]:
engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )

chunksize = 1000000

In [6]:
# Чтение данных таблицы user_data
query = "SELECT * FROM user_data"
user_data = pd.read_sql(query, engine)
print(f"User data shape: {user_data.shape}")

User data shape: (163205, 8)


In [7]:
# Чтение данных таблицы post_text_df
query = "SELECT * FROM post_text_df"
post_text_df = pd.read_sql(query, engine)
print(f"Post text data shape: {post_text_df.shape}")

Post text data shape: (7023, 3)


In [8]:
import time

def batch_load_sql_timed(engine, query: str, chunksize: int) -> pd.DataFrame:
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    row_count = 0
    start_time = time.time()

    for chunk_dataframe in pd.read_sql(query, conn, chunksize=chunksize):
        chunks.append(chunk_dataframe)
        row_count += len(chunk_dataframe)
        print(f"Loaded {row_count} rows, elapsed time: {time.time() - start_time:.2f} seconds")

    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [9]:
# Чтение ограниченного количества данных таблицы feed_data
query = "SELECT * FROM feed_data"
feed_data = batch_load_sql_timed(engine, query, chunksize)
print(f"Feed data shape: {feed_data.shape}")

Loaded 1000000 rows, elapsed time: 19.23 seconds
Loaded 2000000 rows, elapsed time: 38.49 seconds
Loaded 3000000 rows, elapsed time: 53.43 seconds
Loaded 4000000 rows, elapsed time: 74.20 seconds
Loaded 5000000 rows, elapsed time: 94.05 seconds
Loaded 6000000 rows, elapsed time: 109.64 seconds
Loaded 7000000 rows, elapsed time: 124.32 seconds
Loaded 8000000 rows, elapsed time: 142.20 seconds
Loaded 9000000 rows, elapsed time: 158.91 seconds
Loaded 10000000 rows, elapsed time: 178.88 seconds
Loaded 11000000 rows, elapsed time: 198.42 seconds
Loaded 12000000 rows, elapsed time: 214.21 seconds
Loaded 13000000 rows, elapsed time: 231.56 seconds
Loaded 14000000 rows, elapsed time: 251.92 seconds
Loaded 15000000 rows, elapsed time: 269.51 seconds
Loaded 16000000 rows, elapsed time: 290.21 seconds
Loaded 17000000 rows, elapsed time: 310.99 seconds
Loaded 18000000 rows, elapsed time: 331.12 seconds
Loaded 19000000 rows, elapsed time: 350.60 seconds
Loaded 20000000 rows, elapsed time: 371.48 se

In [None]:
# Чтение feed_data из csv файла
feed_data = pd.read_csv('feed_data.csv')

In [10]:
#save data to csv
import os

desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_path, "feed_data.csv")

feed_data.to_csv(file_path, index=False)


In [12]:
# Переименование столбцов идентификаторов
user_data = user_data.rename(columns={'id': 'user_id'})
post_text_df = post_text_df.rename(columns={'id': 'post_id'})

# Объединение таблиц
data = feed_data.merge(user_data, on='user_id', how='left')
data = data.merge(post_text_df, on='post_id', how='left')

print(f"Data shape after load_and_merge_data: {data.shape}")

Data shape after load_and_merge_data: (76892800, 14)


In [15]:
# print all data columns 
print(data.columns)

Index(['timestamp', 'user_id', 'post_id', 'action', 'target', 'gender', 'age',
       'country', 'city', 'exp_group', 'os', 'source', 'text', 'topic'],
      dtype='object')


In [18]:
def sample_10_percent(group):
    frac = 0.2
    return group.sample(frac=frac)

sampled_df = data.groupby('user_id', group_keys=False).apply(sample_10_percent)


In [17]:
# Подсчитайте количество строк для каждого пользователя в исходном dataframe
original_counts = data.groupby('user_id').size().reset_index(name='original_count')

# Подсчитайте количество строк для каждого пользователя в выборочном dataframe
sampled_counts = sampled_df.groupby('user_id').size().reset_index(name='sampled_count')

# Объедините два dataframe с подсчетом на столбце `user_id` для сравнения подсчета
counts_comparison = pd.merge(original_counts, sampled_counts, on='user_id')

# Рассчитайте процент строк в выборочном dataframe для каждого пользователя
counts_comparison['percentage'] = (counts_comparison['sampled_count'] / counts_comparison['original_count']) * 100

# Проверьте, равен ли процент около 10% для каждого пользователя
print(counts_comparison)

        user_id  original_count  sampled_count  percentage
0           200             401             40    9.975062
1           201             748             75   10.026738
2           202             724             72    9.944751
3           203             382             38    9.947644
4           204             161             16    9.937888
...         ...             ...            ...         ...
163200   168548             382             38    9.947644
163201   168549             274             27    9.854015
163202   168550             407             41   10.073710
163203   168551             525             52    9.904762
163204   168552             263             26    9.885932

[163205 rows x 4 columns]


# Обработка временных меток

adding year and

In [33]:
# Преобразование формата временных меток в объект datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Извлечение признаков из временных меток
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour_of_day'] = data['timestamp'].dt.hour

# Расчет времени с момента последнего действия для каждого пользователя
data = data.sort_values(['user_id', 'timestamp'])
data['time_since_last_action'] = data.groupby('user_id')['timestamp'].diff().dt.total_seconds()
data['time_since_last_action'].fillna(0, inplace=True)

# Extracting day of the month and year from the timestamp
data['day_of_month'] = data['timestamp'].dt.day
data['year'] = data['timestamp'].dt.year

# Удаление столбца временных меток
data = data.drop('timestamp', axis=1)

print('Timestamps processed')
print(f"Data shape after timestamps processing: {data.shape}")

Timestamps processed
Data shape after timestamps processing: (76892800, 18)


# Создание дополнительных признаков

In [None]:
# Feature 1: Количество просмотров и лайков для каждого пользователя
user_views_likes = data.groupby('user_id')['action'].value_counts().unstack().fillna(0)
user_views_likes.columns = ['user_views', 'user_likes']
data = data.merge(user_views_likes, on='user_id', how='left')

# Feature 2: Количество просмотров и лайков для каждого поста
post_views_likes = data.groupby('post_id')['action'].value_counts().unstack().fillna(0)
post_views_likes.columns = ['post_views', 'post_likes']
data = data.merge(post_views_likes, on='post_id', how='left')

# Feature 3: Количество просмотров и лайков для каждой группы тематик
temp_df = data[['exp_group', 'topic', 'action']]

# Создание колонок с количеством просмотров и лайков для каждой темы внутри группы
topic_action_count = temp_df.pivot_table(index='exp_group', columns=['topic', 'action'], aggfunc=len, fill_value=0)
topic_action_count.columns = [f'{col[0]}_exp_group_{col[1]}s' for col in topic_action_count.columns]
grouped_data = topic_action_count.reset_index()

data = data.merge(grouped_data, on='exp_group', how='left')

# Преобразование категориальных признаков в строковый формат
categorical_columns = ['country', 'city', 'topic', 'gender', 'os', 'source']
data[categorical_columns] = data[categorical_columns].astype(str)

print('Additional features created')
print(f"Data shape after additional features creation: {data.shape}")

In [2]:
data.head()

NameError: name 'data' is not defined

In [None]:
# print all columns of the dataframe
print(data.columns)

#  Truncated Singular Value Decomposition (Truncated SVD), also known as Latent Semantic Analysis (LSA).

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Take only 'text' and 'target' columns
df = data[['text', 'target']]

In [None]:
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

In [None]:
# Initialize SVD
n_components = 10
svd = TruncatedSVD(n_components=n_components, random_state=42)
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

In [None]:
reduced_df = pd.DataFrame(reduced_tfidf_matrix, columns=[f'component_{i}' for i in range(1, n_components+1)])
reduced_df['target'] = df['target']


In [None]:
#save data to csv
import os

desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_path, "SVD.csv")

reduced_df.to_csv(file_path, index=False)

# UPLOADING FEATURES TO DATABASE

In [24]:
#save data to csv
import os

desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_path, "data.csv")

data.to_csv(file_path, index=False)


In [19]:
import math
from tqdm import tqdm

def upload_dataframe_in_chunks(data, table_name, engine, chunksize=10000):
    total_chunks = math.ceil(len(data) / chunksize)
    for i in tqdm(range(total_chunks), desc=f"Uploading to {table_name}"):
        chunk = data[i * chunksize : (i + 1) * chunksize]
        if_exists = "replace" if i == 0 else "append"
        chunk.to_sql(table_name, con=engine, if_exists=if_exists, index=False, method="multi")

Uploading to a-efimik_features_lesson_22:   0%|          | 3/769 [23:49<100:32:03, 472.48s/it]

In [None]:
chunksize = 100000
upload_dataframe_in_chunks(data, "a-efimik_features_lesson_22", engine, chunksize=chunksize)

In [6]:
df = pd.read_sql('SELECT * FROM public.yancharskaya_features_lesson_22 LIMIT 1000', con=engine) # считываем таблицу

In [8]:
import pandas as pd

size_query = '''
SELECT 
    pg_size_pretty(pg_total_relation_size('public.yancharskaya_features_lesson_22')) AS total_size,
    pg_size_pretty(pg_relation_size('public.yancharskaya_features_lesson_22')) AS table_size,
    pg_size_pretty(pg_total_relation_size('public.yancharskaya_features_lesson_22') - pg_relation_size('public.yancharskaya_features_lesson_22')) AS indexes_size
FROM
    information_schema.tables
WHERE
    table_schema='public' AND table_name='yancharskaya_features_lesson_22';
'''

size_df = pd.read_sql(size_query, con=engine)
print(size_df)


  total_size table_size indexes_size
0     359 MB     321 MB        38 MB


In [9]:
dimensions_query = '''
SELECT
    COUNT(*) AS row_count,
    (SELECT COUNT(*)
     FROM information_schema.columns
     WHERE table_schema = 'public'
     AND table_name = 'yancharskaya_features_lesson_22') AS column_count
FROM
    public.yancharskaya_features_lesson_22;
'''

dimensions_df = pd.read_sql(dimensions_query, con=engine)
print(dimensions_df)


   row_count  column_count
0    1768926            20


In [7]:
df.head()

Unnamed: 0,index,user_id,post_id,gender,age,country,city,exp_group,source,day_of_week,hour,text_size,iOS,covid,entertainment,movie,politics,sport,tech,top_words
0,1232208,157937,5416,1,19,0.132464,0.102506,2,0.139363,3,8,631,0,0,0,1,0,0,0,0
1,1232209,3192,1156,0,19,0.195424,0.184388,3,0.139631,3,8,3080,1,0,0,0,1,0,0,0
2,1232210,16804,7087,0,19,0.132464,0.079235,1,0.139631,3,8,993,1,0,0,1,0,0,0,0
3,1232211,37226,7147,1,38,0.132464,0.176157,2,0.139631,3,8,5500,0,0,0,1,0,0,0,1
4,1232212,137700,5049,1,28,0.132464,0.129808,0,0.139363,3,8,428,0,0,0,1,0,0,0,1


In [20]:
df.shape()

TypeError: 'tuple' object is not callable

# Подготовка данных для инференса

In [None]:
# Убираем ненужные столбцы
X = data.drop(['target', 'action', 'text'], axis=1)

categorical_columns = ['country', 'topic', 'city', 'gender', 'os', 'source']

# Создание ID группы на основе столбца 'user_id'
unique_user_ids = X['user_id'].unique()
group_id_dict = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
X['group_id'] = X['user_id'].map(group_id_dict)

# Сортировка набора данных для предсказаний по 'group_id'
X = X.sort_values(by='group_id')

# Убедитесь, что категориальные переменные представлены в виде строк
X[categorical_columns] = X[categorical_columns].astype(str)



# Запись фичей в базу данных

# То что будет в сервисе

In [None]:
from catboost import Pool

## TODO: надо просто передать лист с индексами категориальных признаков
# Получение индексов категориальных столбцов
cat_features = [X.drop(columns=['user_id']).columns.get_loc(col) for col in categorical_columns]

# Создание объекта Pool для набора данных предсказаний с колонкой 'group_id' и категориальными признаками
prediction_pool = Pool(X.drop(columns=['user_id']), cat_features=cat_features, group_id=X['group_id'])