In [1]:
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from catboost import Pool, CatBoostClassifier
import numpy as np
import re
from string import punctuation
from sklearn.metrics import roc_curve, auc

            +------------+     +---------------+     +-----------+
            | user_data  |     | post_text_df  |     | feed_data |
            +------------+     +---------------+     +-----------+
            | age        |     | id            |     | timestamp |
            | city       |     | text          |     | user_id   |
            | country    |     | topic         |     | post_id   |
            | exp_group  |     +---------------+     | action    |
            | gender     |           7,023           | target    |
            | id         |                           +-----------+
            | os         |                             76,892,800
            | source     |
            +------------+
                163,205 

In [2]:
# Folder that holds the three raw CSV files. Edit this single constant to point
# the notebook at another copy of the data instead of touching every read below.
DATA_DIR = 'C:\\Users\\Alex\\Desktop\\data'

user_data = pd.read_csv(f'{DATA_DIR}\\user_data.csv')  # user attributes
post_text_df = pd.read_csv(f'{DATA_DIR}\\post_text_df.csv')  # post text and topic
feed_data = pd.read_csv(f'{DATA_DIR}\\feed_data.csv')  # feed log: user/post interactions


In [3]:
# Rename the id columns to the names used by the feed log so the merges below
# can join on them.
user_data = user_data.rename(columns={'id': 'user_id'})
post_text_df = post_text_df.rename(columns={'id': 'post_id'})

# Build the "text_len" feature (length of each post text). The vectorised
# .str.len() yields NaN for a missing text, where apply(len) would raise TypeError.
post_text_df['text_len'] = post_text_df['text'].str.len()

# The raw text is no longer needed once its length has been extracted.
post_text_df = post_text_df.drop(columns='text')

# Drop the user attributes that are not carried into the merged table.
user_data = user_data.drop(columns=['country', 'city', 'exp_group', 'os', 'source', 'gender'])

# Category encoding for the topic column; `le` keeps the fitted label mapping.
le = LabelEncoder()
post_text_df['topic_encoded'] = le.fit_transform(post_text_df['topic'])

# Join the tables: attach user and post attributes to every feed row.
data = feed_data.merge(user_data, on='user_id', how='left')
data = data.merge(post_text_df, on='post_id', how='left')


In [4]:
# Split the feed rows by action type.
views_df = data.loc[data['action'] == 'view']
likes_df = data.loc[data['action'] == 'like']

# Number of 'view' rows and of 'like' rows recorded for every post.
views_per_post = views_df['post_id'].value_counts()
likes_per_post = likes_df['post_id'].value_counts()

# Align both counters on post_id; a post missing from one of them gets 0 there.
post_stats = pd.DataFrame({
    'views': views_per_post,
    'likes': likes_per_post
}).fillna(0)

# Share (in percent) of all likes that went to each post.
total_likes = post_stats['likes'].sum()
post_stats['like_percentage'] = post_stats['likes'] / total_likes * 100

# Turn the post_id index into a regular column. When the index is unnamed the
# new column is called 'index', hence the rename; otherwise the rename is a no-op.
post_stats = post_stats.reset_index().rename(columns={'index': 'post_id'})

# Left join: every row of the feed table is kept, even without a match in post_stats.
data = data.merge(post_stats, on='post_id', how='left')

In [5]:
# Preview the first two rows, including the views / likes / like_percentage columns.
data.head(2)

Unnamed: 0,timestamp,user_id,post_id,action,target,age,topic,text_len,topic_encoded,views,likes,like_percentage
0,2021-12-26 16:40:07,52431,6920,view,0,34,movie,1148,3,6610,717,0.008737
1,2021-12-26 16:42:42,52431,3840,view,0,34,covid,140,1,22496,2842,0.034632


In [6]:
# Count missing values per column to verify that the left merges matched every row.
data.isnull().sum()

timestamp          0
user_id            0
post_id            0
action             0
target             0
age                0
topic              0
text_len           0
topic_encoded      0
views              0
likes              0
like_percentage    0
dtype: int64

In [7]:
# Number of likes each user gave to each topic.

# Only 'like' rows contribute to the counts.
df_likes = data.loc[data['action'] == 'like']

# One row per user_id and one column per topic; a cell holds the like count
# (0 when the user never liked that topic). reset_index() makes user_id a
# regular column again so it can be used as the merge key.
pivot_df = df_likes.pivot_table(
    values='action',
    index='user_id',
    columns='topic',
    aggfunc='count',
    fill_value=0,
).reset_index()

# Attach the per-topic counts to every feed row of the user. Users with no
# 'like' rows are absent from pivot_df and therefore get NaN in these columns.
data = data.merge(pivot_df, how='left', on='user_id')

data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,age,topic,text_len,topic_encoded,views,likes,like_percentage,business,covid,entertainment,movie,politics,sport,tech
0,2021-12-26 16:40:07,52431,6920,view,0,34,movie,1148,3,6610,717,0.008737,0.0,4.0,1.0,7.0,0.0,1.0,0.0
1,2021-12-26 16:42:42,52431,3840,view,0,34,covid,140,1,22496,2842,0.034632,0.0,4.0,1.0,7.0,0.0,1.0,0.0
2,2021-12-26 16:45:27,52431,1716,view,0,34,sport,2360,5,22072,2807,0.034205,0.0,4.0,1.0,7.0,0.0,1.0,0.0
3,2021-12-26 16:46:39,52431,1054,view,0,34,politics,1908,4,12219,871,0.010614,0.0,4.0,1.0,7.0,0.0,1.0,0.0
4,2021-12-26 16:48:54,52431,963,view,0,34,politics,2489,4,22136,2804,0.034169,0.0,4.0,1.0,7.0,0.0,1.0,0.0


In [8]:
# Replace every remaining NaN in the table with 0 (for example the per-topic
# like counts of users that have no 'like' rows).
data = data.fillna(0)

In [9]:
# Mean of the 'likes' column over all rows of each topic, returned as a
# two-column frame: topic, average_likes.
topic_means_likes = (
    data.groupby('topic')['likes']
    .mean()
    .rename('average_likes')
    .reset_index()
)

# Drop the integer topic code column.
data = data.drop(columns='topic_encoded')


In [10]:
# Display the per-topic average of the 'likes' column.
topic_means_likes

Unnamed: 0,topic,average_likes
0,business,1216.625181
1,covid,1481.118654
2,entertainment,1200.533425
3,movie,1376.696927
4,politics,2196.697049
5,sport,2549.52247
6,tech,1014.975894


In [11]:
# Add the topic-level 'average_likes' value to every row of that topic.
data = data.merge(topic_means_likes, how='left', on='topic')

In [12]:
# Convert the timestamp column to Unix time (whole seconds since 1970-01-01 00:00:00).
# The dtype guard keeps the cell safe to re-run: once the column holds integers,
# pd.to_datetime would interpret them as nanoseconds since the epoch and the
# second pass would silently overwrite the column with wrong values.
# NOTE(review): the parsed values are used as-is, without timezone conversion;
# confirm that the log timestamps are in UTC.
if not pd.api.types.is_integer_dtype(data['timestamp']):
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    data['timestamp'] = (data['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [13]:
# Preview the first two rows to check the converted timestamp and the new average_likes column.
data.head(2)

Unnamed: 0,timestamp,user_id,post_id,action,target,age,topic,text_len,views,likes,like_percentage,business,covid,entertainment,movie,politics,sport,tech,average_likes
0,1640536807,52431,6920,view,0,34,movie,1148,6610,717,0.008737,0.0,4.0,1.0,7.0,0.0,1.0,0.0,1376.696927
1,1640536962,52431,3840,view,0,34,covid,140,22496,2842,0.034632,0.0,4.0,1.0,7.0,0.0,1.0,0.0,1481.118654


In [14]:
# Remove the 'action' and 'topic' columns from the feature table.
data = data.drop(columns=['action', 'topic'])

In [None]:
## Downsample the dataset
def sample_20_percent(group, frac=0.05, random_state=None):
    """Return a random sample of the rows of ``group``.

    Despite its name, the function does not necessarily sample 20%: the
    fraction is given by ``frac``, whose default is the value this cell used
    to hard-code.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Rows to sample from (one group when used with ``groupby(...).apply``).
    frac : float, default 0.05
        Fraction of the rows to keep.
    random_state : int or None, default None
        Seed forwarded to ``sample``; pass an integer for a reproducible draw.

    Returns
    -------
    Same type as ``group``
        The sampled rows, with their original index labels.
    """
    return group.sample(frac=frac, random_state=random_state)

# Keep 5% of the rows of every post and export them.
# GroupBy.sample performs the per-group draw directly and always keeps the
# post_id column, whereas groupby(...).apply(...) with a function that operates
# on the grouping column is deprecated in recent pandas releases.
# The fixed random_state makes the exported sample reproducible.
# (The variable name is historical: here it holds the 5% sample.)
data_10_percent = data.groupby('post_id').sample(frac=0.05, random_state=42)
data_10_percent.to_csv('data_5_percent.csv', sep=',', index=False)

In [None]:
# Save the complete (unsampled) feature table as a comma-separated file without the index.
data.to_csv('data_full_with_features.csv', index=False)

In [16]:
## Downsample the dataset
def sample_20_percent(group, frac=0.1, random_state=None):
    """Return a random sample of the rows of ``group``.

    Despite its name, the function does not necessarily sample 20%: the
    fraction is given by ``frac``, whose default is the value this cell used
    to hard-code.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Rows to sample from (one group when used with ``groupby(...).apply``).
    frac : float, default 0.1
        Fraction of the rows to keep.
    random_state : int or None, default None
        Seed forwarded to ``sample``; pass an integer for a reproducible draw.

    Returns
    -------
    Same type as ``group``
        The sampled rows, with their original index labels.
    """
    return group.sample(frac=frac, random_state=random_state)

# Keep 10% of the rows of every post and export them.
# GroupBy.sample performs the per-group draw directly and always keeps the
# post_id column, whereas groupby(...).apply(...) with a function that operates
# on the grouping column is deprecated in recent pandas releases.
# The fixed random_state makes the exported sample reproducible.
data_10_percent = data.groupby('post_id').sample(frac=0.1, random_state=42)
data_10_percent.to_csv('data_10_percent.csv', sep=',', index=False)



In [17]:
## Downsample the dataset
def sample_20_percent(group, frac=0.15, random_state=None):
    """Return a random sample of the rows of ``group``.

    Despite its name, the function does not necessarily sample 20%: the
    fraction is given by ``frac``, whose default is the value this cell used
    to hard-code.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Rows to sample from (one group when used with ``groupby(...).apply``).
    frac : float, default 0.15
        Fraction of the rows to keep.
    random_state : int or None, default None
        Seed forwarded to ``sample``; pass an integer for a reproducible draw.

    Returns
    -------
    Same type as ``group``
        The sampled rows, with their original index labels.
    """
    return group.sample(frac=frac, random_state=random_state)

# Keep 15% of the rows of every post and export them.
# GroupBy.sample performs the per-group draw directly and always keeps the
# post_id column, whereas groupby(...).apply(...) with a function that operates
# on the grouping column is deprecated in recent pandas releases.
# The fixed random_state makes the exported sample reproducible.
# (The variable name is historical: here it holds the 15% sample.)
data_10_percent = data.groupby('post_id').sample(frac=0.15, random_state=42)
data_10_percent.to_csv('data_15_percent.csv', sep=',', index=False)



In [18]:
## Downsample the dataset
def sample_20_percent(group, frac=0.20, random_state=None):
    """Return a random sample of the rows of ``group``.

    The fraction kept is ``frac``; its default of 0.20 matches the function
    name and the value this cell used to hard-code.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Rows to sample from (one group when used with ``groupby(...).apply``).
    frac : float, default 0.20
        Fraction of the rows to keep.
    random_state : int or None, default None
        Seed forwarded to ``sample``; pass an integer for a reproducible draw.

    Returns
    -------
    Same type as ``group``
        The sampled rows, with their original index labels.
    """
    return group.sample(frac=frac, random_state=random_state)

# Keep 20% of the rows of every post and export them.
# GroupBy.sample performs the per-group draw directly and always keeps the
# post_id column, whereas groupby(...).apply(...) with a function that operates
# on the grouping column is deprecated in recent pandas releases.
# The fixed random_state makes the exported sample reproducible.
# (The variable name is historical: here it holds the 20% sample.)
data_10_percent = data.groupby('post_id').sample(frac=0.20, random_state=42)
data_10_percent.to_csv('data_20_percent.csv', sep=',', index=False)