# DATA PREPARING

In [22]:
from datetime import datetime
import pandas as pd
import random
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import joblib

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
!pip install catboost
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans



In [24]:
"""
users = 163205
posts = 7023
views = 68686455
""";

In [25]:
database = 
user = 
password = 
host = 
port = 
CONNECTION = f"postgresql://{user}:{password}@{host}:{port}/{database}"

In [26]:
limit = 1000000

all_data = pd.read_sql(
    f"""
    SELECT * FROM public.feed_data feed
    INNER JOIN public.user_data user_data ON feed.user_id = user_data.user_id 
    WHERE feed.action != 'like'
    LIMIT {limit};
    """,
    con=CONNECTION
)

post_data = pd.read_sql(
    """
    SELECT * FROM public.post_text_df
    """,
    con=CONNECTION
)

user_data = pd.read_sql(
    """
    SELECT * FROM public.user_data
    """,
    con=CONNECTION
)

In [27]:
wnl = WordNetLemmatizer()

def text_processing(text, wnl=wnl):
    text = text.lower()
    text = ' '.join([wnl.lemmatize(x) for x in text.split(' ')])
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r"\s{2,}", ' ', text)
    
    return text

def clustering(post_data, tfidf_data):
    len_ = post_data.topic.nunique()
    pca = PCA(n_components=20)
    pca_decomp = pca.fit_transform(tfidf_data - tfidf_data.mean())

    kmeans = KMeans(n_clusters=len_, random_state=42).fit(pca_decomp)
    names = [f'cluster_{i}' for i in range(len_)]

    clusters = pd.DataFrame(
        data = kmeans.transform(pca_decomp),
        columns = names
    )
    
    return clusters

def post_processing(post_data):
    post_data.text = post_data.text.apply(text_processing)
    post_data['text_len'] = post_data.text.apply(lambda x: len(x))
    
    tfidf = TfidfVectorizer()
    tfidf_data = tfidf.fit_transform(post_data.text).toarray()
    
    clusters = clustering(post_data, tfidf_data)
    post_data = pd.concat((post_data, clusters), axis=1)
    post_data = post_data.drop('text', axis=1)
    
    return post_data

In [28]:
post_data = post_processing(post_data)
post_data.to_sql('my_post_data', con=CONNECTION, index=False, if_exists='replace')
        
post_data.head()

Unnamed: 0,post_id,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6
0,1,business,1852,0.41604,0.285502,0.489037,0.372144,0.463269,0.427753,0.394889
1,2,business,2525,0.359564,0.2082,0.458162,0.283852,0.415407,0.231213,0.319756
2,3,business,3122,0.328082,0.18605,0.419581,0.295639,0.382085,0.382382,0.307203
3,4,business,969,0.279087,0.146943,0.308911,0.26617,0.358457,0.365116,0.325018
4,5,business,837,0.211583,0.172529,0.204834,0.212319,0.323416,0.351736,0.316022


In [29]:
def data_proccessing(all_data, categorical):
    all_encoding = pd.DataFrame()
    
    for col in categorical:
        if all_data[col].nunique() < 5:
            result = pd.get_dummies(all_data[col], drop_first=True, prefix=col)
            all_data = pd.concat((all_data, result), axis=1)
            all_data = all_data.drop(col, axis=1)
        else:
            group = all_data[[col, 'target']].groupby([col]).agg('mean').to_dict()['target']
            
            encoding = pd.DataFrame(group.values(), group.keys(), columns=['value'])
            encoding['column'] = col
            all_encoding = pd.concat((all_encoding, encoding), axis=0)
            
            scale = np.mean(list(group.values())) / 10
            all_data[col] = all_data[col].map(group) + np.random.normal(size=len(all_data[col]), scale=scale)
            
    all_encoding.to_sql('all_encoding', con=CONNECTION, index=True, if_exists='replace')        
    
    return all_data

In [30]:
all_data['month'] = all_data['timestamp'].dt.month
all_data['weekday'] = all_data['timestamp'].dt.weekday
all_data['hour'] = all_data['timestamp'].dt.hour
all_data['minute'] = all_data['timestamp'].dt.minute

time_data = all_data[['month', 'weekday', 'hour', 'minute']]
time_data.to_sql('my_time_data', con=CONNECTION, index=True, if_exists='replace')        

user_id = all_data.user_id.iloc[:, 0]
all_data = all_data.drop(['action', 'timestamp', 'user_id'], axis=1)

In [31]:
all_data = pd.merge(all_data, post_data, on='post_id')
all_data = pd.concat((all_data, user_id), axis=1)
all_data.head()

Unnamed: 0,post_id,target,gender,age,country,city,exp_group,os,source,month,...,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,user_id
0,6945,0,1,18,Russia,Nizhneudinsk,3,iOS,ads,12,...,movie,1117,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804
1,6945,0,1,23,Russia,Saint Petersburg,0,Android,organic,11,...,movie,1117,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804
2,6945,0,0,14,Russia,Moscow,1,iOS,organic,11,...,movie,1117,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804
3,6945,0,0,14,Russia,Moscow,1,iOS,organic,12,...,movie,1117,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804
4,6945,0,0,21,Russia,Belgorod,1,Android,organic,12,...,movie,1117,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804


In [32]:
categorical = ['country', 'city', 'exp_group', 'os', 'source', 'topic']
all_data = data_proccessing(all_data, categorical)
all_data.head()

Unnamed: 0,post_id,target,gender,age,country,city,exp_group,month,weekday,hour,...,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,user_id,os_iOS,source_organic
0,6945,0,1,18,0.08976,0.114677,0.09364,12,3,10,...,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804,1,0
1,6945,0,1,23,0.129612,0.161479,0.102299,11,3,17,...,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804,0,1
2,6945,0,0,14,0.113874,0.163763,0.147776,11,2,14,...,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804,1,1
3,6945,0,0,14,0.136626,0.161027,0.146363,12,6,20,...,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804,1,1
4,6945,0,0,21,0.1265,0.084384,0.139922,12,6,21,...,0.245891,0.282806,0.413108,0.278902,0.298565,0.376027,0.199577,65804,0,1


In [33]:
all_encoding = pd.read_sql(
    """
    SELECT * FROM public.all_encoding
    """, con=CONNECTION
)

In [34]:
def update_data(all_data, categorical):

    for col in categorical:
        all_data.loc[:, col] = all_data.loc[:, col].map(all_encoding[all_encoding['column'] == col]\
                                .set_index('index').to_dict()['value'])
        value = all_data[col].mode()[0]
        all_data[col] = all_data[col].fillna(value)
        
    return all_data

In [35]:
categorical = ['country', 'city', 'exp_group']
user_data['exp_group'] = user_data['exp_group'].astype(str)
user_data['os_iOS'] = (user_data['os'] == 'iOS').astype(int)
user_data['source_organic'] = (user_data['source'] == 'organic').astype(int)
user_data = user_data.drop(['os', 'source'], axis=1)
user_data = update_data(user_data, categorical)

In [36]:
all_encoding

Unnamed: 0,index,value,column
0,Azerbaijan,0.084878,country
1,Belarus,0.189608,country
2,Cyprus,0.099432,country
3,Estonia,0.113736,country
4,Finland,0.146704,country
...,...,...,...
763,entertainment,0.120932,topic
764,movie,0.122471,topic
765,politics,0.113373,topic
766,sport,0.124304,topic


In [37]:
categorical = ['topic']
post_data = update_data(post_data, categorical)

In [38]:
# CENTERING DATA

user_data.iloc[:, 1:] = StandardScaler().fit_transform(user_data.iloc[:, 1:])
post_data.iloc[:, 1:] = StandardScaler().fit_transform(post_data.iloc[:, 1:])

In [39]:
user_data.to_sql('my_user_data', con=CONNECTION, index=False, if_exists='replace')
post_data.to_sql('my_post_data', con=CONNECTION, index=False, if_exists='replace')

23

In [40]:
user_data.head(2)

Unnamed: 0,user_id,gender,age,country,city,exp_group,os_iOS,source_organic
0,200,0.902104,0.664568,-0.329883,0.698471,-1.077545,-0.734899,-0.777821
1,201,-1.108519,0.957562,-0.329883,0.885821,-1.039159,-0.734899,-0.777821


In [41]:
post_data.head(2)

Unnamed: 0,post_id,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6
0,1,-0.218296,0.520822,1.766706,-0.30501,1.228945,0.921282,1.511576,0.171627,0.79118
1,2,-0.218296,1.07816,1.073711,-1.295447,1.001209,-0.274666,0.975423,-2.346733,0.02171


In [42]:
all_data = all_data.drop(['post_id', 'user_id'], axis=1)
all_data.iloc[:, 1:] = StandardScaler().fit_transform(all_data.iloc[:, 1:])

In [43]:
all_data.head(2)

Unnamed: 0,target,gender,age,country,city,exp_group,month,weekday,hour,minute,...,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
0,0,0.906544,-0.895064,-1.154062,-0.11882,-0.950865,1.251898,0.003896,-0.888751,-0.838438,...,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,-0.490357
1,0,0.906544,-0.415498,0.346883,0.853076,-0.643327,0.025755,0.003896,0.529871,0.665887,...,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,-0.730583,2.03933


# Preparing Data for Random Person

In [44]:
id_ = 2024
time = datetime(2021,11,11)

In [45]:
post_data.head(1)

Unnamed: 0,post_id,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6
0,1,-0.218296,0.520822,1.766706,-0.30501,1.228945,0.921282,1.511576,0.171627,0.79118


In [46]:
df_user = pd.read_sql(
    f"""
    SELECT * FROM public.my_user_data u
    WHERE u.user_id = {id_}
    """,
    con=CONNECTION
).drop('user_id', axis=1)

In [47]:
df_user['month'] = time.month
df_user['weekday'] = time.weekday()
df_user['hour'] = time.hour
df_user['minute'] = time.minute

In [48]:
df_user = pd.concat([df_user] * len(post_data))
df_user = df_user.reset_index().drop('index', axis=1)
df_user = pd.concat((df_user, post_data), axis=1)
df_user = df_user.set_index('post_id')

In [49]:
df_user = df_user[['gender', 'age', 'country', 'city', 
                   'exp_group', 'month', 'weekday',
                   'hour', 'minute', 'topic', 'text_len', 
                   'cluster_0', 'cluster_1', 'cluster_2', 
                   'cluster_3', 'cluster_4', 'cluster_5', 
                   'cluster_6', 'os_iOS', 'source_organic'
                  ]]

In [50]:
df_user.head(2)

Unnamed: 0_level_0,gender,age,country,city,exp_group,month,weekday,hour,minute,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-1.108519,0.762233,-0.329883,0.698471,-1.077545,11,3,0,0,-0.218296,0.520822,1.766706,-0.30501,1.228945,0.921282,1.511576,0.171627,0.79118,1.360731,-0.777821
2,-1.108519,0.762233,-0.329883,0.698471,-1.077545,11,3,0,0,-0.218296,1.07816,1.073711,-1.295447,1.001209,-0.274666,0.975423,-2.346733,0.02171,1.360731,-0.777821


In [51]:
df_time = pd.read_sql(
    f"""
    SELECT * FROM public.my_time_data
    LIMIT 5000;
    """,
    con=CONNECTION
)

In [52]:
df_time.head(2)

Unnamed: 0,index,month,weekday,hour,minute
0,0,11,1,22,7
1,1,11,1,22,7


In [53]:
user_time = pd.DataFrame.from_dict(
    {'index': [len(df_time)], 
     'month': [time.month], 
     'weekday': [time.weekday()], 
     'hour': [time.hour], 
     'minute': [time.minute]}
)

In [54]:
df_time = pd.concat((df_time, user_time), axis=0)
df_time = StandardScaler().fit_transform(df_time.iloc[:, 1:])

In [55]:
df_user[['month', 'weekday', 'hour', 'minute']] = df_time[-1]

In [56]:
df_user.head()

Unnamed: 0_level_0,gender,age,country,city,exp_group,month,weekday,hour,minute,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,0.520822,1.766706,-0.30501,1.228945,0.921282,1.511576,0.171627,0.79118,1.360731,-0.777821
2,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,1.07816,1.073711,-1.295447,1.001209,-0.274666,0.975423,-2.346733,0.02171,1.360731,-0.777821
3,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,1.572559,0.687403,-1.579254,0.716623,-0.115016,0.602152,-0.409742,-0.106848,1.360731,-0.777821
4,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,-0.210426,0.086207,-2.080318,-0.099701,-0.514185,0.337472,-0.630977,0.075599,1.360731,-0.777821
5,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,-0.31974,-0.742106,-1.752488,-0.867388,-1.24362,-0.055057,-0.802418,-0.016528,1.360731,-0.777821


# MODEL TRAINING

In [57]:
X = all_data.drop(['target'], axis=1)
y = all_data['target']

In [58]:
X.head()

Unnamed: 0,gender,age,country,city,exp_group,month,weekday,hour,minute,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
0,0.906544,-0.895064,-1.154062,-0.11882,-0.950865,1.251898,0.003896,-0.888751,-0.838438,-1.202894,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,-0.490357
1,0.906544,-0.415498,0.346883,0.853076,-0.643327,0.025755,0.003896,0.529871,0.665887,0.135868,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,-0.730583,2.03933
2,-1.10309,-1.278717,-0.245865,0.900497,0.971965,0.025755,-0.492353,-0.07811,0.145159,1.405127,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,2.03933
3,-1.10309,-1.278717,0.61108,0.843682,0.921768,1.251898,1.492642,1.137852,-1.127731,1.627725,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,2.03933
4,-1.10309,-0.607324,0.229698,-0.747899,0.692993,1.251898,1.492642,1.340513,0.955181,2.343881,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,-0.730583,2.03933


In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [65]:
model1 = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
model2 = CatBoostClassifier(random_state=42, depth=4, iterations=1000, verbose=0)

pipe1_grid = {
    'C': [0.001, 0.01, 0.5, 1, 2, 5, 10, 30],
}

pipe2_grid = {
    'depth': [2, 4, 6],
    'iterations': [100, 300],
    'l2_leaf_reg': [5, 10, 15],
}

pipes_grid = [pipe1_grid, pipe2_grid]
names = ['Logistic Regression', 'CatBoost']

In [66]:
def model_train(model, X_train, y_train, X_test, y_test, grid=None):
    model_default = model
    model_default.fit(X_train, y_train)
    predict = model_default.predict_proba(X_test)[:, 1]
    print(f'Standart model ROC_AUC = {roc_auc_score(y_test, predict)}')

    model_grid = model
    
    clf = GridSearchCV(
    model_grid,
    grid,
    scoring='roc_auc',
    cv=5
)
    clf.fit(X_train, y_train)
    best_model = clf.best_estimator_
    clf_predict = best_model.predict_proba(X_test)[:, 1]
    clf_predict_train = best_model.predict_proba(X_train)[:, 1]
    print(f'Best model ROC_AUC = {roc_auc_score(y_test, clf_predict)}')
    print(f'Best model ROC_AUC train = {roc_auc_score(y_train, clf_predict_train)}')
    
    return [model, best_model]

In [67]:
all_default_models = []
all_best_models = []

for id, model in enumerate([model1, model2]):
    print(f'\t{names[id]}')
    model, best_model = model_train(model, X_train, y_train, X_test, y_test, pipes_grid[id])
    
    all_default_models.append(model)
    all_best_models.append(best_model)
    print()

	Logistic Regression
Standart model ROC_AUC = 0.6407371298139298
Best model ROC_AUC = 0.640752606271174
Best model ROC_AUC train = 0.6408918925955729

	CatBoost
Standart model ROC_AUC = 0.6760477532224514
Best model ROC_AUC = 0.6664723933755052
Best model ROC_AUC train = 0.6694857568562579



# SAVE MODEL

In [69]:
joblib.dump(all_default_models[0], 'LR.pkl')
joblib.dump(all_default_models[1], 'CB.pkl')

['CB.pkl']

In [70]:
joblib.dump(all_best_models[0], 'LR_best.pkl')
joblib.dump(all_best_models[1], 'CB_best.pkl')

['CB_best.pkl']

# TEST MODEL

In [71]:
df_user.head()

Unnamed: 0_level_0,gender,age,country,city,exp_group,month,weekday,hour,minute,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,0.520822,1.766706,-0.30501,1.228945,0.921282,1.511576,0.171627,0.79118,1.360731,-0.777821
2,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,1.07816,1.073711,-1.295447,1.001209,-0.274666,0.975423,-2.346733,0.02171,1.360731,-0.777821
3,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,1.572559,0.687403,-1.579254,0.716623,-0.115016,0.602152,-0.409742,-0.106848,1.360731,-0.777821
4,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,-0.210426,0.086207,-2.080318,-0.099701,-0.514185,0.337472,-0.630977,0.075599,1.360731,-0.777821
5,-1.108519,0.762233,-0.329883,0.698471,-1.077545,0.093568,-0.162437,-2.766369,-1.69577,-0.218296,-0.31974,-0.742106,-1.752488,-0.867388,-1.24362,-0.055057,-0.802418,-0.016528,1.360731,-0.777821


In [72]:
model = joblib.load('CB_best.pkl')

In [73]:
preds = model.predict_proba(df_user)[:,1]

In [74]:
top_of_preds = pd.DataFrame(preds, df_user.index, columns=['value']).sort_values(by='value', ascending=False)
top_of_preds.head()

Unnamed: 0_level_0,value
post_id,Unnamed: 1_level_1
6770,0.234834
6696,0.233384
5387,0.233348
7191,0.233158
5656,0.232625


In [79]:
limit = 5
result = top_of_preds[:limit].index.values
result

array([6770, 6696, 5387, 7191, 5656], dtype=int64)

In [80]:
default_posts_data = pd.read_sql(
    """
    SELECT * FROM public.post_text_df
    """,
    con=CONNECTION
).rename({'post_id': 'id'}, axis=1)

In [81]:
all_result = []

for i in result:
    all_result.append(
        default_posts_data[default_posts_data.id == i]\
    .iloc[0,:]\
    .to_dict()
    )

In [82]:
all_result

[{'id': 6770,
  'text': 'Really bad. Why anyone thinks this is a good film let alone funny is a true mystery. I like comedies as much as the next man and I LOVED A Christmas Story. The fact that it has the same director and was based on the same writers memoirs has me completely puzzled as to why this film is such a complete failure on every level. Charles Grodin is woefully miscast as the father for starters. For another it does not seem to have the same pacing -- it just doesnt flow well. Everything seems tired and forced. The joy of life that permeated the first film is completely absent here -- you just want the movie to end. I wouldnt even recommend this movie for curiosity-seekers who enjoyed A Christmas Story. Its that bad. 1/10.',
  'topic': 'movie'},
 {'id': 6696,
  'text': 'I have seen romantic comedies and this is one of the easiest/worst attempts at one. A lot of the scenes work in a plug-and-play manner inserted strictly to conform to the romantic-comedy genre. Usually thi