In [1]:
import pandas as pd

In [2]:
from params import engine

# Tables that are read from the database in full.
tables = ['user_data', 'post_text_df']

# Table name -> DataFrame for every table loaded in this notebook.
dfs = {table: pd.read_sql(f'SELECT * FROM {table}', engine) for table in tables}

# feed_data is capped at 2.5M rows instead of being read in full.
# NOTE(review): LIMIT without ORDER BY — which rows are returned is left to
# the database, so the sample may differ between runs; confirm this is fine.
dfs['feed_data'] = pd.read_sql('SELECT * FROM feed_data LIMIT 2500000', engine)

In [3]:
# For each of the two fully loaded tables, remember which columns have the
# `object` dtype; these are the columns encoded in the next cell.
category = {
    name: dfs[name].select_dtypes(include='object').columns
    for name in ('user_data', 'post_text_df')
}

In [4]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# (table, column) -> fitted LabelEncoder. A single shared encoder is refit on
# every column, so all mappings but the last would be lost; keeping one per
# column lets the same category -> code mapping be reapplied to new data.
label_encoders = {}

for table in tables:
    for col in category[table]:
        if col == 'text':  # the raw text column is handled separately (TF-IDF)
            continue

        if dfs[table][col].nunique(dropna=False) <= 5:
            # Low-cardinality column: one-hot encode, dropping the first level.
            one_hot = pd.get_dummies(dfs[table][col], prefix=col, drop_first=True).astype(int)
            dfs[table] = pd.concat((dfs[table].drop(col, axis=1), one_hot), axis=1)
        else:
            # Higher-cardinality column: integer codes from a dedicated encoder.
            labelencoder = LabelEncoder()
            dfs[table][col] = labelencoder.fit_transform(dfs[table][col])
            label_encoders[(table, col)] = labelencoder

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Vectorize post texts with TF-IDF, keeping at most 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(dfs['post_text_df']['text'])

# Compress the TF-IDF features to 15 principal components.
# random_state is pinned because, for a matrix of this size, scikit-learn's
# PCA may pick a randomized SVD solver; without a seed the components (and
# every downstream feature) could differ from run to run.
pca = PCA(n_components=15, random_state=33)

# The sparse TF-IDF matrix is converted to a dense array before PCA.
pca_result = pca.fit_transform(tfidf_matrix.toarray())

# Wrap the PCA output in a DataFrame aligned with the posts' index.
pca_df = pd.DataFrame(
    data=pca_result,
    columns=[f'PC{i+1}' for i in range(pca_result.shape[1])],
    index=dfs['post_text_df'].index
)

# Replace the raw text column with its PCA components.
dfs['post_text_df'] = pd.concat([dfs['post_text_df'].drop('text', axis=1), pca_df], axis=1)


In [6]:
# Join post features onto feed events by post_id; the inner join keeps only
# rows whose post_id is present in both tables.
f_df = dfs['post_text_df'].merge(dfs['feed_data'], on='post_id', how='inner')

In [7]:
# Attach user attributes by user_id; the inner join keeps only rows whose
# user_id is present in both frames.
df = f_df.merge(dfs['user_data'], on='user_id', how='inner')
df.head(3)

Unnamed: 0,post_id,topic,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,user_id,action,target,gender,age,country,city,exp_group,os_iOS,source_organic
0,1,0,-0.126184,-0.235953,0.036757,-0.088046,-0.13795,0.001762,-0.032423,0.04753,...,65566,view,0,1,21,7,1806,2,1,0
1,2370,1,0.269737,-0.015615,0.10811,-0.100946,-0.025815,0.14955,0.218116,-0.101648,...,65566,view,0,1,21,7,1806,2,1,0
2,2376,1,0.279955,0.012463,-0.050199,-0.027545,-0.022602,-0.09094,-0.066211,-0.029931,...,65566,view,1,1,21,7,1806,2,1,0


In [8]:
# Remove the rows whose action is 'like', then drop the 'action' column,
# which is not used as a feature.
df = df.loc[df['action'] != 'like'].drop(columns='action')

In [9]:
# Timestamp-derived features (month/day/hour/minute) are currently disabled;
# the commented-out code is kept for reference.
# df['timestamp'] = pd.to_datetime(df['timestamp'])

# df['month'] = df['timestamp'].dt.month
# df['day'] = df['timestamp'].dt.day
# df['hour'] = df['timestamp'].dt.hour
# df['minute'] = df['timestamp'].dt.minute

# timestamp = df['timestamp']

# The raw timestamp column is removed from the frame.
df = df.drop(columns='timestamp')

In [10]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,post_id,topic,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC15,user_id,target,gender,age,country,city,exp_group,os_iOS,source_organic
0,1,0,-0.126184,-0.235953,0.036757,-0.088046,-0.13795,0.001762,-0.032423,0.04753,...,0.125956,65566,0,1,21,7,1806,2,1,0
1,2370,1,0.269737,-0.015615,0.10811,-0.100946,-0.025815,0.14955,0.218116,-0.101648,...,-0.039025,65566,0,1,21,7,1806,2,1,0
2,2376,1,0.279955,0.012463,-0.050199,-0.027545,-0.022602,-0.09094,-0.066211,-0.029931,...,-0.024126,65566,1,1,21,7,1806,2,1,0
4,54,0,-0.001027,-0.138294,-0.075782,-0.092835,-0.087551,-0.015689,-0.013412,0.012524,...,0.035853,65566,0,1,21,7,1806,2,1,0
5,2386,1,0.334904,-0.036378,0.023209,-0.066104,0.007943,0.015516,-0.055503,-0.090571,...,0.032329,65566,0,1,21,7,1806,2,1,0


In [11]:
from sklearn.model_selection import train_test_split

# The label is the 'target' column; the identifier columns and the label
# itself are excluded from the feature matrix.
y = df['target']
X = df.drop(columns=['post_id', 'user_id', 'target'])

# Shuffled 70/30 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on y — consider whether the
# class imbalance warrants `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=33,
    shuffle=True,
)

In [12]:
# Number of training rows per class.
class_counts = y_train.value_counts()
print(class_counts)

# Share of the training set taken by each class.
total = y_train.shape[0]
class_ratios = {label: n_rows / total for label, n_rows in class_counts.items()}
print("Распределение классов:", class_ratios)

0    1377273
1     186380
Name: target, dtype: int64
Распределение классов: {0: 0.8808047565540437, 1: 0.11919524344595636}


In [13]:
# Inverse-frequency weight per class: total rows / rows in that class.
# value_counts() orders its result by frequency, not by class label, so the
# counts are re-ordered by label first; position i of the list then always
# belongs to the i-th class label, whichever class happens to be the majority.
class_weights = [total / count for count in class_counts.sort_index()]

print("Веса классов:", class_weights)

Веса классов: [1.135325385744148, 8.389596523232106]


In [14]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Gradient-boosting classifier with the class weights computed above.
# NOTE(review): early_stopping_rounds is set but fit() below receives no
# evaluation set — confirm whether early stopping can take effect here.
catboost_classifier = CatBoostClassifier(
    max_depth=10,
    iterations=1100,
    l2_leaf_reg=3,
    learning_rate=0.03,
    class_weights=class_weights,
    eval_metric='Precision',
    early_stopping_rounds=100,
    verbose=10,
)

# Two-step pipeline: standardize the features, then fit the classifier.
pipe_catboost = Pipeline([
    ('scaler', StandardScaler()),
    ('catboost', catboost_classifier),
])

pipe_catboost.fit(X_train, y_train)

0:	learn: 0.5663743	total: 335ms	remaining: 6m 8s


KeyboardInterrupt: 

In [15]:
from sklearn.metrics import classification_report


# Decision threshold applied to the predicted probability of class 1.
threshold = 0.58

# Hold-out set: positive-class probabilities -> hard 0/1 predictions.
y_test_probs = pipe_catboost.predict_proba(X_test)[:, 1]
y_test_preds_catboost = (y_test_probs >= threshold).astype(int)

# Training set: same procedure, for comparison against the hold-out scores.
y_train_probs = pipe_catboost.predict_proba(X_train)[:, 1]
y_train_preds_catboost = (y_train_probs >= threshold).astype(int)

# Reports are printed for the test set first, then the training set.
for y_true, y_pred in ((y_test, y_test_preds_catboost), (y_train, y_train_preds_catboost)):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.79      0.85    590578
           1       0.21      0.41      0.28     79788

    accuracy                           0.75    670366
   macro avg       0.56      0.60      0.56    670366
weighted avg       0.83      0.75      0.78    670366

              precision    recall  f1-score   support

           0       0.92      0.80      0.85   1378523
           1       0.23      0.46      0.31    185663

    accuracy                           0.76   1564186
   macro avg       0.58      0.63      0.58   1564186
weighted avg       0.84      0.76      0.79   1564186



In [16]:
import os
import pickle

filename = 'catboost_model.pkl'

# Serialize the fitted pipeline. The context manager guarantees the file is
# flushed and closed before it is read back later in this cell.
with open(filename, 'wb') as model_file:
    pickle.dump(pipe_catboost, model_file)

def get_model_path(path: str) -> str:
    """Return the path the model should be loaded from.

    If the environment variable ``IS_LMS`` equals ``"1"``, the fixed path
    ``/workdir/user_input/model`` is returned; otherwise ``path`` is
    returned unchanged.
    """
    # Any value other than "1" (including an unset variable) means local run.
    if os.environ.get("IS_LMS") != "1":
        return path
    return "/workdir/user_input/model"


def load_models():
    """Load and return the pickled model.

    The file location is resolved by ``get_model_path`` starting from the
    default ``catboost_model.pkl``.

    Returns:
        The object stored in the pickle file.
    """
    model_path = get_model_path("catboost_model.pkl")
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    return model

# Bind the object returned by load_models() to `model`.
model = load_models()