In [1]:
import pandas as pd
import os
import sys
import pickle
import datetime
import time
import re
from sqlalchemy import create_engine
from sqlalchemy import text
import sqlalchemy
import joblib
from collections import Counter
import numpy as np

In [5]:
# These two switches are looked up in the process environment by the libraries
# that honour them, so assigning plain Python variables has no effect on them.
# Export them via os.environ (before the heavy ML imports in the next cell) and
# keep the original module-level names for anything that still reads them.
TOKENIZERS_PARALLELISM = False
TF_ENABLE_ONEDNN_OPTS = 0
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Lower-case tokens that tokenize_clean discards (it tests `t not in STOP_WORDS`).
# The grouping below is only for readability; the set contents are unchanged.
STOP_WORDS = {
    # English function words
    "the", "and", "or", "a", "an", "to", "of", "in", "on", "for", "with",
    "by", "at", "from", "as", "is", "are", "this", "that", "be",
    # generic video / channel vocabulary
    "tv", "collection", "episodes", "episode", "shorts", "kidsmania",
    # Cyrillic tokens
    "и", "ну", "не", "ссср",
    # short Romance-language words (presumably Spanish; verify if list is extended)
    "y", "un", "el", "de",
}

In [8]:
def tokenize_clean(text):
    """Split a string into lower-case alphanumeric tokens and drop noise tokens.

    Args:
        text: Raw string to tokenize. Any non-string value (``None``, a float
            ``NaN`` coming from a null DataFrame cell, ...) is treated as
            empty text.

    Returns:
        list[str]: Tokens in order of appearance, lower-cased, with pure-digit
        tokens and members of ``STOP_WORDS`` removed. Empty list for empty or
        non-string input.
    """
    # Null cells reach this function through Series.apply as None / float NaN,
    # which have no .lower(); yield no tokens instead of raising AttributeError.
    if not isinstance(text, str):
        return []
    # [^\W_]+ matches a run of Unicode letters/digits (underscore excluded).
    # The surrounding \b also demands a word boundary on both sides, and "_"
    # counts as a word character, so a run that touches an underscore does not
    # match at all (e.g. "foo_bar" yields no tokens).
    tokens = re.findall(r"\b[^\W_]+\b", text.lower(), flags=re.UNICODE)
    return [t for t in tokens if not t.isdigit() and t not in STOP_WORDS]

In [9]:
# The connection string is taken from the PG_DB2 environment variable; every
# occurrence of 'multdb' in it is replaced with 'youtube' before the engine is
# created, and a connection is opened immediately for the query cell below.
conn_string = os.environ['PG_DB2']
youtube_conn_string = conn_string.replace('multdb', 'youtube')
db = create_engine(youtube_conn_string)
conn = db.connect()

In [10]:
# SQL for the training set: reels joined to their channels through the
# intl_yt_channels2reels link table, returning reel id/name, channel name,
# cartoon, brand_id, product_id and the channel language.
# Filters:
#   * channel language is one of the listed labels (stored in Russian:
#     USA, English, Spanish, Arabic, Portuguese, Chinese);
#   * brand_id starts with 'BR';
#   * date is strictly before 2026-02-01 (hard-coded cutoff).
# NOTE(review): `date` is unqualified, so it is not visible from here which of
# the joined tables it belongs to — confirm against the schema.
req1 = '''select iyr.yt_reel_id, reel_name, yt_ch_name, cartoon, brand_id, product_id,language from intl_yt_reels iyr 
join intl_yt_channels2reels iyrcr 
on iyr.yt_reel_id = iyrcr.yt_reel_id 
join intl_yt_channels iyc 
on iyrcr.yt_ch_id = iyc.yt_ch_id 
where iyc."language" in ('США','Английский', 'Испанский', 'Арабский', 'Португальский','Китайский')
and brand_id like 'BR%' 
and date < '2026-02-01' '''

In [11]:
# Run the query and load the whole result into a DataFrame. `text` is
# sqlalchemy.text, which wraps the raw SQL string as an executable clause.
# The connection is closed in `finally` so it is released even when the query
# raises (previously a failing read left the connection open).
try:
    df_in = pd.read_sql(text(req1), conn)
finally:
    conn.close()

In [12]:
# Model input text = channel name + space + reel name.
# Either column may be null; a null would turn the concatenation into NaN and
# make tokenize_clean fail on `.lower()`. Filling with '' keeps whatever half
# of the name is present and leaves rows without nulls unchanged.
df_in['name'] = df_in.yt_ch_name.fillna('') + ' ' + df_in.reel_name.fillna('')
# Per-row list of cleaned tokens.
df_in["tokens"] = df_in["name"].apply(tokenize_clean)

In [13]:
# Features: each row's token list is joined back into one space-separated
# string, because the TF-IDF step consumes strings rather than token lists.
joined_tokens = df_in['tokens'].str.join(' ')
texts = joined_tokens.to_list()
# Targets: the brand id of each row, in the same row order as `texts`.
labels = df_in['brand_id'].to_list()

In [14]:
# Hold out 20% of the rows for testing. The split is stratified on the brand
# label and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=54321,
    stratify=labels,
)

In [15]:
def compute_smooth_weights(y, alpha=0.5):
    """Compute smoothed inverse-frequency class weights.

    Each class gets ``(N / count) ** alpha`` where ``N`` is the total number of
    labels, and all weights are then divided by the smallest one, so the most
    frequent class ends up with weight 1.0 and rarer classes with weights >= 1.

    Args:
        y: Iterable of hashable class labels (list, array, generator, ...).
        alpha: Smoothing exponent. 0 gives every class weight 1.0; 1 gives
            plain inverse-frequency weights; values in between dampen them.

    Returns:
        dict: Mapping ``label -> weight``. Empty dict when ``y`` has no labels.
    """
    counter = Counter(y)
    # No labels: there is no minimum to normalise by, so return an empty
    # mapping instead of failing inside min().
    if not counter:
        return {}
    # Total taken from the counts rather than len(y), so one-shot iterables
    # (which have no len) are accepted too; the value is identical for lists.
    N = sum(counter.values())
    weights = {cls: (N / count) ** alpha for cls, count in counter.items()}
    min_w = min(weights.values())
    return {k: v / min_w for k, v in weights.items()}
# Per-class weights derived from the training labels only.
class_weights_dict = compute_smooth_weights(y_train, alpha=0.5)
# Distinct training labels in sorted order; class_weights lists the weights in
# that same order.
classes = sorted(np.unique(y_train))
class_weights = list(map(class_weights_dict.__getitem__, classes))

In [24]:
#print([round(i, 3) for i in class_weights])

In [22]:
# Step 1: word-level TF-IDF over unigrams and bigrams. Terms seen in fewer
# than 2 documents or in more than 95% of them are dropped, and the vocabulary
# is capped at 50000 features.
tfidf_step = TfidfVectorizer(
    max_features=50000,
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 2),
    analyzer="word",
)

# Step 2: multi-class CatBoost classifier with the smoothed class weights.
# NOTE(review): class_weights is a plain list ordered by sorted label; this
# relies on CatBoost ordering the classes the same way — confirm, or pass the
# label->weight dict instead.
catboost_step = CatBoostClassifier(
    class_weights=class_weights,
    verbose=False,
    loss_function="MultiClass",
    learning_rate=0.05,
    depth=6,
    iterations=1000,
)

pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("model", catboost_step)])

pipeline.fit(X_train, y_train)


In [23]:
# Persist the fitted TF-IDF + CatBoost pipeline to the working directory.
# The pipeline was fitted on space-joined tokens produced by tokenize_clean,
# so text given to the reloaded pipeline needs the same preprocessing.
model_path = "pipeline_new.pkl"
joblib.dump(pipeline, model_path)

['pipeline_new.pkl']