In [40]:
import pandas as pd
import numpy as np
import os
import sys
import pickle
import datetime
import time
import re
from sqlalchemy import create_engine
from sqlalchemy import text
import sqlalchemy
import psycopg2

In [5]:
TOKENIZERS_PARALLELISM=False

In [37]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

In [38]:
from collections import Counter

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import joblib

In [9]:
TOKENIZERS_PARALLELISM=False

In [22]:
STOP_WORDS = { "the",
    "and", "or", "the", "a", "an", "to", "of", "in", "on", "for", "with",
    "by", "at", "from", "as", "is", "are", "this", "that", 'tv', 'collection',
    'episodes' ,'episode', 'и', 'ну', 'не', 'ссср','shorts', 'y', 'kidsmania',
    'un', 'el', 'de', 'be'
}

In [23]:
#
def tokenize_clean(text):
    tokens = re.findall(r"\b[^\W_]+\b", text.lower(), flags=re.UNICODE)
    return [ t for t in tokens if not t.isdigit() and t not in STOP_WORDS ]

In [24]:
# get reels from sql and learn model

In [26]:
conn_string = os.environ['PG_DB2']
db = create_engine(conn_string.replace('multdb', 'youtube'))
conn = db.connect()

In [27]:
req1 = '''select iyr.yt_reel_id, reel_name, yt_ch_name, cartoon, brand_id, product_id,language from intl_yt_reels iyr 
join intl_yt_channels2reels iyrcr 
on iyr.yt_reel_id = iyrcr.yt_reel_id 
join intl_yt_channels iyc 
on iyrcr.yt_ch_id = iyc.yt_ch_id 
where iyc."language" in ('США','Английский', 'Испанский', 'Арабский', 'Португальский','Китайский')
and date < '2026-02-01' '''

In [28]:
df_in = pd.read_sql(text(req1), conn)
conn.close()

In [87]:
df_in['name'] = df_in.yt_ch_name + ' '+df_in.reel_name
df_in["tokens"] = df_in["name"].apply(tokenize_clean)

In [88]:
df_in['label'] = df_in.cartoon.apply(lambda x: int(x!= 'none')) 

In [89]:
texts = df_in['tokens'].str.join(' ').to_list() #df_in["tokens"].to_list()
labels = df_in['label'].to_list()

In [90]:
X_train, X_test, y_train, y_test = train_test_split( texts, labels, test_size=0.2, random_state=54321, stratify= labels)

In [93]:
def compute_smooth_weights(y, alpha=0.5):
    counter = Counter(y)
    N = len(y)
    weights = {cls: (N / count) ** alpha for cls, count in counter.items()}
    min_w = min(weights.values())
    weights = {k: v / min_w for k, v in weights.items()}
    return weights

class_weights_dict = compute_smooth_weights(y_train, alpha=0.5)

classes = sorted(np.unique(y_train))
class_weights = [class_weights_dict[c] for c in classes]

In [94]:
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),   # слова и пары слов
    min_df=2,
    max_df=0.95,
    max_features=50000
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [97]:
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="F1",
    use_best_model=True,
    early_stopping_rounds=100,
    random_seed=42,
    verbose=100
)

model.fit(
    X_train_vec,
    y_train,
    eval_set=(X_test_vec, y_test),
    use_best_model=True
)

0:	learn: 0.7800242	test: 0.7836735	best: 0.7836735 (0)	total: 108ms	remaining: 1m 48s
100:	learn: 0.9722222	test: 0.9697337	best: 0.9716012 (97)	total: 8.29s	remaining: 1m 13s
200:	learn: 0.9937925	test: 0.9905101	best: 0.9905101 (175)	total: 16.5s	remaining: 1m 5s
300:	learn: 0.9958702	test: 0.9917062	best: 0.9917062 (211)	total: 24.6s	remaining: 57.2s
400:	learn: 0.9992648	test: 0.9940898	best: 0.9940898 (316)	total: 32.8s	remaining: 49s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9940898345
bestIteration = 316

Shrink model to first 317 iterations.


<catboost.core.CatBoostClassifier at 0x7f5fa5684190>

In [98]:
pipeline = Pipeline([
    ("tfidf", vectorizer),
    ("model", model)
])

In [99]:
joblib.dump(pipeline, "pipeline_bin_new.pkl")

['pipeline_bin_new.pkl']

In [107]:
pipeline = joblib.load("pipeline_bin.pkl")


In [108]:
# get reels to categorise

In [109]:
conn_string = os.environ['PG_DB2']
db = create_engine(conn_string.replace('multdb', 'youtube'))
conn = db.connect()
req1 = '''select iyr.yt_reel_id, reel_name, yt_ch_name, cartoon, brand_id, product_id,language from intl_yt_reels iyr 
join intl_yt_channels2reels iyrcr 
on iyr.yt_reel_id = iyrcr.yt_reel_id 
join intl_yt_channels iyc 
on iyrcr.yt_ch_id = iyc.yt_ch_id 
where iyc."language" in ('США','Английский', 'Испанский', 'Арабский', 'Португальский','Китайский')
and date >= '2026-02-01' '''
df_sql = pd.read_sql(text(req1), conn)
conn.close()

In [110]:
df_sql['name'] = df_sql.yt_ch_name + ' '+df_sql.reel_name
df_sql["tokens"] = df_sql["name"].apply(tokenize_clean)

In [111]:
texts2 = df_sql['tokens'].str.join(' ').to_list()
labels2 = df_sql.cartoon.apply(lambda x: int(x!= 'none')) 


In [112]:
y_pred = pipeline.predict(texts2)

In [84]:
#pd.concat([df_sql, pd.DataFrame(y_pred)], axis = 1).to_excel('./result/temp_test.xlsx')

In [114]:
print(classification_report(labels2, y_pred))
print(confusion_matrix(labels2, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       579
           1       1.00      1.00      1.00        75

    accuracy                           1.00       654
   macro avg       1.00      1.00      1.00       654
weighted avg       1.00      1.00      1.00       654

[[579   0]
 [  0  75]]


In [115]:
result = pd.concat([df_sql, pd.DataFrame(y_pred)], axis =1)

In [116]:
result.to_excel('test_bin.xlsx', index = False)