Конкатенация всех эмбеддингов и обучение модели

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Load the training table; the first CSV column becomes the index.
df_train_with_text = pd.read_csv("train.csv", index_col=0)

# Load the image embeddings table.
# NOTE(review): this file is named "test_..." but is joined to the training
# table in the next cell — confirm that this is the intended file.
image_embeddings = pd.read_csv(filepath_or_buffer="test_image_embeddings.csv")

In [None]:
def _parse_embedding(raw):
    """Convert one raw ``embedding`` cell into a list of floats, or ``None``.

    The embeddings table is read from CSV, so each vector arrives as text
    rather than as a Python list. Both comma-separated ("[0.1, 0.2]") and
    whitespace-separated ("[0.1 0.2]") layouts are accepted.

    Returns ``None`` for missing cells (NaN after the left join), empty
    vectors and text that cannot be parsed as numbers.
    """
    if isinstance(raw, (list, tuple)):
        return [float(value) for value in raw] or None
    if isinstance(raw, str):
        tokens = raw.strip().strip("[]()").replace(",", " ").split()
        try:
            return [float(token) for token in tokens] or None
        except ValueError:
            return None
    return None


# Make the join keys the same integer dtype on both sides.
df_train_with_text["ItemID"] = df_train_with_text["ItemID"].astype(int)
image_embeddings["id"] = image_embeddings["id"].astype(int)

# Left join: keep every training row, attach an embedding where the ids match.
df_merged = df_train_with_text.merge(
    image_embeddings[["id", "embedding"]],
    left_on="ItemID",
    right_on="id",
    how="left",
)

# Parse the serialized vectors. Checking ``isinstance(x, list)`` on the raw
# column would never succeed for CSV-loaded data and would silently replace
# every embedding with zeros.
parsed_embeddings = [_parse_embedding(raw) for raw in df_merged["embedding"]]

# Take the embedding size from the first parsed vector; fall back to 768
# when no row has an embedding at all.
emb_dim = next((len(vec) for vec in parsed_embeddings if vec is not None), 768)
default_embedding = [0.0] * emb_dim

# Rows without a usable embedding get the all-zero default vector.
embedding_rows = [
    vec if vec is not None else default_embedding for vec in parsed_embeddings
]
df_merged["embedding"] = pd.Series(
    embedding_rows, index=df_merged.index, dtype=object
)

# Expand each embedding into one numeric column per component.
emb_df = pd.DataFrame(embedding_rows, index=df_merged.index)
emb_df.columns = [f"emb_{i}" for i in range(emb_df.shape[1])]

# Combine the original columns with the expanded embeddings; the raw
# "embedding" column and the duplicate join key "id" are no longer needed.
df_final = pd.concat([df_merged.drop(columns=["embedding", "id"]), emb_df], axis=1)

# Persist the final table.
df_final.to_csv("df_final.csv", index=False)

print(df_final.head())

In [None]:
import pandas as pd
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score
import numpy as np
import json

# Separate the target column from the feature matrix.
y = df_final["resolution"]
X = df_final.drop(columns="resolution")

# Every object- or category-typed column is passed to CatBoost as categorical.
cat_features = list(X.select_dtypes(include=["object", "category"]).columns)
print(f"Категориальные признаки: {cat_features}")

def objective(trial):
    """Optuna objective: mean F1 over a 5-fold stratified cross-validation.

    Samples CatBoost hyperparameters from ``trial``, trains one model per
    fold with early stopping on that fold's validation part, and scores the
    validation predictions with binary F1.

    Reads the module-level ``X``, ``y`` and ``cat_features``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold F1 scores.
    """
    # Hyperparameters explored by the search.
    sampled = dict(
        iterations=trial.suggest_int('iterations', 1000, 3000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        max_bin=trial.suggest_int('max_bin', 32, 128),
        max_ctr_complexity=trial.suggest_int('max_ctr_complexity', 1, 4),
    )
    # Settings that stay the same for every trial.
    fixed = dict(
        auto_class_weights='Balanced',
        thread_count=-1,
        task_type='CPU',
        eval_metric='F1',
        custom_metric='F1',
        random_state=42,
        verbose=0,
    )
    params = {**sampled, **fixed}

    def _fold_f1(train_idx, val_idx):
        """Train on one fold and return binary F1 on its validation part."""
        y_val = y.iloc[val_idx]
        fit_pool = Pool(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_features)
        holdout_pool = Pool(X.iloc[val_idx], y_val, cat_features=cat_features)

        clf = CatBoostClassifier(**params)
        clf.fit(
            fit_pool,
            eval_set=holdout_pool,
            early_stopping_rounds=50,
            verbose=0,
        )
        return f1_score(y_val, clf.predict(holdout_pool), average='binary')

    # Same shuffled stratified split for every trial.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return np.mean([_fold_f1(tr, va) for tr, va in splitter.split(X, y)])

# Hyperparameter search. The sampler is seeded so that repeated runs explore
# the same trials, matching the fixed random_state used for the CV split and
# for CatBoost. TPE is Optuna's default sampler, so only the seed is new.
print("Optuna params...")
study = optuna.create_study(
    direction='maximize',
    study_name='catboost_cv_optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Report the best trial.
print("\n" + "="*50)
print("Best params:", study.best_params)
print(f"Best f1: {study.best_value:.4f}")
print("="*50)

# Persist the best hyperparameters for reuse.
best_params = study.best_params
with open("best_params_cv.json", "w", encoding="utf-8") as f:
    json.dump(best_params, f, ensure_ascii=False, indent=4)


# Refit on the full training set with the best hyperparameters.
# NOTE(review): the CV folds used early stopping, so there the tuned
# `iterations` was only an upper bound; this fit has no eval set and runs
# every iteration — confirm that this is intended.
print("\nFinal test...")
final_model = CatBoostClassifier(
    **best_params,
    auto_class_weights='Balanced',
    thread_count=-1,
    task_type='CPU',
    eval_metric='F1',
    random_state=42,
    verbose=50,
)

full_train_pool = Pool(X, y, cat_features=cat_features)
final_model.fit(full_train_pool)

# Feature importances as reported by the fitted model, one row per column of X.
feature_importances = final_model.get_feature_importance()
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importances,
})

print("\nTop-20 Importance:")
print(importance_df.sort_values(by='importance', ascending=False).head(20))

final_model.save_model('best_catboost_model.cbm')


In [None]:
# No validation: fit on the whole training set and report metrics on that
# same data.
import json

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report

# Separate the target column from the feature matrix.
y = df_final["resolution"]
X = df_final.drop(columns="resolution")

# Every object- or category-typed column is passed to CatBoost as categorical.
cat_features = list(X.select_dtypes(include=["object", "category"]).columns)
print(f"Categorical features: {cat_features}")


# NOTE(review): this reads "best_params.json", while the tuning cell writes
# "best_params_cv.json" — confirm which file is intended.
with open("best_params.json", "r", encoding="utf-8") as params_file:
    best_params = json.load(params_file)

# Settings applied on top of the loaded hyperparameters.
fixed_settings = {
    "auto_class_weights": 'Balanced',
    "thread_count": -1,
    "task_type": 'CPU',
    "eval_metric": 'F1',
    "random_state": 42,
    "verbose": 50,
}
model = CatBoostClassifier(**best_params, **fixed_settings)

train_pool = Pool(X, y, cat_features=cat_features)
model.fit(train_pool)

# Metrics below are computed on the training data itself.
train_pred = model.predict(X)
train_accuracy = accuracy_score(y, train_pred)

print(f"Training accuracy: {train_accuracy:.4f}")
print("Classification report on training data:")
print(classification_report(y, train_pred))

In [None]:
# Load the preprocessed test table (first CSV column is the index) and
# predict with the model fitted in the previous cell.
df_test_processed = pd.read_csv("test_proceeded.csv", index_col=0)
y_pred = model.predict(df_test_processed)

# NOTE(review): `test` is not defined in any visible cell of this notebook —
# confirm where the ids come from and that they are row-aligned with
# df_test_processed.
submission = pd.DataFrame({'id': test.id, 'prediction': y_pred})
submission.to_csv('submission.csv', index=False)


# Short summary of what was written.
n_predictions = len(submission)
print(f"Создан файл submission.csv с {n_predictions} предсказаниями")
print("Распределение предсказаний:")
print(submission['prediction'].value_counts())
