In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

In [69]:
# Read the Parquet file into `df` using the fastparquet engine.
df = pd.read_parquet(
    path="../data/test_final_dataset_2.snappy.parquet",
    engine="fastparquet",
)

In [70]:
# A missing image-similarity score is replaced with 0.
df['image_similarity'] = df['image_similarity'].fillna(value=0)

# Cast both location-match columns to int in a single two-column assignment.
df[['is_same_location', 'is_same_region']] = df[['is_same_location', 'is_same_region']].astype(
    {'is_same_location': int, 'is_same_region': int}
)

In [71]:
# Match flags: 1 when the base and candidate items have the same category
# (respectively subcategory) name, otherwise 0.
df['same_category'] = df['base_category_name'].eq(df['cand_category_name']).astype(int)
df['same_subcategory'] = df['base_subcategory_name'].eq(df['cand_subcategory_name']).astype(int)

In [72]:
# Symmetric relative price difference: 2 * |base - cand| / (base + cand).
# The 1e-6 term keeps the denominator non-zero when the two prices sum to 0.
df['price_diff_pct'] = (
    (df['base_price'] - df['cand_price'])
    .abs()
    .mul(2)
    .div(df['base_price'] + df['cand_price'] + 1e-6)
)

# Signed difference in image counts (base minus candidate).
df['images_diff'] = df['base_count_images'].sub(df['cand_count_images'])

In [73]:
# Numeric feature columns that are standardised in place below.
numeric_cols = [
    'base_price', 'cand_price', 'price_diff_pct',
    'base_count_images', 'cand_count_images', 'images_diff',
    'common_params_count', 'same_values_count',
]

# NOTE(review): the scaler is fitted on this frame itself, so the resulting
# values depend on this file's own mean/std. If the models loaded later were
# trained on features scaled with a different (training-set) scaler, the
# inputs will not be on the same scale -- confirm against the training
# notebook and reuse the persisted training scaler if one exists.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [74]:
# Empty param1 strings are replaced with the placeholder label on both sides.
df['base_param1'] = df['base_param1'].replace(to_replace='', value='Не указано')
df['cand_param1'] = df['cand_param1'].replace(to_replace='', value='Не указано')

# 1 when base and candidate carry the same param1 value, otherwise 0.
df['same_param1'] = df['base_param1'].eq(df['cand_param1']).astype(int)

# A single encoder is fitted on the union of both columns so that a given
# string receives the same integer code on the base and the candidate side.
# NOTE(review): the codes depend on the set of values present in this frame;
# if the models were trained with codes from an encoder fitted on other data,
# the integers will not line up -- confirm against the training notebook.
all_values = pd.concat([df['base_param1'], df['cand_param1']]).unique()
le_param1 = LabelEncoder().fit(all_values)

df['base_param1_encoded'] = le_param1.transform(df['base_param1'])
df['cand_param1_encoded'] = le_param1.transform(df['cand_param1'])


In [75]:
# Empty param2 strings are replaced with the placeholder label on both sides.
df['base_param2'] = df['base_param2'].replace(to_replace='', value='Не указано')
df['cand_param2'] = df['cand_param2'].replace(to_replace='', value='Не указано')

# 1 when base and candidate carry the same param2 value, otherwise 0.
df['same_param2'] = df['base_param2'].eq(df['cand_param2']).astype(int)

# A single encoder is fitted on the union of both columns so that a given
# string receives the same integer code on the base and the candidate side.
# NOTE(review): the codes depend on the set of values present in this frame;
# if the models were trained with codes from an encoder fitted on other data,
# the integers will not line up -- confirm against the training notebook.
all_values = pd.concat([df['base_param2'], df['cand_param2']]).unique()
le_param2 = LabelEncoder().fit(all_values)

df['base_param2_encoded'] = le_param2.transform(df['base_param2'])
df['cand_param2_encoded'] = le_param2.transform(df['cand_param2'])

In [76]:
# Columns removed from the frame before inference: raw titles/descriptions,
# JSON parameter blobs, title images, and the category / subcategory / param
# name columns (match flags and encoded versions of the latter were added to
# `df` in the cells above).
columns_to_drop = [
    'base_title', 'cand_title', 'base_description', 'cand_description',
    'base_json_params', 'cand_json_params',
    'base_title_image', 'cand_title_image',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]

df = df.drop(columns=columns_to_drop)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 250000 to 499999
Data columns (total 24 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   base_item_id            250000 non-null  object 
 1   cand_item_id            250000 non-null  object 
 2   base_price              250000 non-null  float64
 3   cand_price              250000 non-null  float64
 4   base_count_images       250000 non-null  float64
 5   cand_count_images       250000 non-null  float64
 6   is_same_location        250000 non-null  int32  
 7   is_same_region          250000 non-null  int32  
 8   common_params_count     250000 non-null  float64
 9   same_values_count       250000 non-null  float64
 10  basic_image_similarity  250000 non-null  float64
 11  image_similarity        250000 non-null  float64
 12  title_similarity        250000 non-null  float64
 13  description_similarity  250000 non-null  float64
 14  same_category       

In [None]:
import joblib
import numpy as np

# Load the persisted inference bundle: the models, one calibrator per model,
# a final calibrator, and the list of feature columns the models expect.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load bundles
# from a trusted source.
model_assets = joblib.load('../data/models/best_model_assets_2.joblib')
models = model_assets['models']
calibrator = model_assets['final_calibrator']
features = model_assets['features']
calibrators = model_assets['calibrators']

# Fail early, naming the offending columns, if the frame lacks any feature.
missing_features = [name for name in features if name not in df.columns]
assert not missing_features, f"В тестовых данных отсутствуют некоторые фичи: {missing_features}"

# zip() stops at the shorter sequence, so a length mismatch would silently
# drop models from the ensemble; check it explicitly instead.
assert len(models) == len(calibrators), "Количество моделей и калибраторов не совпадает"

X_test = df[features]

# Each model's raw prediction is passed through its own calibrator. The loop
# variable is deliberately NOT called `calibrator`, so the final calibrator
# loaded above is not overwritten by the last per-model calibrator.
calibrated_probs = []
for model, model_calibrator in zip(models, calibrators):
    raw_pred = model.predict(X_test)
    calibrated_pred = model_calibrator.predict(raw_pred)
    calibrated_probs.append(calibrated_pred)

# Ensemble output: element-wise mean of the calibrated predictions.
final_probability = np.mean(calibrated_probs, axis=0)

results = pd.DataFrame({
    'base_id': df['base_item_id'],
    'cand_id': df['cand_item_id'],
    'probability': np.round(final_probability, 2)
})

# Rows are APPENDED to the existing submission file without a header, so
# re-running this cell appends the same rows again.
# NOTE(review): presumably an earlier run wrote the first part of the
# submission together with the header row -- confirm before re-running.
results.to_csv('../submissions/submission.csv', mode='a', header=False, index=False, float_format='%.2f')

print("Результаты сохранены в submission.csv")
print(f"Всего пар: {len(results)}")
print(f"Средняя вероятность: {results['probability'].mean():.4f}")

Результаты сохранены в submission.csv
Всего пар: 250000
Средняя вероятность: 0.0488
