In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error

In [14]:
train_df = pd.read_csv('data/train.csv').drop(columns=['id'])
test_df = pd.read_csv('data/test.csv')
test_id = test_df.id
test_df = test_df.drop(columns=['id'])

# ---------------------- 1. Предобработка данных ----------------------
# Масштабирование числовых признаков
numeric_cols = train_df.select_dtypes(include=[np.float64]).columns.tolist()
numeric_cols.remove('cost')  # Удаляем целевую переменную

scaler = StandardScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])

# Добавляем шум для Feature Jittering
noise = np.random.normal(0, 0.01, train_df[numeric_cols].shape)
train_df[numeric_cols] += noise

In [3]:
# Расчет границ IQR
Q1 = train_df.quantile(0.25)
Q3 = train_df.quantile(0.75)
IQR = Q3 - Q1

# Удаление выбросов
train_df = train_df[~((train_df < (Q1 - 1.5 * IQR)) | (train_df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [4]:
X = train_df.drop(columns=['cost'])
y = train_df.cost

In [5]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly.fit(X)
poly_features = poly.transform(X)
X = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(X.columns))

X

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,...,video_store^2,video_store salad_bar,video_store prepared_food,video_store florist,salad_bar^2,salad_bar prepared_food,salad_bar florist,prepared_food^2,prepared_food florist,florist^2
0,0.685381,-0.043671,-0.305107,1.091242,-0.181803,-0.769718,0.874363,-0.708329,1.267562,1.407800,...,0.384505,0.624375,0.622788,0.631952,1.013886,1.011308,1.026190,1.008737,1.023581,1.038642
1,-0.395910,-1.351937,1.046040,-0.565912,0.723569,-1.539277,0.883433,-0.688448,-1.755410,-0.003494,...,0.407001,0.654973,0.643667,0.640145,1.054026,1.035833,1.030163,1.017953,1.012382,1.006841
2,2.373041,1.220831,-1.648327,-0.574249,0.739921,1.606972,0.870286,-0.702800,0.682314,-1.165255,...,0.387278,0.624249,0.644114,0.633893,1.006221,1.038241,1.021765,1.071280,1.054280,1.037549
3,-0.697055,-0.065736,1.685589,-0.568005,-2.044011,0.221801,-1.148249,1.432266,1.670003,-1.190983,...,0.376212,0.606841,0.612397,0.613478,0.978855,0.987817,0.989561,0.996861,0.998620,1.000383
4,-1.277546,-0.070770,1.712097,-0.572105,0.721971,0.680279,0.878732,1.425742,0.112205,-0.084836,...,2.553996,1.616009,1.567592,1.576773,1.022509,0.991874,0.997683,0.962157,0.967792,0.973460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312074,-1.052133,-1.325930,1.707167,-0.589048,-0.182947,-1.220356,-1.151180,1.434453,0.295225,-0.760855,...,0.363532,0.602740,0.614892,0.606888,0.999348,1.019497,1.006226,1.040052,1.026513,1.013150
312075,2.455838,1.239418,1.031594,-0.566937,1.667096,1.081987,0.868505,1.435444,-0.069384,-1.326320,...,0.381946,0.617971,0.634937,0.625954,0.999848,1.027299,1.012764,1.055503,1.040570,1.025848
312076,1.317856,-0.053797,-1.664521,-0.564802,-0.183865,-0.543235,0.878976,-0.696049,1.569062,0.400922,...,2.604500,1.579428,1.594718,1.629968,0.957801,0.967073,0.988450,0.976435,0.998019,1.020079
312077,1.422226,-0.074524,-0.974298,-0.558317,0.749856,-0.797434,-1.148172,1.436164,-0.479905,0.392002,...,2.553441,1.570996,1.585662,1.595290,0.966551,0.975574,0.981497,0.984681,0.990660,0.996675


In [6]:
# Разделим на обучающую и тестовую выборку
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# CatBoost с регуляризацией
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    l2_leaf_reg=10,  # L2-регуляризация
    bagging_temperature=1.0,  # Случайность выборки
    random_strength=2.0,  # Случайный шум
    depth=6,  # Глубина деревьев
    min_data_in_leaf=10,  # Минимум объектов в листе
    verbose=200,
    random_state=42
)

cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

0:	learn: 29.9163992	test: 29.9776711	best: 29.9776711 (0)	total: 96.6ms	remaining: 1m 36s
200:	learn: 28.5597081	test: 28.7500499	best: 28.7500499 (200)	total: 4.93s	remaining: 19.6s
400:	learn: 28.2791864	test: 28.5532962	best: 28.5532293 (399)	total: 8.94s	remaining: 13.3s
600:	learn: 28.0984671	test: 28.4855141	best: 28.4854815 (599)	total: 12.8s	remaining: 8.52s
800:	learn: 27.9547942	test: 28.4607444	best: 28.4607444 (800)	total: 16.7s	remaining: 4.16s
999:	learn: 27.8214563	test: 28.4413954	best: 28.4409551 (985)	total: 20.7s	remaining: 0us

bestTest = 28.44095511
bestIteration = 985

Shrink model to first 986 iterations.


<catboost.core.CatBoostRegressor at 0x1379967d0>

In [7]:
estimators = [
    ('ridge', Ridge(alpha=0.5)),
    ('cat', cat_model)
]

stacking_model = StackingRegressor(
    estimators=estimators, final_estimator=Ridge(alpha=1.0)
)
stacking_model.fit(X_train, y_train)

# Оценка модели
y_pred = stacking_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error на валидации: {mse}")

0:	learn: 29.9163992	total: 27ms	remaining: 27s
200:	learn: 28.5597081	total: 4.63s	remaining: 18.4s
400:	learn: 28.2791864	total: 9.08s	remaining: 13.6s
600:	learn: 28.0984671	total: 13.6s	remaining: 9.03s
800:	learn: 27.9547942	total: 17.9s	remaining: 4.44s
999:	learn: 27.8214563	total: 22.3s	remaining: 0us
0:	learn: 29.9057289	total: 24.8ms	remaining: 24.8s
200:	learn: 28.5886143	total: 4.47s	remaining: 17.8s
400:	learn: 28.2704179	total: 10s	remaining: 15s
600:	learn: 28.0631427	total: 13.8s	remaining: 9.18s
800:	learn: 27.8879010	total: 17.5s	remaining: 4.35s
999:	learn: 27.7305599	total: 21.2s	remaining: 0us
0:	learn: 29.9329721	total: 26.3ms	remaining: 26.3s
200:	learn: 28.6053618	total: 4.82s	remaining: 19.1s
400:	learn: 28.2775004	total: 9.26s	remaining: 13.8s
600:	learn: 28.0778082	total: 13.2s	remaining: 8.78s
800:	learn: 27.9032606	total: 17.1s	remaining: 4.24s
999:	learn: 27.7435249	total: 20.7s	remaining: 0us
0:	learn: 29.9034788	total: 39.1ms	remaining: 39.1s
200:	learn:

In [22]:
# ---------------------- 5. Pseudo-labeling ----------------------
# Генерация псевдо-меток для тестового набора
test_df = pd.read_csv('data/test.csv')
test_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(test_df.columns))

test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])
poly_features = poly.transform(test_df)
test_df

ValueError: input_features is not equal to feature_names_in_

In [18]:
pseudo_labels = stacking_model.predict(test_features)



ValueError: X has 15 features, but Ridge is expecting 135 features as input.

In [17]:
pseudo_train = pd.concat([X, pd.DataFrame(test_features)], axis=0)
pseudo_labels = np.concatenate([y, pseudo_labels])
cat_model.fit(pseudo_train, pseudo_labels)

NameError: name 'pseudo_labels' is not defined