In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

## Загрузка данных

In [3]:
# Read the labelled training split and the unlabelled test split from CSV files
# located in the current working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [4]:
# Preview the first rows of the raw training data (includes the `price` target).
train_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,560,,2,59,3,0,30.0,1.0,5,0,1,0,0,0,0,0,0,0,0,0,0,0,4510000
1,1,2011-1,667,,10,50,2,1,25.0,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,13231000
2,2,2011-1,90,0.0,1,48,2,0,25.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,2008000
3,3,2011-1,94,1.0,3,62,3,1,30.0,,3,0,1,0,0,0,0,0,0,0,0,0,0,0,12680000
4,4,2011-1,232,0.0,3,60,3,0,25.0,,3,0,1,0,0,0,0,0,0,0,0,0,0,0,3335000


In [5]:
# Preview the first rows of the raw test data (same columns as train, without `price`).
test_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,459,,1,60,3,1,30.0,0.0,4,0,0,0,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,344,1.0,10,52,2,1,,,4,0,0,0,0,0,0,0,0,0,0,0,0,0
2,100002,2012-3,585,0.0,4,54,3,0,30.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,494,,2,52,2,1,25.0,1.0,3,0,1,0,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,622,1.0,9,60,3,1,15.0,,1,1,1,0,0,0,0,0,0,0,0,0,0,0


## Предобработка данных

In [6]:
# Imputation statistics are taken from the training set only, so that the test
# set is filled with exactly the same values the model saw during training
# (previously each frame was filled with its own mode/median).
build_tech_fill = train_df['build_tech'].mode()[0]
metro_dist_fill = train_df['metro_dist'].median()

for df in [train_df, test_df]:
    # `date` looks like 'YYYY-M' (e.g. '2011-1'); split it and store the parts
    # as integers rather than strings so the model receives numeric features.
    df[['year', 'month']] = df['date'].str.split('-', expand=True).astype(int)
    # NOTE: this drop makes the cell non-idempotent; re-run the loading cell
    # before re-running this one.
    df.drop(columns=['id', 'date'], inplace=True)

    df['build_tech'] = df['build_tech'].fillna(build_tech_fill)
    # A missing `g_lift` is filled with 0.
    df['g_lift'] = df['g_lift'].fillna(0)
    df['metro_dist'] = df['metro_dist'].fillna(metro_dist_fill)

In [7]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
from category_encoders import TargetEncoder

# Columns replaced by a target-based numeric encoding.
categorical_cols = ['street_id']
train_df['street_id'] = train_df['street_id'].astype('category')

# The encoder is fitted on the training rows and their prices, then the same
# fitted mapping is applied to the test rows.
# NOTE(review): the encoding is fitted on the full training set before the
# cross-validation / hold-out split performed in later cells, so validation
# scores there may be optimistic -- consider fitting the encoder per fold.
encoder = TargetEncoder(return_df=False, cols=categorical_cols)
train_street_encoded = encoder.fit_transform(train_df[categorical_cols], train_df['price'])
train_df[categorical_cols] = train_street_encoded
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols])

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale.
numerical_cols = ['floor', 'area', 'rooms', 'balcon', 'metro_dist', 'n_photos']

# Learn the per-column min/max from the training data only, then apply that
# same fitted scaler to both frames.
scaler = MinMaxScaler().fit(train_df[numerical_cols])
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

In [10]:
# Inspect the training frame after imputation, target encoding and scaling.
train_df.head()

Unnamed: 0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price,year,month
0,4350127.0,0.0,0.041667,0.159574,0.4,0.0,1.0,1.0,0.454545,0,1,0,0,0,0,0,0,0,0,0,0,0,4510000,2011,1
1,11245650.0,0.0,0.375,0.111702,0.2,0.5,0.833333,0.0,0.090909,0,1,0,0,0,0,0,0,0,0,0,0,0,13231000,2011,1
2,3753764.0,0.0,0.0,0.101064,0.2,0.0,0.833333,0.0,0.090909,0,1,0,0,0,0,0,0,0,0,0,0,0,2008000,2011,1
3,16580230.0,1.0,0.083333,0.175532,0.4,0.5,1.0,0.0,0.272727,0,1,0,0,0,0,0,0,0,0,0,0,0,12680000,2011,1
4,5494460.0,0.0,0.083333,0.164894,0.4,0.0,0.833333,0.0,0.272727,0,1,0,0,0,0,0,0,0,0,0,0,0,3335000,2011,1


In [11]:
# Inspect the test frame after the same preprocessing steps.
test_df.head()

Unnamed: 0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,year,month
0,5670880.0,0.0,0.0,0.164894,0.4,0.5,1.0,0.0,0.363636,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,3
1,3244847.0,1.0,0.375,0.12234,0.2,0.5,0.833333,0.0,0.363636,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,3
2,7211916.0,0.0,0.125,0.132979,0.4,0.0,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,3
3,2370973.0,0.0,0.041667,0.12234,0.2,0.5,0.833333,1.0,0.272727,0,1,0,0,0,0,0,0,0,0,0,0,0,2012,3
4,3729612.0,1.0,0.333333,0.164894,0.4,0.5,0.5,0.0,0.090909,1,1,0,0,0,0,0,0,0,0,0,0,0,2012,3


## Применение модели

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Everything except the target column is used as a feature.
target = 'price'
features = train_df.columns.drop(target)

X, y = train_df[features], train_df[target]

gb_regressor = GradientBoostingRegressor(random_state=42)

# Search space for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so learning_rate
# is drawn from [0.01, 0.51]; randint(low, high) excludes `high`.
param_dist = {
    'n_estimators': randint(50, 150),
    'learning_rate': uniform(0.01, 0.5),
    'max_depth': randint(3, 7),
}

# 5 sampled configurations, each scored with 5-fold CV on mean absolute error,
# using all available CPU cores.
gb_model = RandomizedSearchCV(
    estimator=gb_regressor,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
)
gb_model.fit(X, y)

print("Best Parameters:", gb_model.best_params_)
# The scorer is the negated MAE, so flip the sign to report a positive error.
print("Best MAE:", -gb_model.best_score_)

Best Parameters: {'learning_rate': 0.08800932022121825, 'max_depth': 5, 'n_estimators': 136}
Best MAE: 907592.1497674318


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Hyper-parameters copied from the randomized-search output printed above.
params = {'learning_rate': 0.08800932022121825, 'max_depth': 5, 'n_estimators': 136}

# Everything except the target column is used as a feature.
target = 'price'
features = train_df.columns.drop(target)
X, y = train_df[features], train_df[target]

# Keep 20% of the labelled rows aside to estimate the error on unseen data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Note: the model is trained on the 80% split only.
model = GradientBoostingRegressor(random_state=42, **params).fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)


MAE: 900406.2412255178


In [14]:
# Select the feature columns explicitly so the model receives exactly the
# columns (and column order) it was trained on.
predictions = model.predict(test_df[features])

# `id` was dropped from test_df during preprocessing. Read the real ids back
# from the source file instead of reconstructing them as `index + 100000`,
# which silently assumed contiguous ids starting at that value.
test_ids = pd.read_csv("test.csv", usecols=['id'])['id'].to_numpy()
assert len(test_ids) == len(predictions), "test.csv row count does not match the number of predictions"

result_df = pd.DataFrame({'id': test_ids, 'price': predictions})

# Write the submission file without the DataFrame index column.
result_df.to_csv('predictions.csv', index=False)