In [5]:
import pandas as pd
import numpy as np
import sys, os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor, CatBoostClassifier
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

In [6]:
# --- Run configuration -------------------------------------------------
VERSION = 77        # embedded in the submission file name
RANDOM_SEED = 42    # seed passed to the models below

# Validation settings
VAL_SIZE = 0.33     # hold-out share (33%)
N_FOLDS = 5

# CatBoost hyper-parameters
ITERATIONS = 2_000
LR = 0.05

In [7]:
# Fill in the number of owners, estimating it from mileage when it is missing.
def fill_owners(row):
    """Return the owner count for one row of the training frame as a digit string.

    Parameters
    ----------
    row : pandas.Series
        A row holding an ``'owners'`` value (text containing a digit, or
        missing) and a numeric ``mileage``.

    Returns
    -------
    str
        The first digit found in ``row['owners']``. When that value is
        missing, the count is estimated from mileage instead:
        ``'1'`` below 78500, ``'2'`` below 133000, otherwise ``'3'``.

    Raises
    ------
    IndexError
        If ``row['owners']`` is present but contains no digit.
    """
    raw_owners = row['owners']

    # NaN never compares equal to anything (itself included), so an ``==``
    # check can never detect a missing value; ``pd.isna`` does.
    if pd.isna(raw_owners):
        if row.mileage < 78500:
            owners = 1
        elif row.mileage < 133000:
            owners = 2
        else:
            owners = 3
        # The parsed branch yields a string, so return the same type here to
        # keep the column homogeneous.
        return str(owners)

    return re.findall(r'\d', raw_owners)[0]

def fill_owners_test(row):
    """Return the owner count for one row of the test frame as a digit string.

    Same rule as used for the training data, but reading the ``'Владельцы'``
    column.

    Parameters
    ----------
    row : pandas.Series
        A row holding a ``'Владельцы'`` value (text containing a digit, or
        missing) and a numeric ``mileage``.

    Returns
    -------
    str
        The first digit found in ``row['Владельцы']``. When that value is
        missing, the count is estimated from mileage instead:
        ``'1'`` below 78500, ``'2'`` below 133000, otherwise ``'3'``.

    Raises
    ------
    IndexError
        If ``row['Владельцы']`` is present but contains no digit.
    """
    raw_owners = row['Владельцы']

    # NaN never compares equal to anything (itself included), so an ``==``
    # check can never detect a missing value; ``pd.isna`` does.
    if pd.isna(raw_owners):
        if row.mileage < 78500:
            owners = 1
        elif row.mileage < 133000:
            owners = 2
        else:
            owners = 3
        # The parsed branch yields a string, so return the same type here to
        # keep the column homogeneous.
        return str(owners)

    return re.findall(r'\d', raw_owners)[0]

In [8]:
def preproc_number_only(X):
    """Return a copy of ``X`` with its categorical columns encoded as numbers.

    Steps, applied to a deep copy (the input frame is left untouched):

    1. add ``model_code`` — a label encoding of ``model``; the encoder is
       fitted on this frame only;
    2. drop the ``name`` column;
    3. one-hot encode the listed categorical columns, ``model`` included.
    """
    encoded = X.copy(deep=True)
    encoded['model_code'] = LabelEncoder().fit_transform(encoded['model'])
    encoded = encoded.drop(columns=['name'])

    one_hot_columns = ['bodyType', 'color', 'fuelType', 'vehicleTransmission',
                       'drive', 'owners', 'pts', 'model']
    return pd.get_dummies(encoded, columns=one_hot_columns)

def preproc_test(df_t):
    """Clean the raw test frame and return the result as a new DataFrame.

    Drops the columns that are not used, keeps only the first
    space-separated token of ``name``, converts the numeric columns to
    ``int``, lower-cases ``'Привод'`` and fills ``'Владельцы'`` through
    ``fill_owners_test``. The input frame is not modified.
    """
    def first_token(text):
        return text.split(' ')[0]

    def displacement_code(text):
        # 'undefined' maps to 0; otherwise the numeric prefix times ten, as int.
        volume = first_token(text)
        if volume == 'undefined':
            return 0
        return int(float(volume) * 10)

    def leading_int(text):
        return int(first_token(text))

    def lowered(text):
        return text.lower()

    unused_columns = ['brand', 'vehicleConfiguration', 'description', 'Комплектация',
                      'Руль', 'Состояние', 'Таможня', 'Владение', 'id']
    df = df_t.copy(deep=True).drop(columns=unused_columns)

    df['name'] = df['name'].map(first_token)
    df['engineDisplacement'] = df['engineDisplacement'].map(displacement_code)

    # mileage must already be an int when fill_owners_test compares it below.
    for column in ('mileage', 'modelDate', 'numberOfDoors', 'productionDate'):
        df[column] = df[column].map(int)

    df['Привод'] = df['Привод'].map(lowered)
    df['Владельцы'] = df.apply(fill_owners_test, axis=1)
    df['enginePower'] = df['enginePower'].map(leading_int)
    return df

def preproc_data(df_d):
    """Clean the raw training frame and return the result as a new DataFrame.

    Drops the columns that are not used, keeps only the first
    space-separated token of ``name``, turns ``engineDisplacement``,
    ``mileage`` and ``enginePower`` into integers, lower-cases ``drive`` and
    fills ``owners`` through ``fill_owners``. The input frame is not modified.
    """
    def first_token(text):
        return text.split(' ')[0]

    def displacement_code(text):
        # 'undefined' maps to 0; otherwise the numeric prefix times ten, as int.
        volume = first_token(text)
        if volume == 'undefined':
            return 0
        return int(float(volume) * 10)

    def digits_only_int(text):
        # Strip every non-digit character before converting.
        return int(re.sub(r'[^\d]', '', text))

    def leading_int(text):
        return int(first_token(text))

    def lowered(text):
        return text.lower()

    unused_columns = ['brand', 'vehicleConfiguration', 'description', 'equipment',
                      'name_full', 'wheel', 'state', 'customs', 'owningTime',
                      'Unnamed: 0']
    df = df_d.copy(deep=True).drop(columns=unused_columns)

    df['name'] = df['name'].map(first_token)
    df['engineDisplacement'] = df['engineDisplacement'].map(displacement_code)
    # mileage must already be an int when fill_owners compares it below.
    df['mileage'] = df['mileage'].map(digits_only_int)
    df['drive'] = df['drive'].map(lowered)
    df['enginePower'] = df['enginePower'].map(leading_int)
    df['owners'] = df.apply(fill_owners, axis=1)
    return df

def preproc_test_feature_generation(dft):
    """Prepare the raw test frame as input for the pre-trained feature model.

    Works on a deep copy, so ``dft`` is not modified.

    Parameters
    ----------
    dft : pandas.DataFrame
        Raw test data containing at least the columns dropped or converted
        below.

    Returns
    -------
    pandas.DataFrame
        The remaining columns in their original order, with ``'Привод'``
        lower-cased and renamed to ``'drive'``.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from ``dft``.
    """
    df = dft.copy(deep=True)
    df = df.drop('brand', axis=1)
    df['name'] = df['name'].apply(lambda x: x.split(' ')[0])
    df = df.drop(['vehicleConfiguration', 'description', 'Комплектация',
                  'Руль', 'Состояние', 'Таможня', 'Владение', 'id',
                  'color', 'mileage', 'ПТС', 'Владельцы'], axis=1)
    # Unlike the other preprocessors, the displacement stays a string here
    # (first space-separated token only).
    df['engineDisplacement'] = df.engineDisplacement.apply(lambda x: x.split(' ')[0])
    for column in ('modelDate', 'numberOfDoors', 'productionDate'):
        df[column] = df[column].apply(lambda x: int(x))
    df['Привод'] = df['Привод'].apply(lambda x: x.lower())
    df['enginePower'] = df.enginePower.apply(lambda x: int(x.split(' ')[0]))
    # Rename by name rather than by position: overwriting the last column
    # label silently mislabels another column whenever 'Привод' is not last.
    df = df.rename(columns={'Привод': 'drive'})
    return df

In [9]:
# Absolute path of the directory the notebook is run from; the data files
# are located relative to it.
cwd = os.path.abspath(os.curdir)

In [10]:
# Build paths with os.path.join so the notebook also runs on systems where
# the path separator is not a backslash.
data_dir = os.path.join(cwd, 'data')

df = pd.read_csv(os.path.join(data_dir, 'auto_data_x.csv'))
df_test_file = pd.read_csv(os.path.join(data_dir, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

# Previously trained CatBoost classifier, used further down to predict the
# 'model' column for the test set.
from_file = CatBoostClassifier()
from_file.load_model(os.path.join(data_dir, 'model_feature_generator'))

In [11]:
# Clean the training and test frames.
X = preproc_data(df)
X_sub = preproc_test(df_test_file)

# Add the 'model' feature to the test set by predicting it with the
# pre-trained classifier loaded from disk.
X_for_feature = preproc_test_feature_generation(df_test_file)
model = from_file.predict(X_for_feature)
X_sub['model'] = model

In [12]:
# Give the test frame the same column names as the training frame. Only the
# three Russian-named columns need renaming; doing it by name keeps every
# label attached to its own data even if the file's column order changes.
X_sub = X_sub.rename(columns={'Привод': 'drive', 'Владельцы': 'owners', 'ПТС': 'pts'})

expected_columns = ['bodyType', 'color', 'fuelType', 'modelDate', 'name', 'numberOfDoors',
                    'productionDate', 'vehicleTransmission', 'engineDisplacement',
                    'enginePower', 'mileage', 'drive', 'owners', 'pts', 'model']
missing_columns = [col for col in expected_columns if col not in X_sub.columns]
if missing_columns:
    raise ValueError(f'X_sub is missing expected columns: {missing_columns}')

In [13]:
# Convert the model year and the production year into an age in years
# (relative to 2020); this improves the score.
reference_year = 2020
for frame in (X, X_sub):
    for year_column in ('modelDate', 'productionDate'):
        frame[year_column] = reference_year - frame[year_column]

In [14]:
# Separate the target from the features: `pop` returns the 'price' column
# and removes it from X in place.
y = X.pop('price')

In [15]:
# Numeric (one-hot encoded) versions of the train and test frames for the
# scikit-learn models.
X_num = preproc_number_only(X)
X_sub_num = preproc_number_only(X_sub)

# The two frames are encoded independently, so a category present in only one
# of them yields a column the other lacks. Align both to the union of columns
# (missing dummies filled with 0) so they share the same columns in the same
# order; this also covers 'fuelType_электро', which used to be appended by hand
# at the end of the training frame only.
X_num, X_sub_num = X_num.align(X_sub_num, join='outer', axis=1, fill_value=0)

# NOTE(review): 'model_code' is still label-encoded per frame inside
# preproc_number_only, so its codes only match between train and test when
# both contain the same set of models — TODO confirm or share one encoder.

In [None]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'bodyType',
    'color',
    'fuelType',
    'name',
    'vehicleTransmission',
    'engineDisplacement',
    'drive',
    'pts',
    'owners',
    'model',
]

# Stacking

In [None]:
def compute_meta_feature_cat(clf, X_train, X_test, y_train, cv, cat_feature_names=None):
    """Build one stacking meta-feature from a CatBoost-style model.

    Parameters
    ----------
    clf : model exposing ``copy()``, ``fit(X, y, cat_features=..., verbose=...)``
        and ``predict(X)``. It is never fitted itself; a fresh copy is used for
        every fit.
    X_train, X_test : pandas.DataFrame
        Training and test features (``X_train`` is sliced with ``.iloc``).
    y_train : array-like
        Training target, positionally aligned with ``X_train``.
    cv : splitter exposing ``split(X)`` that yields positional
        ``(train_index, predict_index)`` pairs.
    cat_feature_names : list, optional
        Categorical columns forwarded to ``fit``. Defaults to the
        notebook-level ``cat_features`` list.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions for ``X_train`` (float32) and the predictions
        for ``X_test`` of a copy fitted on the whole training set.
    """
    if cat_feature_names is None:
        cat_feature_names = cat_features

    # The folds are positional indices; a pandas Series would be indexed by
    # label instead, so work on a plain array.
    y_train = np.asarray(y_train)

    X_meta_train = np.zeros_like(y_train, dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        folded_clf = clf.copy()
        folded_clf.fit(X_train.iloc[train_fold_index], y_train[train_fold_index],
                       cat_features=cat_feature_names, verbose=False)
        X_meta_train[predict_fold_index] = folded_clf.predict(X_train.iloc[predict_fold_index])

    # Test-set feature: one more copy, fitted on all training rows.
    meta_clf = clf.copy()
    meta_clf.fit(X_train, y_train, cat_features=cat_feature_names, verbose=False)
    X_meta_test = meta_clf.predict(X_test)

    return X_meta_train, X_meta_test
    
def compute_meta_feature(clf, X_train_num, X_test_num, y_train, cv):
    """Build one stacking meta-feature from a scikit-learn style estimator.

    Parameters
    ----------
    clf : estimator usable with ``sklearn.base.clone`` and exposing
        ``fit(X, y)`` / ``predict(X)``. It is never fitted itself; a fresh
        clone is used for every fit.
    X_train_num, X_test_num : pandas.DataFrame
        Training and test features (``X_train_num`` is sliced with ``.iloc``).
    y_train : array-like
        Training target, positionally aligned with ``X_train_num``.
    cv : splitter exposing ``split(X)`` that yields positional
        ``(train_index, predict_index)`` pairs.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions for ``X_train_num`` (float32) and the
        predictions for ``X_test_num`` of a clone fitted on the whole
        training set.
    """
    # The folds are positional indices; a pandas Series would be indexed by
    # label instead, so work on a plain array.
    y_train = np.asarray(y_train)

    X_meta_train = np.zeros_like(y_train, dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train_num):
        folded_clf = clone(clf)
        folded_clf.fit(X_train_num.iloc[train_fold_index], y_train[train_fold_index])
        X_meta_train[predict_fold_index] = folded_clf.predict(X_train_num.iloc[predict_fold_index])

    # Test-set feature: one more clone, fitted on all training rows.
    meta_clf = clone(clf)
    meta_clf.fit(X_train_num, y_train)
    X_meta_test = meta_clf.predict(X_test_num)

    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv,
                           X_train_num=None, X_test_num=None, cat_clf=None):
    """Compute the stacked meta-feature matrices for all base models.

    Parameters
    ----------
    classifiers : iterable
        scikit-learn style estimators, each handled by ``compute_meta_feature``.
    X_train, X_test : pandas.DataFrame
        Frames given to the CatBoost model via ``compute_meta_feature_cat``.
    y_train : array-like
        Training target.
    cv : cross-validation splitter forwarded to the helpers.
    X_train_num, X_test_num : pandas.DataFrame, optional
        Numeric frames given to ``classifiers``. Default to ``X_train`` and
        ``X_test``.
    cat_clf : CatBoost model, optional
        Defaults to the notebook-level ``cb``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Train and test matrices with one column per model: the
        ``classifiers`` in order, then the CatBoost model last.
    """
    # These used to be read as undefined free variables, which raised
    # NameError on the first call.
    if X_train_num is None:
        X_train_num = X_train
    if X_test_num is None:
        X_test_num = X_test
    if cat_clf is None:
        cat_clf = cb

    features = [
        compute_meta_feature(clf, X_train_num, X_test_num, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    features.append(compute_meta_feature_cat(cat_clf, X_train, X_test, y_train, cv))

    stacked_features_train = np.vstack([
        features_train for features_train, _ in features
    ]).T

    stacked_features_test = np.vstack([
        features_test for _, features_test in features
    ]).T

    return stacked_features_train, stacked_features_test

In [None]:
# Base (level-0) models for the stack.
cb = CatBoostRegressor(iterations=ITERATIONS,
                       learning_rate=LR,
                       eval_metric='MAPE',
                       random_seed=RANDOM_SEED)

# NOTE(review): newer scikit-learn releases name this criterion
# 'absolute_error' instead of 'mae' — confirm against the installed version.
rfr = RandomForestRegressor(n_estimators=50, criterion='mae',
                            random_state=RANDOM_SEED)

# Seed the bagging wrapper as well as its base estimator; without its own
# random_state the bootstrap samples differ between runs.
br = BaggingRegressor(ExtraTreesRegressor(n_estimators=100,
                                          random_state=RANDOM_SEED),
                      random_state=RANDOM_SEED)

# scikit-learn models trained on the numeric frames; CatBoost is handled
# separately.
clf_list = [rfr, br]

In [None]:
# Fold splitter shared by every base model; uses the configuration constants
# so the fold count and seed cannot drift from the rest of the notebook.
cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# One (train meta-feature, test meta-feature) pair per base model: the
# scikit-learn models on the numeric frames first, CatBoost last.
features = [
    compute_meta_feature(clf, X_num, X_sub_num, y.values, cv)
    for clf in tqdm(clf_list)
] + [compute_meta_feature_cat(cb, X, X_sub, y.values, cv)]

In [None]:
# Split the (train, test) pairs and put each model's predictions into its own
# column of the meta-feature matrices.
train_parts = [features_train for features_train, _ in features]
test_parts = [features_test for _, features_test in features]

stacked_features_train = np.stack(train_parts, axis=-1)
stacked_features_test = np.stack(test_parts, axis=-1)

In [None]:
# Meta-model: linear regression on top of the base models' predictions.
final_model = LinearRegression()
final_model.fit(stacked_features_train, y)

# Round every predicted price down to a multiple of 10 000.
raw_price = final_model.predict(stacked_features_test)
sample_submission['price'] = np.floor(raw_price / 10000) * 10000

submission_path = f'submission_stack_v{VERSION}_BMW.csv'
sample_submission.to_csv(submission_path, index=False)
sample_submission.head(10)

In [None]:
Стекингом не удалось улучшить результат одиночного CatBoost.
На подбор параметров не хватило времени.