In [66]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor, CatBoostClassifier
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

In [67]:
# Run-wide settings used by the cells below.
VERSION    = 11     # suffix embedded in the submission file names
VAL_SIZE   = 0.25   # share of rows held out by train_test_split (25%)
N_FOLDS    = 5      # number of KFold splits
RANDOM_SEED = 42    # seed shared by the splitters and CatBoost

# CATBOOST
ITERATIONS = 2000   # `iterations` passed to CatBoostRegressor
LR         = 0.05   # `learning_rate` passed to CatBoostRegressor

In [68]:
def fill_owners(row):
    """Return the number of owners for one listing as a single-digit string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide ``'owners'`` and ``'mileage'``.

    Returns
    -------
    str
        The first digit found in ``row['owners']``. When that value is missing
        (NaN/None) or contains no digit, the count is estimated from mileage:
        below 78500 -> '1', below 133000 -> '2', otherwise '3'.

    Notes
    -----
    Missing values are detected with ``pd.isna`` because ``x == NaN`` is
    always False. The fallback returns strings so that the resulting column
    has one consistent type with the parsed branch.
    """
    raw_owners = row['owners']
    if not pd.isna(raw_owners):
        first_digit = re.search(r'\d', str(raw_owners))
        if first_digit:
            return first_digit.group()

    # No usable value: fall back to the mileage-based estimate.
    mileage = row['mileage']
    if mileage < 78500:
        return '1'
    if mileage < 133000:
        return '2'
    return '3'

In [69]:
def fill_owners_test(row):
    """Return the number of owners for one test-set listing as a digit string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide ``'Владельцы'`` and ``'mileage'``.

    Returns
    -------
    str
        The first digit found in ``row['Владельцы']``. When that value is
        missing (NaN/None) or contains no digit, the count is estimated from
        mileage: below 78500 -> '1', below 133000 -> '2', otherwise '3'.

    Notes
    -----
    Missing values are detected with ``pd.isna`` because ``x == NaN`` is
    always False. The fallback returns strings so that the resulting column
    has one consistent type with the parsed branch.
    """
    raw_owners = row['Владельцы']
    if not pd.isna(raw_owners):
        first_digit = re.search(r'\d', str(raw_owners))
        if first_digit:
            return first_digit.group()

    # No usable value: fall back to the mileage-based estimate.
    mileage = row['mileage']
    if mileage < 78500:
        return '1'
    if mileage < 133000:
        return '2'
    return '3'

In [70]:
def preproc_test(df):
    """Clean the raw test frame and return the result as a new DataFrame.

    Steps, applied to a copy produced by ``drop``:
    * remove the columns that are not used as features;
    * keep only the first space-separated token of ``name``;
    * turn ``engineDisplacement`` into an integer (first token times ten,
      or 0 when that token is 'undefined');
    * cast ``mileage``, ``modelDate``, ``numberOfDoors`` and
      ``productionDate`` with ``int``;
    * lower-case ``Привод``;
    * resolve ``Владельцы`` row by row through ``fill_owners_test``;
    * parse ``enginePower`` from its first token.
    """
    unused_columns = ['brand', 'vehicleConfiguration', 'description',
                      'Комплектация', 'Руль', 'Состояние', 'Таможня',
                      'Владение', 'id']
    cleaned = df.drop(columns=unused_columns)

    def leading_token(text):
        return text.split(' ')[0]

    def displacement_code(text):
        litres = leading_token(text)
        if litres == 'undefined':
            return 0
        return int(float(litres) * 10)

    cleaned['name'] = cleaned['name'].apply(leading_token)
    cleaned['engineDisplacement'] = cleaned['engineDisplacement'].apply(displacement_code)
    # mileage must already be numeric when fill_owners_test reads it below
    for column in ('mileage', 'modelDate', 'numberOfDoors'):
        cleaned[column] = cleaned[column].apply(int)
    cleaned['Привод'] = cleaned['Привод'].apply(lambda value: value.lower())
    cleaned['Владельцы'] = cleaned.apply(fill_owners_test, axis=1)
    cleaned['enginePower'] = cleaned['enginePower'].apply(
        lambda text: int(leading_token(text)))
    cleaned['productionDate'] = cleaned['productionDate'].apply(int)
    return cleaned

def preproc_data(df):
    """Clean the raw training frame and return the result as a new DataFrame.

    Steps, applied to a copy produced by ``drop``:
    * remove the columns that are not used as features;
    * keep only the first space-separated token of ``name``;
    * turn ``engineDisplacement`` into an integer (first token times ten,
      or 0 when that token is 'undefined');
    * strip every non-digit character from ``mileage`` and cast it to int;
    * lower-case ``drive``;
    * parse ``enginePower`` from its first token;
    * resolve ``owners`` row by row through ``fill_owners``.
    """
    unused_columns = ['brand', 'vehicleConfiguration', 'description',
                      'equipment', 'name_full', 'wheel', 'state', 'customs',
                      'owningTime', 'Unnamed: 0']
    cleaned = df.drop(columns=unused_columns)

    def leading_token(text):
        return text.split(' ')[0]

    def displacement_code(text):
        litres = leading_token(text)
        if litres == 'undefined':
            return 0
        return int(float(litres) * 10)

    def digits_only(text):
        return int(re.sub(r'[^\d]', '', text))

    cleaned['name'] = cleaned['name'].apply(leading_token)
    cleaned['engineDisplacement'] = cleaned['engineDisplacement'].apply(displacement_code)
    # mileage must already be numeric when fill_owners reads it below
    cleaned['mileage'] = cleaned['mileage'].apply(digits_only)
    cleaned['drive'] = cleaned['drive'].apply(lambda value: value.lower())
    cleaned['enginePower'] = cleaned['enginePower'].apply(
        lambda text: int(leading_token(text)))
    cleaned['owners'] = cleaned.apply(fill_owners, axis=1)
    return cleaned

def preproc_test_feature_generation(df):
    """Build the feature frame fed to the saved 'model' classifier.

    Works on a copy produced by ``drop``; the input frame is left untouched.

    Steps:
    * remove the columns that are not used as features (including ``color``,
      ``mileage``, ``ПТС`` and ``Владельцы``);
    * keep only the first space-separated token of ``name`` and of
      ``engineDisplacement`` (the latter stays a string);
    * cast ``modelDate``, ``numberOfDoors`` and ``productionDate`` with ``int``;
    * lower-case ``Привод`` and parse ``enginePower`` from its first token;
    * rename ``Привод`` to ``drive``.

    The rename is done by column name. Overwriting the last column label by
    position would silently mislabel another feature if the column order of
    the input ever changed.
    """
    unused_columns = ['brand', 'vehicleConfiguration', 'description',
                      'Комплектация', 'Руль', 'Состояние', 'Таможня',
                      'Владение', 'id', 'color', 'mileage', 'ПТС', 'Владельцы']
    df = df.drop(columns=unused_columns)
    df['name'] = df['name'].apply(lambda x: x.split(' ')[0])
    df['engineDisplacement'] = df['engineDisplacement'].apply(lambda x: x.split(' ')[0])
    df['modelDate'] = df['modelDate'].apply(int)
    df['numberOfDoors'] = df['numberOfDoors'].apply(int)
    df['Привод'] = df['Привод'].apply(lambda x: x.lower())
    df['enginePower'] = df['enginePower'].apply(lambda x: int(x.split(' ')[0]))
    df['productionDate'] = df['productionDate'].apply(int)
    return df.rename(columns={'Привод': 'drive'})

In [71]:
# Load the inputs from ./data. Paths are assembled with os.path.join so the
# cell does not depend on Windows-style '\' separators.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
df = pd.read_csv(os.path.join(data_dir, 'auto_data_x_v2.csv'))
dft = pd.read_csv(os.path.join(data_dir, 'test.csv'))
# Independent copy of the test frame for the feature-generation pipeline
# (same content as a second read of test.csv, without parsing the file twice).
dft_f = dft.copy()
# Previously trained classifier, loaded from the working directory; its
# predictions become the 'model' feature of the submission frame.
from_file = CatBoostClassifier()
from_file.load_model('model_feature_generator_v2')
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

In [72]:
# Clean the test frame (X_sub) and the training frame (X).
X_sub = preproc_test(dft)
X = preproc_data(df)
# Build the reduced test frame for the saved classifier and store its
# predictions as the 'model' column of the submission frame.
X_for_feature = preproc_test_feature_generation(dft_f)
model = from_file.predict(X_for_feature)
X_sub['model'] = model

In [73]:
# Derived features: distance of modelDate and productionDate from 2020,
# added to the training frame and the submission frame alike.
for frame in (X, X_sub):
    frame['modelDate2'] = 2020 - frame['modelDate']
    frame['productionDate2'] = 2020 - frame['productionDate']

In [74]:
# Separate the target: pop() returns the 'price' column and removes it from X in place.
y = X.pop('price')

In [75]:
# Single shuffled hold-out split. The cross-validation loop further down
# rebinds these four names before anything reads them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [76]:
# Relabel the submission frame with the training column names.
# NOTE(review): the assignment is positional, so it assumes both frames hold
# the same features in the same order — confirm before changing either pipeline.
X_sub.columns = X.columns

In [77]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'bodyType',
    'color',
    'fuelType',
    'name',
    'vehicleTransmission',
    'engineDisplacement',
    'drive',
    'pts',
    'owners',
    'model',
]

In [78]:
def cat_model(y_train, X_train, X_test, y_test):
    """Fit and return a CatBoostRegressor for one train/validation split.

    The regressor is configured from the module constants ITERATIONS, LR and
    RANDOM_SEED and evaluates MAPE. ``(X_test, y_test)`` is passed as the
    evaluation set and ``use_best_model=True`` is enabled, so the validation
    data takes part in choosing the final model.

    Parameters
    ----------
    y_train, X_train : training target and features.
    X_test, y_test : validation features and target used as ``eval_set``.

    Returns
    -------
    CatBoostRegressor
        The fitted model.
    """
    regressor = CatBoostRegressor(
        iterations=ITERATIONS,
        learning_rate=LR,
        eval_metric='MAPE',
        random_seed=RANDOM_SEED,
    )
    fit_options = dict(
        cat_features=cat_features,
        eval_set=(X_test, y_test),
        verbose=False,
        use_best_model=True,
        plot=False,
    )
    regressor.fit(X_train, y_train, **fit_options)
    return regressor


def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Computes the mean of ``|(y_pred - y_true) / y_true|`` over all elements.
    """
    relative_error = (y_pred - y_true) / y_true
    return np.mean(np.abs(relative_error))

## Запуск CatBoost

In [79]:
# One column of submission-set predictions per fold model.
submissions = pd.DataFrame(0, columns=["sub_1"], index=sample_submission.index)
score_ls = []
splits = list(KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED).split(X, y))

for idx, (train_idx, test_idx) in tqdm(enumerate(splits), total=N_FOLDS):
    # KFold yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would look the values up by index label instead.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    # model for this fold
    model = cat_model(y_train, X_train, X_test, y_test)
    # Score on the held-out fold. cat_model also uses this fold as eval_set
    # with use_best_model=True, so the score is not from fully unseen data.
    test_predict = model.predict(X_test)
    test_score = mape(y_test, test_predict)
    score_ls.append(test_score)
    print(f"{idx+1} Fold Test MAPE: {test_score:0.3f}")
    # predictions for the submission set, plus a saved copy of the fold model
    submissions[f'sub_{idx+1}'] = model.predict(X_sub)
    model.save_model(f'catboost_fold_{idx+1}.model')

print(f'Mean Score: {np.mean(score_ls):0.3f}')
print(f'Std Score: {np.std(score_ls):0.4f}')
print(f'Max Score: {np.max(score_ls):0.3f}')
print(f'Min Score: {np.min(score_ls):0.3f}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

1 Fold Test MAPE: 0.121
2 Fold Test MAPE: 0.122
3 Fold Test MAPE: 0.128
4 Fold Test MAPE: 0.124
5 Fold Test MAPE: 0.123

Mean Score: 0.124
Std Score: 0.0024
Max Score: 0.128
Min Score: 0.121


In [80]:
# Preview the per-fold predictions for the first ten submission rows.
submissions.head(10)

Unnamed: 0,sub_1,sub_2,sub_3,sub_4,sub_5
0,1849861.0,1860583.0,1845055.0,1874074.0,1775689.0
1,2663122.0,2756455.0,2823938.0,2682350.0,2476429.0
2,1313474.0,1409003.0,1357749.0,1215429.0,1269715.0
3,2612071.0,2841871.0,2700843.0,2730885.0,2712774.0
4,5490663.0,5693784.0,5594006.0,5389972.0,5212935.0
5,2072644.0,2108735.0,2131787.0,2197398.0,2132819.0
6,1130101.0,1191317.0,1172653.0,1160804.0,1208791.0
7,706928.4,747979.2,758040.3,771383.0,738952.7
8,1521839.0,1608971.0,1505706.0,1556859.0,1534490.0
9,1424995.0,1497176.0,1490885.0,1482974.0,1436169.0


In [81]:
# Blend the fold predictions: average, scale, round to whole thousands.
# Only the per-fold 'sub_*' columns are averaged, so re-running this cell
# does not fold an existing 'blend_round' column back into the mean.
fold_columns = [col for col in submissions.columns if str(col).startswith('sub_')]
PRICE_SCALE = 0.925  # multiplier applied to the blended price; its rationale is not shown in this notebook
fold_mean = submissions[fold_columns].sum(axis=1) / len(fold_columns)
submissions['blend_round'] = np.round(fold_mean / 1000 * PRICE_SCALE) * 1000
sample_submission['price'] = submissions['blend_round'].values
sample_submission.to_csv(f'submission_blend_v{VERSION}_round.csv', index=False)
sample_submission.head(10)

Unnamed: 0,id,price
0,0,1703000.0
1,1,2479000.0
2,2,1215000.0
3,3,2516000.0
4,4,5066000.0
5,5,1969000.0
6,6,1085000.0
7,7,689000.0
8,8,1430000.0
9,9,1356000.0


In [36]:
# Inspect the raw 'equipment' column of the training frame as loaded from CSV.
df.equipment

0       {'Прочее': ['Защита картера'], 'Салон': ['Тони...
1                                                      {}
2       {'Прочее': ['Защита картера'], 'Комфорт': ['Кр...
3                                                      {}
4                                                      {}
                              ...                        
7461                                                   {}
7462    {'Комфорт': ['Круиз-контроль', 'Мультифункцион...
7463    {'Комфорт': ['Круиз-контроль', 'Система «старт...
7464    {'Комфорт': ['Круиз-контроль', 'Мультифункцион...
7465    {'Комфорт': ['Круиз-контроль', 'Мультифункцион...
Name: equipment, Length: 7466, dtype: object

In [63]:
# Alternative submission built from the second fold's predictions only:
# scaled by 0.925 and rounded to whole thousands.
# Note: this overwrites sample_submission['price'] set by the blending cell.
sample_submission['price'] = np.round(submissions['sub_2'] / 1000 * 0.925) * 1000
sample_submission.to_csv(f'submission_blend_v{VERSION}_best_sub.csv', index=False)

In [62]:
# Display the prices currently stored in sample_submission.
sample_submission['price'] 

0       1727000.0
1       2517000.0
2       1286000.0
3       2534000.0
4       5237000.0
          ...    
3832    1028000.0
3833    2459000.0
3834     500000.0
3835     986000.0
3836    1141000.0
Name: price, Length: 3837, dtype: float64