In [1]:
import pandas as pd
import numpy as np
import math

from ydata_profiling import ProfileReport
from skimpy import skim

import sys
import time
import requests
from retrying import retry
from tqdm import tqdm
from datetime import datetime
from vininfo import Vin


from IPython.display import Markdown
import nbformat

import optuna
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import Pool, CatBoostRegressor, CatBoostClassifier, cv

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
# plt.style.use('default') 
sns.set_style("dark")
plt.style.use('dark_background') # я работаю в темной теме, поэтому использую эту настройку.
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

s = 42

In [6]:
metric = 'MAPE'

def activate_coders():

    ohe = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
                                ('scaler', MinMaxScaler(), num_cols)],
                                    remainder='passthrough')

    oe = ColumnTransformer([('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols),
                                ('scaler', MinMaxScaler(), num_cols)],
                                remainder='passthrough')
    return ohe, oe 

pd.options.display.float_format = '{: .3f}'.format

results = pd.DataFrame(columns=[metric,'Время обучения','Время предсказания'])
results

def metrics(df,model,score,fit,pred,metric=metric):
    f'''Функция берёт на вход датафрейм, название модели, показаетль {metric} (по CV),
    время обучения и время предсказания. Добавляет строку с показателями модели.'''
    
    df.loc[model,'Время обучения'] = fit
    df.loc[model,'Время предсказания'] = pred
    df.loc[model, metric] = score
    
    return df

def pred(model):
    f"""Функция берёт на вход модель, обучает её на обучающей выборке,
    делает предсказания и проверяет {metric} на тестовой выборке.
    На выход идёт MAPE, время обучения и предсказания."""
    
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    time_fit = end - start

    start = time.time()
    preds = model.predict(X_test)
    end = time.time()
    score = mean_absolute_percentage_error(y_test, preds)
    time_pred = end - start
    
    print('Время обучения модели:', time_fit)
    print('Время предсказания модели:', time_pred)
    print(f'{metric}', score)
    
    return model, preds, score, time_fit, time_pred

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test = test.drop('saledate',axis=1)
train = train.drop('saledate',axis=1)

In [11]:
target = 'sellingprice'
cat_cols = ['make', 'model', 'trim', 'body','state','color','interior','seller','condition','transmission']
num_cols = list(train.columns.drop(cat_cols+['vin']+[target]))
features = num_cols+cat_cols

train[cat_cols] = train[cat_cols].fillna('unknown')
train[num_cols] = train[num_cols].fillna(train[num_cols].median())

test[cat_cols] = test[cat_cols].fillna('unknown')
test[num_cols] = test[num_cols].fillna(test[num_cols].median())

train[cat_cols] = train[cat_cols].apply(lambda x: x.str.lower())
test[cat_cols] = test[cat_cols].apply(lambda x: x.str.lower())

In [13]:
for i in train[cat_cols]:

    value_counts = train[i].value_counts()

    rare_values = value_counts[value_counts < 16].index.tolist()

    train[i] = train[i].apply(lambda x: 'rare' if x in rare_values else x)

for i in test[cat_cols]:

    value_counts = test[i].value_counts()

    rare_values = value_counts[value_counts < 4].index.tolist()

    test[i] = test[i].apply(lambda x: 'rare' if x in rare_values else x)

In [14]:
df = train.copy()

X = df.drop([target]+['vin'], axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = s)

ohe,oe = activate_coders()

In [18]:
final = pd.DataFrame(columns=['Время обучения','Время предсказания',metric])

params = {
    'learning_rate':0.352,
    'depth': 10,
    'loss_function': 'MAE',
    'random_state': s,
    'early_stopping_rounds': 10,
    'verbose': 1000,
    'eval_metric': metric 
}

model = CatBoostRegressor(**params)

pipe = Pipeline([('ohe', ohe),
                 ('cbr', model)])
                 
model, preds, score, time_fit, time_pred = pred(pipe)

metrics(final, 'default_CatBoostRegressor', score, time_fit, time_pred)

0:	learn: 1.2430680	total: 250ms	remaining: 4m 9s
999:	learn: 0.1923250	total: 3m 29s	remaining: 0us
Время обучения модели: 214.9987132549286
Время предсказания модели: 1.8945105075836182
MAPE 0.16894733655434985


Unnamed: 0,Время обучения,Время предсказания,MAPE
default_CatBoostRegressor,214.999,1.895,0.169
