In [1]:
import torch
import torch.nn as nn

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor, Pool
from catboost.utils import get_roc_curve

import matplotlib.pyplot as plt
import sys
import os
sys.path.append(os.path.abspath("/home/arta/pythonProjects/rus_car_plate_price_pred/"))
from supplemental_english import GOVERNMENT_CODES

In [3]:
df = pd.read_csv("/home/arta/pythonProjects/rus_car_plate_price_pred/train.csv")
df['year'] = df['date'].apply(lambda x: int(x[:4])-2021)
df['num'] = df['plate'].apply(lambda x: x[1:4])
df['char'] = df['plate'].apply(lambda x: x[0]+x[4:6])
df['num'] = df['plate'].apply(lambda x: x[1:4])
df['reg'] = df['plate'].apply(lambda x: x[6:] if len(x)==9 else '0'+x[6:])
df.head()

Unnamed: 0,id,plate,date,price,year,num,char,reg
0,1,X059CP797,2024-12-26 00:00:00,65000,3,59,XCP,797
1,2,Y800MH790,2024-07-12 21:31:37,100000,3,800,YMH,790
2,3,A212TX77,2024-04-18 00:00:00,290000,3,212,ATX,77
3,4,P001AY199,2025-01-03 00:27:15,680000,4,1,PAY,199
4,5,P001AY199,2025-01-10 09:32:41,750000,4,1,PAY,199


In [7]:
num_to_prs = {}
reg_to_prs = {}
char_to_prs = {}

for num, char, reg, price in zip(df.num, df.char, df.reg, df.price):
    if num in num_to_prs:
        num_to_prs[num].append(price)
    else:
        num_to_prs[num] = [price]
    if reg in reg_to_prs:
        reg_to_prs[reg].append(price)
    else:
        reg_to_prs[reg] = [price] 
    if char in char_to_prs:
        char_to_prs[char].append(price)
    else:
        char_to_prs[char] = [price]

num_to_mid = {num:sorted(arr)[len(arr)//2] for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_mid = {reg:sorted(arr)[len(arr)//2] for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_mid = {char:sorted(arr)[len(arr)//2] for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

num_to_avg = {num:sum(arr)/len(arr) for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_avg = {reg:sum(arr)/len(arr) for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_avg = {char:sum(arr)/len(arr) for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

num_to_min = {num:min(arr) for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_min = {reg:min(arr) for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_min = {char:min(arr) for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

num_to_max = {num:max(arr) for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_max = {reg:max(arr) for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_max = {char:max(arr) for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

In [78]:
print(sorted(num_to_avg.items(), key=lambda x: x[1])[-100:])

[('661', 498500.0), ('744', 500636.36363636365), ('199', 506869.5565217391), ('628', 509375.0), ('807', 509944.44444444444), ('479', 513000.0), ('287', 515000.0), ('139', 517500.0), ('557', 519999.8888888889), ('419', 520555.55555555556), ('178', 520599.96), ('219', 523333.3333333333), ('256', 525666.6666666666), ('451', 528529.4117647059), ('501', 529725.0), ('592', 530000.0), ('775', 530451.6129032258), ('579', 533846.1538461539), ('716', 534555.5), ('431', 535357.1428571428), ('850', 536212.0909090909), ('948', 537000.0), ('063', 537066.5), ('444', 542580.6960526316), ('762', 547500.0), ('623', 554000.0), ('976', 560000.0), ('963', 565555.5555555555), ('083', 576095.5348837209), ('222', 576384.0612516644), ('390', 580250.0), ('883', 580560.0), ('491', 581875.0), ('417', 584545.4545454546), ('804', 592999.75), ('922', 600000.0), ('991', 602706.8793103448), ('182', 603750.0), ('379', 607727.2727272727), ('792', 608750.0), ('743', 612699.9), ('468', 614333.3333333334), ('339', 617857.1

In [88]:
ch_to_i = {ch : i for i, ch in enumerate("ABEKMHOPCTYX")}
FULL_ALPH = "ABEKMHOPCTYX0123456789"
ch2idx = {ch:i for i, ch in enumerate(FULL_ALPH)}
max_price = max(df.price)

def one_hot(x, dim=12):
    res = [0]*dim
    res[ch_to_i[x]] = 1
    return res

def get_supplemental(plate):
    num = int(plate[1:4])
    char = plate[0] + plate[4] + plate[5]
    reg = plate[6:]
    supplemental = [0, 0, 0]
    for sup in GOVERNMENT_CODES:
        if char == sup[0] and reg == sup[2] and sup[1][0] <= num <= sup[1][1]:
            supplemental = GOVERNMENT_CODES[sup][1:]
    return supplemental

nice_nums = ['001', '007', '100', '777', '123', '101', '808', '999', '888', '555', '111']
nice_chars = ['AMP', 'MMM', 'BOP', 'AYE', 'KMP', 'AAA', 'OOO', 'XMP', 'CEO']
big_reg = ["077", "097", "099", "177", "197", "199", "777", "797", "799", "977", #moscow
           "078", "098", "178", "198" #spb
          ]

def encode_plate(plate):
    if len(plate)!=9:
        plate = plate[:6] + "0" + plate[6:]
    num = plate[1:4]
    ch1 = plate[0]
    ch2 = plate[4]
    ch3 = plate[5]
    reg = plate[6:]
    encoded_X = [int(n) for n in num] #digits
    encoded_X+= [int(n) for n in reg] #region
    encoded_X+= one_hot(ch1) #char 1
    encoded_X+= one_hot(ch2) #char 2
    encoded_X+= one_hot(ch3) #char 3
    encoded_X+=[int(ch1==ch2==ch2)]
    encoded_X+=[int(ch1==ch2)]
    encoded_X+=[int(num[0]==num[1]==num[2])]
    encoded_X+=[int(num[0]==num[2])]
    encoded_X+=[num in nice_nums]
    encoded_X+=[(ch1+ch2+ch3) in nice_chars]
    #encoded_X+=[reg in big_reg]
    return encoded_X
    
df['features'] = df['plate'].apply(encode_plate)
X = np.vstack(df['features'].values)
y = df['price'].values 

In [90]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7],
}

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Quantile',  # нет готового SMAPE, но MAE близка
    eval_metric='Quantile',
    verbose=100,
    early_stopping_rounds=50,
    random_seed=42
)

#model.grid_search(grid, X=Pool(X_train, y_train), cv=3, verbose=True)

train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_val, y_val)

model.fit(train_pool, eval_set=valid_pool)

y_pred = model.predict(X_val)

print(f"SMAPE: {smape(y_val, y_pred):.2f}%")

0:	learn: 182165.2989720	test: 182132.2609516	best: 182132.2609516 (0)	total: 3.78ms	remaining: 3.78s
100:	learn: 145957.6153301	test: 146584.9002260	best: 146584.9002260 (100)	total: 383ms	remaining: 3.41s
200:	learn: 139194.0087309	test: 140945.2807983	best: 140945.2807983 (200)	total: 757ms	remaining: 3.01s
300:	learn: 135229.9890505	test: 137536.4331704	best: 137536.0244356 (299)	total: 1.14s	remaining: 2.65s
400:	learn: 132390.5539157	test: 135673.0176749	best: 135673.0176749 (400)	total: 1.51s	remaining: 2.26s
500:	learn: 130245.5284343	test: 134258.5344141	best: 134258.5344141 (500)	total: 1.89s	remaining: 1.88s
600:	learn: 128688.0678407	test: 133520.6092391	best: 133520.6092391 (600)	total: 2.27s	remaining: 1.51s
700:	learn: 127191.8082767	test: 132704.0324894	best: 132704.0324894 (700)	total: 2.65s	remaining: 1.13s
800:	learn: 125957.1389990	test: 132054.0803097	best: 132054.0803097 (800)	total: 3.04s	remaining: 757ms
900:	learn: 124965.3546745	test: 131803.8056477	best: 1317

In [92]:
df_test = pd.read_csv("/home/arta/pythonProjects/rus_car_plate_price_pred/test.csv")
df_test['features'] = df_test['plate'].apply(encode_plate)
X_test = np.vstack(df_test['features'].values)
preds = model.predict(X_test)

In [93]:
submission = pd.DataFrame({'id': df_test['id'], 'price': preds})
submission.to_csv('submission.csv', index=False)