In [248]:
import torch
import torch.nn as nn

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor, Pool, cv
from catboost.utils import get_roc_curve

import matplotlib.pyplot as plt
import sys
import os
sys.path.append(os.path.abspath("/home/arta/pythonProjects/rus_car_plate_price_pred/"))
from supplemental_english import GOVERNMENT_CODES

In [326]:
df = pd.read_csv("/home/arta/pythonProjects/rus_car_plate_price_pred/train.csv")
df['year'] = df['date'].apply(lambda x: int(x[:4])-2021)
df['num'] = df['plate'].apply(lambda x: x[1:4])
df['char'] = df['plate'].apply(lambda x: x[0]+x[4:6])
df['num'] = df['plate'].apply(lambda x: x[1:4])
df['reg'] = df['plate'].apply(lambda x: x[6:] if len(x)==9 else '0'+x[6:])
df['price'] =df['price'].apply(np.log1p)
df.head()

Unnamed: 0,id,plate,date,price,year,num,char,reg
0,1,X059CP797,2024-12-26 00:00:00,11.082158,3,59,XCP,797
1,2,Y800MH790,2024-07-12 21:31:37,11.512935,3,800,YMH,790
2,3,A212TX77,2024-04-18 00:00:00,12.57764,3,212,ATX,77
3,4,P001AY199,2025-01-03 00:27:15,13.42985,4,1,PAY,199
4,5,P001AY199,2025-01-10 09:32:41,13.52783,4,1,PAY,199


In [328]:
num_to_prs = {}
reg_to_prs = {}
char_to_prs = {}

for num, char, reg, price in zip(df.num, df.char, df.reg, df.price):
    if num in num_to_prs:
        num_to_prs[num].append(price)
    else:
        num_to_prs[num] = [price]
    if reg in reg_to_prs:
        reg_to_prs[reg].append(price)
    else:
        reg_to_prs[reg] = [price] 
    if char in char_to_prs:
        char_to_prs[char].append(price)
    else:
        char_to_prs[char] = [price]

num_to_mid = {num:sorted(arr)[len(arr)//2] for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_mid = {reg:sorted(arr)[len(arr)//2] for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_mid = {char:sorted(arr)[len(arr)//2] for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

num_to_avg = {num:sum(arr)/len(arr) for num, arr in zip(num_to_prs.keys(), num_to_prs.values())}
reg_to_avg = {reg:sum(arr)/len(arr) for reg, arr in zip(reg_to_prs.keys(), reg_to_prs.values())}
char_to_avg = {char:sum(arr)/len(arr) for char, arr in zip(char_to_prs.keys(), char_to_prs.values())}

num_mid_arr = sorted(num_to_mid.items(), key=lambda x: x[1])
char_mid_arr = sorted(char_to_mid.items(), key=lambda x: x[1])
reg_mid_arr = sorted(reg_to_mid.items(), key=lambda x: x[1])

num_sorted = [num[0] for num in num_mid_arr]
char_sorted = [char[0] for char in char_mid_arr]
reg_sorted = [reg[0] for reg in reg_mid_arr]

In [358]:
FULL_ALPH = "ABEKMHOPCTYX0123456789"
ch2idx = {ch:i for i, ch in enumerate(FULL_ALPH)}

def get_supplemental(plate):
    num = int(plate[1:4])
    char = plate[0] + plate[4] + plate[5]
    reg = plate[6:]
    supplemental = [0, 0, 0]
    for sup in GOVERNMENT_CODES:
        if char == sup[0] and reg == sup[2] and sup[1][0] <= num <= sup[1][1]:
            supplemental = GOVERNMENT_CODES[sup][1:]
    return supplemental

def encode_plate(plate):
    if len(plate)!=9:
        plate = plate[:6] + "0" + plate[6:]
    num = plate[1:4]
    ch1 = plate[0]
    ch2 = plate[4]
    ch3 = plate[5]
    reg = plate[6:]
    encoded_X = [ch2idx[ch] for ch in plate]
    encoded_X+=[int(ch1==ch2==ch2)]
    encoded_X+=[int(ch1==ch2)]
    encoded_X+=[int(ch3==ch2)]
    encoded_X+=[int(num[0]==num[1]==num[2])]
    encoded_X+=[int(num[0]==num[2])]
    encoded_X+=[num_sorted.index(num), char_sorted.index(ch1+ch2+ch3), reg_sorted.index(reg)]
    encoded_X+=[num_to_mid[num]]
    encoded_X+=[char_to_mid[ch1+ch2+ch3]]
    encoded_X+=[reg_to_mid[reg]]
    return encoded_X

df['sup'] = df['plate'].apply(get_supplemental)
df['adv'] = df['sup'].apply(lambda x: x[1])
df['sig'] = df['sup'].apply(lambda x: x[2])
df['features'] = df['plate'].apply(encode_plate)
X = np.vstack(df['features'].values)
y = df['price'].values 
df

Unnamed: 0,id,plate,date,price,year,num,char,reg,sup,adv,sig,features
0,1,X059CP797,2024-12-26 00:00:00,11.082158,3,059,XCP,797,"[0, 0, 0]",0,0,"[11, 12, 17, 21, 8, 7, 19, 21, 19, 0, 0, 0, 0,..."
1,2,Y800MH790,2024-07-12 21:31:37,11.512935,3,800,YMH,790,"[0, 0, 0]",0,0,"[10, 20, 12, 12, 4, 5, 19, 21, 12, 0, 0, 0, 0,..."
2,3,A212TX77,2024-04-18 00:00:00,12.577640,3,212,ATX,077,"[0, 0, 0]",0,0,"[0, 14, 13, 14, 9, 11, 12, 19, 19, 0, 0, 0, 0,..."
3,4,P001AY199,2025-01-03 00:27:15,13.429850,4,001,PAY,199,"[0, 0, 0]",0,0,"[7, 12, 12, 13, 0, 10, 13, 21, 21, 0, 0, 0, 0,..."
4,5,P001AY199,2025-01-10 09:32:41,13.527830,4,001,PAY,199,"[0, 0, 0]",0,0,"[7, 12, 12, 13, 0, 10, 13, 21, 21, 0, 0, 0, 0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
51630,51631,X023PP797,2025-01-29 00:00:00,11.156265,4,023,XPP,797,"[0, 0, 0]",0,0,"[11, 12, 14, 15, 7, 7, 19, 21, 19, 0, 0, 1, 0,..."
51631,51632,M004KA161,2025-01-31 00:00:00,14.285515,4,004,MKA,161,"[0, 0, 0]",0,0,"[4, 12, 12, 16, 3, 0, 13, 18, 13, 0, 0, 0, 0, ..."
51632,51633,E888EB199,2025-02-06 00:00:00,13.652993,4,888,EEB,199,"[0, 0, 0]",0,0,"[2, 20, 20, 20, 2, 1, 13, 21, 21, 1, 1, 0, 1, ..."
51633,51634,X023XK77,2024-04-24 11:30:07,11.918397,3,023,XXK,077,"[0, 0, 0]",0,0,"[11, 12, 14, 15, 11, 3, 12, 19, 19, 1, 1, 0, 0..."


In [365]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7],
}

model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=6,
    loss_function='Quantile:alpha=0.5',
    eval_metric='Quantile:alpha=0.5',
    verbose=100,
    early_stopping_rounds=50,
    random_seed=42
)

#model.grid_search(grid, X=Pool(X_train, y_train), cv=3, verbose=False)

train_pool = Pool(X, y)
valid_pool = Pool(X_val, y_val)

model.fit(train_pool)#, eval_set=valid_pool)

y_pred = model.predict(X_val)

print(f"SMAPE: {smape(y_val, y_pred):.2f}%")

0:	learn: 0.4497445	total: 10.8ms	remaining: 16.2s
100:	learn: 0.2568347	total: 1.01s	remaining: 14.1s
200:	learn: 0.2455507	total: 2s	remaining: 12.9s
300:	learn: 0.2380176	total: 2.98s	remaining: 11.9s
400:	learn: 0.2331723	total: 3.96s	remaining: 10.9s
500:	learn: 0.2292305	total: 4.95s	remaining: 9.88s
600:	learn: 0.2259142	total: 5.87s	remaining: 8.78s
700:	learn: 0.2232835	total: 6.58s	remaining: 7.5s
800:	learn: 0.2209106	total: 7.16s	remaining: 6.25s
900:	learn: 0.2189353	total: 7.65s	remaining: 5.08s
1000:	learn: 0.2171420	total: 8.1s	remaining: 4.04s
1100:	learn: 0.2155545	total: 8.54s	remaining: 3.1s
1200:	learn: 0.2140826	total: 9.01s	remaining: 2.24s
1300:	learn: 0.2126610	total: 9.49s	remaining: 1.45s
1400:	learn: 0.2113938	total: 9.95s	remaining: 703ms
1499:	learn: 0.2102711	total: 10.4s	remaining: 0us
SMAPE: 3.50%


In [367]:
df_test = pd.read_csv("/home/arta/pythonProjects/rus_car_plate_price_pred/test.csv")
df_test['features'] = df_test['plate'].apply(encode_plate)
X_test = np.vstack(df_test['features'].values)
preds = np.expm1(model.predict(X_test))

In [368]:
submission = pd.DataFrame({'id': df_test['id'], 'price': preds})
submission.to_csv('submission.csv', index=False)