In [1]:
import os
import re
import sys

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
# --- Data splitting ---------------------------------------------------------
VAL_SIZE = 0.25     # share of rows held out by train_test_split (25%)
N_FOLDS = 5         # fold count; not read by any cell shown in this notebook
RANDOM_SEED = 42    # seed passed to train_test_split

# --- CatBoost ---------------------------------------------------------------
# NOTE(review): the cell that builds the classifier passes literal values for
# iterations and learning_rate rather than these two constants — confirm which
# settings are intended.
ITERATIONS = 2000
LR = 0.05

In [3]:
def preproc_test(df):
    """Return a cleaned copy of the test frame; the caller's frame is not modified.

    The unused columns are removed and the remaining text columns are
    normalised element by element:

    * ``name`` and ``engineDisplacement`` keep only the text before the first space,
    * ``modelDate``, ``numberOfDoors`` and ``productionDate`` are converted with ``int``,
    * ``Привод`` is lower-cased,
    * ``enginePower`` becomes the integer found before the first space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in this function.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving columns in their original order.
    """
    unused_columns = [
        'brand', 'vehicleConfiguration', 'description', 'Комплектация',
        'Руль', 'Состояние', 'Таможня', 'Владение', 'id',
        'color', 'mileage', 'ПТС', 'Владельцы',
    ]

    def leading_token(text):
        return text.split(' ')[0]

    # Column -> element-wise converter, applied in this order.
    converters = {
        'name': leading_token,
        'engineDisplacement': leading_token,
        'modelDate': int,
        'numberOfDoors': int,
        'Привод': lambda text: text.lower(),
        'enginePower': lambda text: int(leading_token(text)),
        'productionDate': int,
    }

    # drop() without inplace returns a new frame, so the assignments below
    # never touch the frame that was passed in.
    cleaned = df.drop(columns=unused_columns)
    for column, convert in converters.items():
        cleaned[column] = cleaned[column].apply(convert)
    return cleaned

In [4]:
def preproc_data(dft):
    """Clean the raw data frame in place and return it.

    ``name`` and ``engineDisplacement`` keep only the text before the first
    space, ``drive`` is lower-cased, ``enginePower`` becomes the integer found
    before the first space, and the columns that are not used afterwards are
    dropped.

    Parameters
    ----------
    dft : pandas.DataFrame
        Frame containing every column named in this function. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dft`` object after modification.

    Raises
    ------
    KeyError
        If a referenced column is missing, e.g. when the function is called a
        second time on a frame it has already processed.
    """
    # Read `name` from the argument itself rather than from a notebook-level
    # global, so the function works for any frame it is given.
    dft['name'] = dft['name'].apply(lambda x: x.split(' ')[0])
    dft.drop(['brand', 'vehicleConfiguration', 'description', 'equipment', 'color', 'mileage', 'pts', 'owners',
              'wheel','state', 'customs', 'owningTime', 'Unnamed: 0', 'name_full', 'price'], axis=1, inplace=True)
    dft['engineDisplacement'] = dft.engineDisplacement.apply(lambda x: x.split(' ')[0])
    dft['drive'] = dft.drive.apply(lambda x: x.lower())
    dft['enginePower'] = dft.enginePower.apply(lambda x: int(x.split(' ')[0]))
    return dft

In [5]:
# Location of the training CSV. It defaults to the path used when the notebook
# was written, and can be redirected by setting the AUTO_DATA_CSV environment
# variable so the notebook runs on other machines without editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    'AUTO_DATA_CSV',
    r'C:\Users\Alex\PycharmProjects\untitled\data\auto_data_x_v2.csv',
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [6]:
# Frame to predict on, read from 'test.csv' (relative path, so it is resolved
# against the kernel's working directory).
dft = pd.read_csv('test.csv')

In [7]:
# preproc_test builds a new frame, so `dft` keeps its raw columns.
X_for_pred = preproc_test(dft)
# preproc_data drops columns from its argument in place. Passing a copy keeps
# the raw `df` intact, so this cell can be re-run without a KeyError for the
# columns that would otherwise already be gone.
X = preproc_data(df.copy())

In [8]:
# The last column of the cleaned frame is taken as the target; every column
# before it is a feature. Both slices are computed from the same X before
# either name is rebound.
# NOTE(review): re-running this cell strips one more column from X each time.
y, X = X.iloc[:, -1], X.iloc[:, :-1]
# Rename the prediction frame's columns positionally to the training feature
# names — assumes both frames hold the same features in the same order; TODO confirm.
X_for_pred.columns = X.columns

In [9]:
# Distinct target labels, ordered from most to least frequent
# (value_counts sorts by count in descending order by default).
model_list = y.value_counts().index.to_list()

In [10]:
# Show the label list that is later handed to the classifier as class_names.
model_list

['5', '3', 'x5', 'x3', '7', 'x6', 'x1', '1', 'x4', 'x7', '6', 'i3', 'i8']

In [11]:
# Shuffled hold-out split; stratifying on y keeps the class proportions
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
    stratify=y,
)

In [12]:
# Columns passed to CatBoost as categorical features when fitting.
cat_features = [
    'bodyType',
    'fuelType',
    'name',
    'vehicleTransmission',
    'engineDisplacement',
    'drive',
]

In [13]:
# Multi-class classifier over the labels in model_list.
# depth=None leaves the tree depth at CatBoost's default.
cls = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=None,
    class_names=model_list,
)

In [17]:
# Fit on the training part of the split; the columns named in cat_features are
# passed as categorical features and per-iteration logging is switched off.
cls.fit(X_train, y_train, cat_features=cat_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x2b50d1934c8>

In [15]:
# Write the classifier to the file 'model_feature_generator_v2' (relative path).
# NOTE(review): this cell's execution count (15) is lower than the fit cell's
# (17), so the file on disk may predate the model evaluated below — re-run
# this cell after fitting.
cls.save_model('model_feature_generator_v2')

In [19]:
# Predictions for the held-out rows, used for the report in the next cell.
y_pred = cls.predict(X_test)

In [21]:
# Per-class precision / recall / F1 on the held-out rows.
# NOTE(review): in the output recorded below, x7 and x4 have low recall (0.12
# and 0.55) while x5 has the lowest precision (0.82) — possibly x7/x4 rows
# predicted as x5; confirm with a confusion matrix.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.99      1.00      0.99        74
           3       0.99      0.99      0.99       333
           5       1.00      1.00      1.00       481
           6       0.98      0.98      0.98        46
           7       0.99      0.99      0.99       156
          i3       1.00      1.00      1.00         3
          i8       0.00      0.00      0.00         1
          x1       1.00      1.00      1.00       120
          x3       0.91      0.93      0.92       167
          x4       1.00      0.55      0.71        49
          x5       0.82      0.99      0.90       259
          x6       0.97      1.00      0.98       129
          x7       0.86      0.12      0.21        49

    accuracy                           0.96      1867
   macro avg       0.88      0.81      0.82      1867
weighted avg       0.96      0.96      0.95      1867

