In [10]:
import numpy as np
import pandas as pd 
import seaborn as sns
import json

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder

In [11]:
from collections import defaultdict

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
df = pd.read_csv('cars.csv')

In [215]:
df.columns

Index(['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_has_gas',
       'engine_type', 'engine_capacity', 'body_type', 'has_warranty', 'state',
       'drivetrain', 'price_usd', 'is_exchangeable', 'location_region',
       'number_of_photos', 'up_counter', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8', 'feature_9', 'duration_listed'],
      dtype='object')

In [13]:
drop_columns = ['location_region', 'number_of_photos', 'up_counter', 'feature_0', 'feature_1', 'feature_2',
                'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 
                'duration_listed', 'engine_has_gas']
df.drop(drop_columns, axis=1, inplace=True)

In [14]:
categorical_features = ['manufacturer_name', 'model_name', 'transmission', 'color', 'engine_fuel', 'engine_type', 
                        'body_type',  'state', 'drivetrain']

In [15]:
df.replace('ВАЗ', 'VAZ', inplace=True)
df.replace('ГАЗ', 'GAZ', inplace=True)
df.replace('УАЗ', 'UAZ', inplace=True)
df.replace('ЗАЗ', 'ZAZ', inplace=True)
df.replace('Москвич', 'Moskvich', inplace=True)
df.replace('C-ELYSÉE', 'C-ELYSEE', inplace=True)

In [16]:
moskvich = df[df.manufacturer_name == 'Moskvich'].index
zaz = df[df.manufacturer_name == 'ZAZ'].index
gaz = df[df.manufacturer_name == 'GAZ'].index
vis = df[df.model_name == 'ВИС'].index

df.drop(moskvich, axis=0, inplace=True)
df.drop(zaz, axis=0, inplace=True)
df.drop(gaz, axis=0, inplace=True)
df.drop(vis, axis=0, inplace=True)

In [288]:
models_index = df.groupby('manufacturer_name')['model_name'].value_counts().index
models = defaultdict(list)

for brand, model in models_index:
    models[brand].append(model)

In [289]:
fuel_index = df.groupby(['manufacturer_name', 'model_name'])['engine_fuel'].value_counts().index
engine_index = df.groupby(['manufacturer_name', 'model_name'])['engine_type'].value_counts().index
transmission_index = df.groupby(['manufacturer_name', 'model_name'])['transmission'].value_counts().index
body_index = df.groupby(['manufacturer_name', 'model_name'])['body_type'].value_counts().index
drivetrain_index = df.groupby(['manufacturer_name', 'model_name'])['drivetrain'].value_counts().index

In [290]:
fuels = defaultdict(dict)
engines = defaultdict(dict)
transmissions = defaultdict(dict)
bodies = defaultdict(dict)
drivetrains = defaultdict(dict)

for key in models.keys():
    fuels[key] = defaultdict(list)
    engines[key] = defaultdict(list)
    transmissions[key] = defaultdict(list)
    bodies[key] = defaultdict(list)
    drivetrains[key] = defaultdict(list)

for brand, model, fuel in fuel_index:
    fuels[brand][model].append(fuel)
    
for brand, model, engine in engine_index:
    engines[brand][model].append(engine)
    
for brand, model, transmission in transmission_index:
    transmissions[brand][model].append(transmission)
    
for brand, model, body in body_index:
    bodies[brand][model].append(body)
    
for brand, model, drivetrain in drivetrain_index:
    drivetrains[brand][model].append(drivetrain)

In [291]:
for name, data in zip(['models', 'fuels', 'engines', 'transmissions', 'bodies', 'drivetrains'], 
                       [models, fuels, engines, transmissions, bodies, drivetrains]):
    with open(name + '.json', 'w') as outfile:
        json.dump(data, outfile)

In [292]:
for column in categorical_features:
    one_hot = defaultdict(int)
    for k, value in enumerate(np.unique(df[column])):
        one_hot[value] = k
    
    with open(column + '.json', 'w') as outfile:
        json.dump(one_hot, outfile)

In [293]:
for column in categorical_features:
    with open(column + '.json') as json_file:
        encoder = json.load(json_file)
        df[column] = df[column].map(encoder)

In [17]:
df.dropna(inplace=True)
X = df.drop(['price_usd'], 1)
y = df['price_usd']

In [192]:
rfr = RandomForestRegressor(n_estimators=50)

In [295]:
rfr.fit(X, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [175]:
import pickle

In [296]:
with open('model.pickle', 'wb') as f:
    pickle.dump(rfr, f)

In [265]:
df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [283]:
df[df.manufacturer_name == 'VAZ']['model_name'].value_counts()

2106      89
2107      50
2109      49
2105      35
2108      33
2101      29
2121      29
21099     15
2114      14
2110      14
2104      13
2115      11
2103      10
21214     10
2131      10
21011     10
Vesta      9
21213      9
2112       8
21013      8
2102       6
1111       4
Priora     3
2111       2
Largus     2
2113       2
XRAY       1
1119       1
2120       1
Kalina     1
2123       1
Granta     1
Name: model_name, dtype: int64

In [272]:
df[df.manufacturer_name == 'BMW']['model_name'].value_counts()

525    354
X5     297
520    288
320    233
318    189
      ... 
X2       1
Z4       1
M2       1
Z3       1
235      1
Name: model_name, Length: 65, dtype: int64

In [7]:
import pickle

In [8]:
with open('model.pickle', 'rb') as fin:
    rfr = pickle.load(fin)

In [9]:
rfr.predict()

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [18]:
X.columns

Index(['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_type',
       'engine_capacity', 'body_type', 'has_warranty', 'state', 'drivetrain',
       'is_exchangeable'],
      dtype='object')

In [34]:
rfr.predict([[0, 0, 1, 3, 200000, 2009, 2, 1, 2.0, 3, 1, 1, 0, 1]])[0]

11759.117399999997

In [40]:
data = [('manufacturer_name', '0'), ('model_name', 'TSX'), ('engine_type', 'gasoline'), ('engine_fuel', 'gasoline'), ('engine_capacity', '2.5'), ('transmission', 'automatic'), ('drivetrain', 'front'), ('body_type', 'sedan'), ('color', '0'), ('state', 'Новая'), ('warranty', 'Да'), ('exchangable', 'Да'), ('odometer', '1960'), ('year_produced', '2019'), ('submit', 'Расчитать цену')]

In [39]:
def extract_columns(immutable_dict):
    ordered_columns = ['manufacturer_name', 'model_name', 'transmission', 'color', 'odometer',
                       'year_produced', 'engine_fuel', 'engine_type', 'engine_capacity', 'body_type',
                       'warranty', 'state', 'drivetrain', 'exchangable']
    mutable_dict = {item[0]:item[1] for item in immutable_dict}

    ordered_dict = {key: mutable_dict[key] for key in ordered_columns}

    return ordered_dict

In [42]:
with open('manufacturer_name.json') as json_file:
    brands = json.load(json_file)