# Выделение целевого признака

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
data = pd.read_csv('../data/dataset.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:
data

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


In [5]:
data['price_usd'].value_counts()

1500.00    637
3500.00    568
2000.00    561
1000.00    552
2500.00    546
          ... 
6053.23      1
9130.00      1
8661.20      1
4097.51      1
5666.00      1
Name: price_usd, Length: 2677, dtype: int64

In [6]:
y = data["price_usd"]
X = data.drop(["price_usd"], axis=1)

In [7]:
y

0        10900.00
1         5000.00
2         2800.00
3         9999.00
4         2134.11
           ...   
38526     2750.00
38527     4800.00
38528     4300.00
38529     4000.00
38530     3200.00
Name: price_usd, Length: 38531, dtype: float64

In [8]:
X

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


## Разделение данных

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30824, 29), (30824,), (7707, 29), (7707,))

# Линейная регрессия

In [12]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt

In [13]:
lr = LinearRegression().fit(X_train, y_train)

In [14]:
lr.predict(X_test)

array([ 2741.95910545, -1685.79683273,   519.22252408, ...,
        6500.05503955,  8304.17579679,  3968.26001369])

In [15]:
y_pred = lr.predict(X_test)

In [16]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 2298.923394382889
MSE: 12629450.236637004
RMSE: 3553.793780825922
MAPE: 1.753558340358722
R^2: 0.6933557709969049


In [17]:
len(lr.coef_)
lr.coef_

array([-1.80439014e+01,  2.16495313e+00,  8.40157376e+02,  1.11511387e+02,
       -5.39414488e-03,  3.73523317e+02,  2.82169716e+02, -6.33103311e+02,
        7.20641250e+02,  1.69650498e+03,  1.47043040e+02,  5.86808631e+03,
        2.56793629e+03, -9.91977883e+02, -2.02946061e+02, -1.14676098e+02,
        7.79757789e+01,  1.95548699e-01,  6.25004550e+02, -9.03335541e+02,
        6.26263094e+02,  7.50449776e+02,  3.03744842e+02,  2.67746507e+02,
        1.00484148e+03,  1.30724886e+03,  3.30952839e+02,  2.15975294e+02,
        1.73669863e+00])

In [18]:
# L1
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2298.951587679401
MSE: 12629531.36474868
RMSE: 3553.80519510407
MAPE: 1.753957282341387
R^2: 0.6933538011987912


array([-1.80111319e+01,  2.16353243e+00,  8.40164406e+02,  1.11513491e+02,
       -5.39462261e-03,  3.73564469e+02,  2.82253937e+02, -6.32876198e+02,
        7.20428142e+02,  1.69647605e+03,  1.47016465e+02,  5.85018395e+03,
        2.57272159e+03, -9.91945345e+02, -2.02693433e+02, -1.14687249e+02,
        7.79847474e+01,  1.94713357e-01,  6.25058801e+02, -9.03526700e+02,
        6.26255130e+02,  7.50322459e+02,  3.03835149e+02,  2.67630529e+02,
        1.00478099e+03,  1.30703761e+03,  3.30839213e+02,  2.15955887e+02,
        1.73645988e+00])

In [19]:
# L2
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2298.6772245586662
MSE: 12629237.961283892
RMSE: 3553.763914680306
MAPE: 1.7545113587834533
R^2: 0.6933609250622659


array([-1.78195028e+01,  2.15602245e+00,  8.38915402e+02,  1.11506774e+02,
       -5.39891280e-03,  3.73860559e+02,  2.82338395e+02, -6.18787805e+02,
        7.18543226e+02,  1.69695920e+03,  1.46547491e+02,  5.79243466e+03,
        2.58043342e+03, -9.90195130e+02, -1.99246062e+02, -1.14672539e+02,
        7.80470636e+01,  1.93246249e-01,  6.18977869e+02, -9.03124603e+02,
        6.25278693e+02,  7.49290016e+02,  3.03539361e+02,  2.65589746e+02,
        1.00387421e+03,  1.30562240e+03,  3.28584318e+02,  2.11652845e+02,
        1.73496170e+00])

# Типа подобрал?

In [20]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [21]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [22]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
# выводим оптимальные значения параметров
#GridSearchCV
ridge_optimal.best_params_

{'alpha': 0.9}

In [23]:
# L1
ridge = Ridge(alpha=0.9).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2298.9739377659616
MSE: 12629597.513173532
RMSE: 3553.814501795716
MAPE: 1.7542736258303826
R^2: 0.6933521951088697


array([-1.79850416e+01,  2.16240144e+00,  8.40169880e+02,  1.11515168e+02,
       -5.39500350e-03,  3.73597234e+02,  2.82321471e+02, -6.32694751e+02,
        7.20257404e+02,  1.69645278e+03,  1.46995280e+02,  5.83596119e+03,
        2.57651562e+03, -9.91919227e+02, -2.02492272e+02, -1.14696144e+02,
        7.79918829e+01,  1.94050843e-01,  6.25102143e+02, -9.03678457e+02,
        6.26248767e+02,  7.50221001e+02,  3.03907324e+02,  2.67538223e+02,
        1.00473262e+03,  1.30686887e+03,  3.30748742e+02,  2.15940293e+02,
        1.73626986e+00])

In [24]:
# L2
lasso = Lasso(alpha=0.9).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2298.4827042344277
MSE: 12629147.246343426
RMSE: 3553.7511514375096
MAPE: 1.755273474146068
R^2: 0.6933631276294764


array([-1.76399838e+01,  2.14887790e+00,  8.37921824e+02,  1.11503083e+02,
       -5.40272715e-03,  3.74130353e+02,  2.82473324e+02, -6.07335384e+02,
        7.16864836e+02,  1.69732257e+03,  1.46151051e+02,  5.73191335e+03,
        2.59043112e+03, -9.88768927e+02, -1.96286064e+02, -1.14669691e+02,
        7.81040913e+01,  1.91404289e-01,  6.14156525e+02, -9.02955853e+02,
        6.24491173e+02,  7.48362209e+02,  3.03374977e+02,  2.63864336e+02,
        1.00310039e+03,  1.30432124e+03,  3.26689502e+02,  2.08194886e+02,
        1.73357216e+00])

# Полиномы

In [25]:
yy = data["price_usd"]
XX = data.drop(["price_usd"], axis=1)

In [26]:
lr = LinearRegression().fit(XX, yy)
lr.coef_

array([-1.81220468e+01,  2.18858726e+00,  8.35720469e+02,  1.09494063e+02,
       -5.47687928e-03,  3.68676615e+02,  2.56272528e+02, -6.13347402e+02,
        7.61107216e+02,  1.69254034e+03,  1.45185053e+02,  6.04636359e+03,
        2.37403743e+03, -9.73205705e+02, -1.97772307e+02, -1.12778004e+02,
        8.41602979e+01, -3.61912858e-01,  5.92447383e+02, -9.19761533e+02,
        5.91326303e+02,  7.41186828e+02,  3.09695279e+02,  2.60535493e+02,
        1.06118213e+03,  1.33028038e+03,  3.21568285e+02,  2.26687831e+02,
        1.86936657e+00])

In [27]:
from sklearn.preprocessing import PolynomialFeatures

In [28]:
# создаем объект, который позволит расширить множество предикторов
pf = PolynomialFeatures(2)  
# добавляем новые предикторы
X_p=pf.fit_transform(XX) 
X_p

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.60000e+01, 2.56000e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        8.30000e+01, 6.88900e+03],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        1.51000e+02, 2.28010e+04],
       ...,
       [1.00000e+00, 5.40000e+01, 1.10800e+03, ..., 1.00000e+00,
        3.69000e+02, 1.36161e+05],
       [1.00000e+00, 5.40000e+01, 1.11100e+03, ..., 1.00000e+00,
        4.90000e+02, 2.40100e+05],
       [1.00000e+00, 5.40000e+01, 1.10600e+03, ..., 1.00000e+00,
        6.32000e+02, 3.99424e+05]])

In [29]:
lr2 = LinearRegression().fit(X_p, yy)
lr2.coef_

array([ 6.64999190e+04,  2.53095255e+03, -1.76076375e+02, -1.79981231e+05,
       -1.67850552e+03,  6.58134448e-01, -4.95094027e+04, -1.24009176e+05,
        5.41906536e+04, -9.49915680e+04, -3.18186231e+05, -5.86332263e+03,
       -4.49046000e+05,  3.61708803e+05,  7.59940173e+04, -2.34112174e+03,
        6.26418785e+03, -5.19027209e+03, -1.13089909e+02, -6.27170234e+04,
        2.41940899e+04, -8.04777220e+04, -6.24877508e+04, -2.38027214e+04,
       -1.62278253e+04, -1.61680031e+05, -2.97792674e+04, -9.10216454e+04,
        5.17733289e+04, -2.21016352e+02,  3.22622689e+00, -7.32963581e-02,
        3.51035650e+00,  8.66251629e-01, -2.34493637e-05, -1.40207208e+00,
       -2.60875850e+02,  2.60624003e+02,  5.72155161e+02,  3.90732727e+01,
       -1.94821076e+00, -7.96352861e+01, -4.09653753e+01, -2.96854935e+00,
       -4.51454135e+00, -9.14809177e-01,  3.74109919e-01,  1.67599085e-01,
       -1.54779261e+01,  3.27044356e+00,  1.51435719e-01,  2.10740889e+00,
       -6.02145078e+00, -

In [30]:
power = 2

In [31]:
polynomial_regression_classic = LinearRegression().fit(PolynomialFeatures(power).fit_transform(X_train), y_train)
y_predicted = polynomial_regression_classic.predict(PolynomialFeatures(power).fit_transform(X_test))
print(f"MAE: {mean_absolute_error(y_pred, y_test)}",
      f'RMSE: {mean_squared_error(y_pred, y_test)}',
      f'MSE: {mean_squared_error(y_pred, y_test)**0.5}',
      f'MAPE: {mean_absolute_percentage_error(y_pred, y_test)}',
      f'R^2: {polynomial_regression_classic.score(PolynomialFeatures(power).fit_transform(X_test), y_test)}',
      sep='\n')

MAE: 2298.4827042344277
RMSE: 12629147.246343426
MSE: 3553.7511514375096
MAPE: 1.3276423770941557
R^2: 0.8574368006093522
