# Выделение целевого признака

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv('../data/dataset.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
data

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


In [4]:
data['price_usd'].value_counts()

1500.00    637
3500.00    568
2000.00    561
1000.00    552
2500.00    546
          ... 
6053.23      1
9130.00      1
8661.20      1
4097.51      1
5666.00      1
Name: price_usd, Length: 2677, dtype: int64

In [5]:
y = data["price_usd"]
X = data.drop(["price_usd"], axis=1)

In [6]:
y

0        10900.00
1         5000.00
2         2800.00
3         9999.00
4         2134.11
           ...   
38526     2750.00
38527     4800.00
38528     4300.00
38529     4000.00
38530     3200.00
Name: price_usd, Length: 38531, dtype: float64

In [7]:
X

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


## Разделение данных

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30824, 29), (30824,), (7707, 29), (7707,))

# Линейная регрессия

In [11]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt

In [12]:
lr = LinearRegression().fit(X_train, y_train)

In [13]:
lr.predict(X_test)

array([ 3440.43543096, 13952.23495258, 10738.65075442, ...,
       13126.25496241,  6481.5497847 ,  7419.55273762])

In [14]:
y_pred = lr.predict(X_test)

In [15]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 2274.756875105806
MSE: 12959036.2829883
RMSE: 3599.866147926656
MAPE: 1.7740119219313892
R^2: 0.6936141850330264


In [16]:
len(lr.coef_)
lr.coef_

array([-1.81862777e+01,  2.23100633e+00,  8.07824251e+02,  1.15157705e+02,
       -5.44839728e-03,  3.69978942e+02,  2.58081233e+02, -5.77576816e+02,
        7.70689549e+02,  1.68323731e+03,  1.49196834e+02,  6.33185922e+03,
        2.26102934e+03, -9.75097165e+02, -2.07285569e+02, -1.13864576e+02,
        8.42662614e+01, -4.20950643e-01,  6.04500233e+02, -9.28245751e+02,
        5.83776637e+02,  6.97591329e+02,  3.02216065e+02,  2.87101165e+02,
        1.00747841e+03,  1.31315917e+03,  3.58797872e+02,  2.23236304e+02,
        1.96875440e+00])

In [17]:
# L1
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2274.781344271028
MSE: 12958802.87893849
RMSE: 3599.833729346189
MAPE: 3.148109857324376
R^2: 0.6936197033206873


array([-1.81500085e+01,  2.22943133e+00,  8.07838278e+02,  1.15161596e+02,
       -5.44891119e-03,  3.70022378e+02,  2.58237173e+02, -5.77439062e+02,
        7.70333893e+02,  1.68321743e+03,  1.49169231e+02,  6.31161290e+03,
        2.26685407e+03, -9.75057771e+02, -2.07019323e+02, -1.13877822e+02,
        8.42756517e+01, -4.22017559e-01,  6.04584459e+02, -9.28426105e+02,
        5.83794094e+02,  6.97450154e+02,  3.02299143e+02,  2.86980965e+02,
        1.00742817e+03,  1.31293521e+03,  3.58667367e+02,  2.23224615e+02,
        1.96849189e+00])

In [18]:
# L2
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2274.5463594885
MSE: 12958648.43434472
RMSE: 3599.812277653478
MAPE: 3.1494405290942766
R^2: 0.6936233547984436


array([-1.79572851e+01,  2.22201780e+00,  8.06691686e+02,  1.15157527e+02,
       -5.45327361e-03,  3.70318512e+02,  2.58211012e+02, -5.63536370e+02,
        7.68714901e+02,  1.68374260e+03,  1.48699652e+02,  6.25555077e+03,
        2.27459872e+03, -9.73252181e+02, -2.03556736e+02, -1.13869771e+02,
        8.43316559e+01, -4.22859905e-01,  5.98655535e+02, -9.27868036e+02,
        5.82868799e+02,  6.96388170e+02,  3.01945750e+02,  2.84971440e+02,
        1.00652385e+03,  1.31146610e+03,  3.56380180e+02,  2.18919826e+02,
        1.96670270e+00])

# Типа подобрал?

In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [20]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [21]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
# выводим оптимальные значения параметров
#GridSearchCV
ridge_optimal.best_params_

{'alpha': 0.9}

In [34]:
# L1
ridge = Ridge(alpha=0.9).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2274.8007641298805
MSE: 12958620.482620075
RMSE: 3599.8083952649586
MAPE: 3.1488966905922404
R^2: 0.6936240156509752


array([-1.81211417e+01,  2.22817792e+00,  8.07849345e+02,  1.15164696e+02,
       -5.44932075e-03,  3.70056953e+02,  2.58361881e+02, -5.77328927e+02,
        7.70049529e+02,  1.68320134e+03,  1.49147235e+02,  6.29553116e+03,
        2.27147312e+03, -9.75026210e+02, -2.06807348e+02, -1.13888375e+02,
        8.42831210e+01, -4.22863849e-01,  6.04651638e+02, -9.28569197e+02,
        5.83807947e+02,  6.97337756e+02,  3.02365572e+02,  2.86885285e+02,
        1.00738795e+03,  1.31275636e+03,  3.58563482e+02,  2.23215144e+02,
        1.96828296e+00])

In [23]:
y_pred

array([ 3440.09844822, 13951.98867974, 10739.65064603, ...,
       13126.76225363,  6480.98583889,  7419.24956133])

In [24]:
y_test

23121     3700.00
7130     11600.00
23511    12500.00
32146    39279.62
14920      800.00
           ...   
1071      6700.00
32659     9437.54
807       9000.00
35666     5500.00
24039     8800.00
Name: price_usd, Length: 7707, dtype: float64

In [25]:
# L2
lasso = Lasso(alpha=0.9).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2274.3852488882194
MSE: 12958413.761743797
RMSE: 3599.7796823894373
MAPE: 3.1513012668181317
R^2: 0.6936289030780003


array([-1.77740911e+01,  2.21482697e+00,  8.05785636e+02,  1.15157384e+02,
       -5.45717467e-03,  3.70590168e+02,  2.58314785e+02, -5.52303960e+02,
        7.67135284e+02,  1.68414684e+03,  1.48301906e+02,  6.19450401e+03,
        2.28545422e+03, -9.71776193e+02, -2.00573670e+02, -1.13873928e+02,
        8.43839714e+01, -4.24387316e-01,  5.93979777e+02, -9.27565864e+02,
        5.82142528e+02,  6.95425642e+02,  3.01729498e+02,  2.83267660e+02,
        1.00576020e+03,  1.31011164e+03,  3.54446027e+02,  2.15466645e+02,
        1.96506134e+00])

# Полиномы

In [26]:
yy = data["price_usd"]
XX = data.drop(["price_usd"], axis=1)

In [27]:
lr = LinearRegression().fit(XX, yy)
lr.coef_

array([-1.81220468e+01,  2.18858726e+00,  8.35720469e+02,  1.09494063e+02,
       -5.47687928e-03,  3.68676615e+02,  2.56272528e+02, -6.13347402e+02,
        7.61107216e+02,  1.69254034e+03,  1.45185053e+02,  6.04636359e+03,
        2.37403743e+03, -9.73205705e+02, -1.97772307e+02, -1.12778004e+02,
        8.41602979e+01, -3.61912858e-01,  5.92447383e+02, -9.19761533e+02,
        5.91326303e+02,  7.41186828e+02,  3.09695279e+02,  2.60535493e+02,
        1.06118213e+03,  1.33028038e+03,  3.21568285e+02,  2.26687831e+02,
        1.86936657e+00])

In [28]:
from sklearn.preprocessing import PolynomialFeatures

In [29]:
# создаем объект, который позволит расширить множество предикторов
pf = PolynomialFeatures(2)  
# добавляем новые предикторы
X_p=pf.fit_transform(XX) 
X_p

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.60000e+01, 2.56000e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        8.30000e+01, 6.88900e+03],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        1.51000e+02, 2.28010e+04],
       ...,
       [1.00000e+00, 5.40000e+01, 1.10800e+03, ..., 1.00000e+00,
        3.69000e+02, 1.36161e+05],
       [1.00000e+00, 5.40000e+01, 1.11100e+03, ..., 1.00000e+00,
        4.90000e+02, 2.40100e+05],
       [1.00000e+00, 5.40000e+01, 1.10600e+03, ..., 1.00000e+00,
        6.32000e+02, 3.99424e+05]])

In [30]:
lr2 = LinearRegression().fit(X_p, yy)
lr2.coef_

array([ 6.64999190e+04,  2.53095255e+03, -1.76076375e+02, -1.79981231e+05,
       -1.67850552e+03,  6.58134448e-01, -4.95094027e+04, -1.24009176e+05,
        5.41906536e+04, -9.49915680e+04, -3.18186231e+05, -5.86332263e+03,
       -4.49046000e+05,  3.61708803e+05,  7.59940173e+04, -2.34112174e+03,
        6.26418785e+03, -5.19027209e+03, -1.13089909e+02, -6.27170234e+04,
        2.41940899e+04, -8.04777220e+04, -6.24877508e+04, -2.38027214e+04,
       -1.62278253e+04, -1.61680031e+05, -2.97792674e+04, -9.10216454e+04,
        5.17733289e+04, -2.21016352e+02,  3.22622689e+00, -7.32963581e-02,
        3.51035650e+00,  8.66251629e-01, -2.34493637e-05, -1.40207208e+00,
       -2.60875850e+02,  2.60624003e+02,  5.72155161e+02,  3.90732727e+01,
       -1.94821076e+00, -7.96352861e+01, -4.09653753e+01, -2.96854935e+00,
       -4.51454135e+00, -9.14809177e-01,  3.74109919e-01,  1.67599085e-01,
       -1.54779261e+01,  3.27044356e+00,  1.51435719e-01,  2.10740889e+00,
       -6.02145078e+00, -

In [31]:
power = 2

In [32]:
polynomial_regression_classic = LinearRegression().fit(PolynomialFeatures(power).fit_transform(X_train), y_train)
y_predicted = polynomial_regression_classic.predict(PolynomialFeatures(power).fit_transform(X_test))
print(f"MAE: {mean_absolute_error(y_pred, y_test)}",
      f'RMSE: {mean_squared_error(y_pred, y_test)}',
      f'MSE: {mean_squared_error(y_pred, y_test)**0.5}',
      f'MAPE: {mean_absolute_percentage_error(y_pred, y_test)}',
      f'R^2: {polynomial_regression_classic.score(PolynomialFeatures(power).fit_transform(X_test), y_test)}',
      sep='\n')

MAE: 2274.3852488882194
RMSE: 12958413.761743797
MSE: 3599.7796823894373
MAPE: 1.5270895587486437
R^2: 0.8462805890794112
