загрузите датасет для регрессии, выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку;
решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
вычислите значения метрик  R2 , MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель;
самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2);
самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента);
обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from category_encoders.binary import BinaryEncoder
from math import sqrt

In [21]:
import warnings 
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('../data/regression/moldova_cars_task.csv')
data = data.drop(["Model", "Make"], axis=1) #
data = data.drop_duplicates()
for column in list(data):
    if data[column].dtype == 'float64':
        vec = data[column]
        vecnonan = vec[np.isfinite(vec)]
        avg = round(sum(vecnonan)/len(vecnonan))

        newColumns = list(map(lambda x: avg if np.isnan(x) else x, list(data[column])))
        data[column] = newColumns
data = data.dropna()
data.reset_index(drop= True, inplace= True)
data

Unnamed: 0,Year,Style,Distance,Engine_capacity(cm3),Fuel_type,Transmission,Price(euro)
0,2011.0,Hatchback,195000.0,1800.0,Hybrid,Automatic,7750.0
1,2014.0,Universal,135000.0,1500.0,Diesel,Manual,8550.0
2,1998.0,Hatchback,1.0,1400.0,Petrol,Manual,2200.0
3,2012.0,Universal,110000.0,1500.0,Diesel,Manual,6550.0
4,2006.0,Universal,200000.0,1600.0,Metan/Propan,Manual,4100.0
...,...,...,...,...,...,...,...
36957,2002.0,Crossover,225000.0,1800.0,Metan/Propan,Manual,4400.0
36958,2015.0,Universal,89000.0,1500.0,Diesel,Manual,7000.0
36959,2009.0,Hatchback,225.0,1500.0,Diesel,Manual,4500.0
36960,2006.0,Combi,370000.0,2000.0,Diesel,Manual,4000.0


In [4]:
label = LabelEncoder()
bn = BinaryEncoder()

label.fit(data.Transmission)
data.Transmission = label.transform(data.Transmission)

df_bn = bn.fit_transform(data.Style)
data = data.drop(["Style"], axis=1)
data = data.join(df_bn)

data = pd.get_dummies(data, columns = ['Fuel_type'])
data.to_csv('../data/regression/moldova_cars_task_preprocessed.csv')
data

Unnamed: 0,Year,Distance,Engine_capacity(cm3),Transmission,Price(euro),Style_0,Style_1,Style_2,Style_3,Fuel_type_Diesel,Fuel_type_Electric,Fuel_type_Hybrid,Fuel_type_Metan/Propan,Fuel_type_Petrol,Fuel_type_Plug-in Hybrid
0,2011.0,195000.0,1800.0,0,7750.0,0,0,0,1,0,0,1,0,0,0
1,2014.0,135000.0,1500.0,1,8550.0,0,0,1,0,1,0,0,0,0,0
2,1998.0,1.0,1400.0,1,2200.0,0,0,0,1,0,0,0,0,1,0
3,2012.0,110000.0,1500.0,1,6550.0,0,0,1,0,1,0,0,0,0,0
4,2006.0,200000.0,1600.0,1,4100.0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36957,2002.0,225000.0,1800.0,1,4400.0,0,1,1,1,0,0,0,1,0,0
36958,2015.0,89000.0,1500.0,1,7000.0,0,0,1,0,1,0,0,0,0,0
36959,2009.0,225.0,1500.0,1,4500.0,0,0,0,1,1,0,0,0,0,0
36960,2006.0,370000.0,2000.0,1,4000.0,1,0,1,0,1,0,0,0,0,0


In [5]:
df = pd.read_csv('../data/regression/moldova_cars_task_preprocessed.csv', index_col=0)
y = df["Price(euro)"]
X = df.drop(["Price(euro)"], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((29569, 14), (29569,), (7393, 14), (7393,))

In [25]:
lr = LinearRegression().fit(X_train, y_train)
lr.predict(X_test)

array([19699.43559586,  2986.92420007,  4887.43779728, ...,
       22799.48716161, 14898.77081445,  6816.78226517])

In [26]:
y_pred = lr.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 4275.279965729946
MSE: 65254221.787827715
RMSE: 8078.008528581021
MAPE: 1.3674403271034203
R^2: 0.38117033756274477


In [27]:
parameters = {'alpha': np.arange(0, 1, 0.1)}
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.9}

In [28]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.9}

In [29]:
ridge = Ridge(alpha=0.9).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {ridge.score(X_test, y_test)}')

MAE: 4275.216916948012
MSE: 65254441.28230209
RMSE: 8078.022114496969
MAPE: 1.8697783922275357
R^2: 0.38116825601632887


In [30]:
lasso_optimal = RandomizedSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_optimal.best_params_

{'alpha': 0.4}

In [31]:
lasso = Lasso(alpha=0.4).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lasso.score(X_test, y_test)}')

MAE: 4274.910777687076
MSE: 65254579.69294875
RMSE: 8078.030681604815
MAPE: 1.8697808086643835
R^2: 0.38116694341752066


In [32]:
poly = PolynomialFeatures(2)
X_p=poly.fit_transform(X_train)
X_test_poly=poly.fit_transform(X_test)
lr2 = LinearRegression().fit(X_p, y_train)

y_pred = lr2.predict(X_test_poly)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lr2.score(X_test_poly, y_test)}')

MAE: 4779.744276144833
MSE: 78575400.86817998
RMSE: 8864.27666920319
MAPE: 2.246217152973949
R^2: 0.2548407219807176


In [33]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_p, y_train)
ridge_optimal.best_params_

{'alpha': 0.4}

In [34]:
ridge = Ridge(alpha=0.4).fit(X_p, y_train)
y_pred = ridge.predict(X_test_poly)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test_poly, y_test)}')

MAE: 3637.2329738081467
MSE: 51995699.422415406
RMSE: 7210.804353358605
MAPE: 1.186708466011863
R^2: 0.5069057565902289


In [35]:
lasso_optimal = RandomizedSearchCV(Lasso(), parameters).fit(X_p, y_train)
lasso_optimal.best_params_

{'alpha': 0.9}

In [36]:
lasso = Lasso(alpha=0.9).fit(X_p, y_train)
y_pred = lasso.predict(X_test_poly)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test_poly, y_test)}')

MAE: 4105.459807340926
MSE: 61697966.59482976
RMSE: 7854.805318709673
MAPE: 1.232718986312811
R^2: 0.414895606830061
