In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

Загрузим датасет

In [2]:
diamonds_data = pd.read_csv("../data/diamonds/diamonds_preprocessed.csv", index_col = 0)
diamonds_data.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1,0,...,0,1,0,0,0,1,0,0,0,0


Отделим признаки от цены

In [3]:
y = diamonds_data["price"]
diamonds_data.drop('price', axis =1, inplace = True)
X = diamonds_data
X

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,3.95,3.98,2.43,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,3.89,3.84,2.31,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,4.05,4.07,2.31,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,4.20,4.23,2.63,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,4.34,4.35,2.75,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53936,0.72,60.8,57.0,5.75,5.76,3.50,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
53937,0.72,63.1,55.0,5.69,5.75,3.61,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
53938,0.70,62.8,60.0,5.66,5.68,3.56,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
53939,0.86,61.0,58.0,6.15,6.12,3.74,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


Разделим датасет на обучающую и тестовую выборки

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Хелпер

In [5]:
def print_quality(score, mse, rmse, mae): 
    print(f"Коэффициент детерминации: {score}")
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')

# Линейная регрессия 
##### (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [6]:
lr = LinearRegression().fit(X_train, y_train)
lr_predict = lr.predict(X_train)
print_quality(lr.score(X, y), mean_squared_error(y_train, lr_predict), mean_squared_error(y_train, lr_predict, squared=False),
             mean_absolute_error(y_train, lr_predict))

Коэффициент детерминации: 0.920073459453038
MSE: 1274094.8050754203
RMSE: 1128.7580808461219
MAE: 737.7622774480712


# DTR
##### (https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

In [12]:
params = {'max_depth': np.arange(1, 20, 1),
          'max_features': np.arange(1, 25, 1)}
dtr = GridSearchCV(DecisionTreeRegressor(random_state=0), params).fit(X_train, y_train)
dtr.best_params_

{'max_depth': 13, 'max_features': 24}

In [13]:
dtr_predict = dtr.predict(X_train)
print_quality(dtr.score(X, y), mean_squared_error(y_train, dtr_predict), mean_squared_error(y_train, dtr_predict, squared=False),
             mean_absolute_error(y_train, dtr_predict))

Коэффициент детерминации: 0.9822741864541806
MSE: 215307.49770780746
RMSE: 464.01238960593224
MAE: 242.9803188057866


# Lasso
##### (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso)

Уберем варнинги, которые появляются при обучении модели

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
params = {'alpha': np.arange(0.1, 3, 0.05)}
lasso = GridSearchCV(Lasso(random_state=0), params).fit(X_train, y_train)
lasso.best_params_

{'alpha': 1.5500000000000005}

In [26]:
lasso_predict = lasso.predict(X_train)
print_quality(lasso.score(X, y), mean_squared_error(y_train, lasso_predict), 
              mean_squared_error(y_train, lasso_predict, squared=False),
             mean_absolute_error(y_train, lasso_predict))

Коэффициент детерминации: 0.9200308229828267
MSE: 1275145.0659229655
RMSE: 1129.2232135069512
MAE: 736.2401217333946


# Ridge
##### (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)

In [27]:
params = {'alpha': np.arange(0.1, 4, 0.05)}
ridge = GridSearchCV(Ridge(random_state=0), params).fit(X_train, y_train)
ridge.best_params_

{'alpha': 3.9500000000000015}

In [28]:
ridge_predict = ridge.predict(X_train)
print_quality(ridge.score(X, y), mean_squared_error(y_train, ridge_predict), 
              mean_squared_error(y_train, ridge_predict, squared=False),
             mean_absolute_error(y_train, ridge_predict))

Коэффициент детерминации: 0.9200841753064984
MSE: 1273985.1860637933
RMSE: 1128.7095224475574
MAE: 738.6833448378159


# ElasticNet
##### (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html)

In [31]:
params = {'alpha': np.arange(0.1, 3, 0.05), 'l1_ratio': np.arange(0.1, 1.1, 0.1)}
elastic = GridSearchCV(ElasticNet(random_state=0), params).fit(X_train, y_train)
elastic.best_params_

{'alpha': 1.5500000000000005, 'l1_ratio': 1.0}

In [32]:
elastic_predict = elastic.predict(X_train)
print_quality(elastic.score(X, y), mean_squared_error(y_train, elastic_predict), 
              mean_squared_error(y_train, elastic_predict, squared=False),
             mean_absolute_error(y_train, elastic_predict))

Коэффициент детерминации: 0.9200308229828267
MSE: 1275145.0659229655
RMSE: 1129.2232135069512
MAE: 736.2401217333946


# Итог

DTR оказался самым эффективным регрессором.