## Задача 1

Реализовать класс для работы с линейной регрессией

In [None]:
import pandas as pd
import numpy as np

class MyLinearRegression:
    """Linear regression with optional L1 / L2 / elastic-net regularization.

    Parameters
    ----------
    regularization : {None, 'l1', 'l2', 'l1l2'}, default=None
        Penalty added to the model. ``None`` means no regularization.

    weight_calc : {'matrix', 'gd', 'sgd'}, default='matrix'
        How the weight vector is computed: closed-form normal equations
        ('matrix'), full-batch gradient descent ('gd') or mini-batch
        stochastic gradient descent ('sgd'). 'matrix' cannot be combined
        with 'l1' or 'l1l2'.

    lambda_1 : float or None, default=None
        L1 penalty strength; required for 'l1' and 'l1l2'.

    lambda_2 : float or None, default=None
        L2 penalty strength; required for 'l2' and 'l1l2'. In 'matrix' mode
        it is added as ``lambda_2 * I`` to ``X.T @ X``; in 'gd'/'sgd' it
        contributes ``2 * lambda_2 * w`` to the gradient of the *mean*
        squared error, so the same value is not equivalent across modes.

    batch_size : int, default=20
        Mini-batch size for 'sgd'.

    random_state : int or None, default=42
        Seed for weight initialisation and for shuffling in 'sgd'.

    max_iter : int, default=1000
        Number of gradient steps ('gd') or epochs ('sgd').

    learning_rate : float, default=0.01
        Step size for 'gd' and 'sgd'.

    early_stopping : float, default=0.0001
        Training stops once the gradient norm drops below this value.

    Attributes
    ----------
    coefs_ : ndarray of shape (p,), where p is the number of features.
    intercept_ : bias term (never regularized).
    """

    def __init__(
        self,
        regularization=None,
        weight_calc='matrix',
        lambda_1=None,
        lambda_2=None,
        batch_size=20,
        random_state=42,
        max_iter=1000,
        learning_rate=0.01,
        early_stopping=0.0001,
    ):
        # TypeError is kept (rather than ValueError) so existing callers that
        # catch it keep working.
        if regularization not in [None, 'l1', 'l2', 'l1l2']:
            raise TypeError(f"Параметр regularization не может принимать значение '{regularization}'")
        if weight_calc not in ['matrix', 'gd', 'sgd']:
            raise TypeError(f"Параметр weight_calc не может принимать значение '{weight_calc}'")
        if regularization in ['l1', 'l1l2'] and lambda_1 is None:
            raise TypeError(f"Значение коэффициента регулризации l1 не задано")
        if regularization in ['l2', 'l1l2'] and lambda_2 is None:
            raise TypeError(f"Значение коэффициента регулризации l2 не задано")
        if regularization in ['l1', 'l1l2'] and weight_calc == 'matrix':
            raise TypeError(f"Параметр weight_calc не может принимать значение 'matrix' при регуляризации'{regularization}'")

        self.regularization = regularization
        self.weight_calc = weight_calc
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.batch_size = batch_size
        self.random_state = random_state
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.early_stopping = early_stopping

        self.coefs_ = None
        self.intercept_ = None

    @staticmethod
    def _as_target(y):
        """Convert a target to a float array, flattening an (n, 1) column to (n,)."""
        y = np.asarray(y, dtype=float)
        if y.ndim == 2 and y.shape[1] == 1:
            # Without this, `y - X @ theta` broadcasts (n, 1) against (n,) into (n, n).
            y = y.ravel()
        return y

    def _gradient(self, X_part, y_part, theta):
        """Return the gradient of the penalized mean squared error on one batch."""
        gradient = -(2 / X_part.shape[0]) * X_part.T @ (y_part - X_part @ theta)
        # Index 0 is the intercept and is deliberately left unpenalized.
        if self.regularization in ('l1', 'l1l2'):
            gradient[1:] += self.lambda_1 * np.sign(theta[1:])
        if self.regularization in ('l2', 'l1l2'):
            gradient[1:] += 2 * self.lambda_2 * theta[1:]
        return gradient

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """Estimate the intercept and coefficients from training data.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n, p)
        y : Series, single-column DataFrame or array-like of length n

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X and y contain a different number of rows.
        """
        X = np.asarray(X, dtype=float)
        y = self._as_target(y)
        n_samples = X.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        # Prepend a column of ones so theta[0] plays the role of the intercept.
        design = np.hstack((np.ones((n_samples, 1)), X))

        # A private generator yields the same stream as np.random.seed(...)
        # without resetting the caller's global RNG state.
        rng = np.random.RandomState(self.random_state)
        theta = rng.uniform(-0.01, 0.01, design.shape[1])

        if self.weight_calc == 'matrix':
            gram = design.T @ design
            if self.regularization == 'l2':
                penalty = np.eye(design.shape[1])
                penalty[0, 0] = 0  # do not shrink the intercept
                gram = gram + self.lambda_2 * penalty
            # Solving the linear system is more stable than forming the inverse.
            theta = np.linalg.solve(gram, design.T @ y)

        elif self.weight_calc == 'gd':
            for _ in range(self.max_iter):
                gradient = self._gradient(design, y, theta)
                if np.linalg.norm(gradient) < self.early_stopping:
                    break
                theta -= self.learning_rate * gradient

        elif self.weight_calc == 'sgd':
            converged = False
            for _ in range(self.max_iter):
                order = rng.permutation(n_samples)
                X_shuffled = design[order]
                y_shuffled = y[order]
                # Step through *all* rows: the final (possibly shorter) batch is
                # used too, and a dataset smaller than batch_size forms one batch.
                for start in range(0, n_samples, self.batch_size):
                    stop = start + self.batch_size
                    gradient = self._gradient(
                        X_shuffled[start:stop], y_shuffled[start:stop], theta
                    )
                    if np.linalg.norm(gradient) < self.early_stopping:
                        converged = True
                        break
                    theta -= self.learning_rate * gradient
                if converged:
                    break

        self.intercept_ = theta[0]
        self.coefs_ = theta[1:]
        return self

    def predict(self, X: np.ndarray, ss=True):
        """Return predictions ``X @ coefs_ + intercept_``.

        ``ss`` is unused and kept only for backward compatibility.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.coefs_ is None:
            raise ValueError("Модель должна быть обучена перед предсказанием")
        return X @ self.coefs_ + self.intercept_

    def score(self, X: np.ndarray, y: np.ndarray):
        """Return the coefficient of determination R^2 of the prediction on (X, y)."""
        # Work on plain arrays so the comparison is positional and never
        # depends on pandas index alignment.
        y_true = self._as_target(y)
        residuals = y_true - np.asarray(self.predict(X))
        ss_res = np.sum(residuals ** 2)
        ss_tot = np.sum((y_true - y_true.mean()) ** 2)
        return 1 - (ss_res / ss_tot)


Используя датасет про автомобили (целевой признак — price), сравнить (качество, скорость обучения и предсказания, важность признаков) модели `MyLinearRegression` с различными гиперпараметрами, сделать выводы. На этом же датасете сравнить модель `MyLinearRegression` с библиотечной реализацией из `sklearn`, составить таблицу(ы) (графики) результатов сравнения (качество, скорость обучения и предсказания, важность признаков).

## Загрузка и предобработка данных

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset path
# read in the next cell is available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the used Fiat 500 dataset from the mounted Drive; `price` is the
# regression target according to the task statement.
df = pd.read_csv('/content/drive/MyDrive/ML/Used_fiat_500_in_Italy_dataset.csv')
df.head()

Unnamed: 0,model,engine_power,transmission,age_in_days,km,previous_owners,lat,lon,price
0,pop,69,manual,4474,56779,2,45.071079,7.46403,4490
1,lounge,69,manual,2708,160000,1,45.069679,7.70492,4500
2,lounge,69,automatic,3470,170000,2,45.514599,9.28434,4500
3,sport,69,manual,3288,132000,2,41.903221,12.49565,4700
4,sport,69,manual,3712,124490,2,45.532661,9.03892,4790


In [None]:
# (rows, columns) of the loaded frame.
# NOTE(review): the recorded output (380, 11) disagrees with df.info() below
# (9 columns) — presumably this cell was executed after the one-hot step;
# re-run the notebook top to bottom to confirm.
df.shape

(380, 11)

In [None]:
# Column dtypes and non-null counts; used to spot the non-numeric columns
# that need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   model            380 non-null    object 
 1   engine_power     380 non-null    int64  
 2   transmission     380 non-null    object 
 3   age_in_days      380 non-null    int64  
 4   km               380 non-null    int64  
 5   previous_owners  380 non-null    int64  
 6   lat              380 non-null    float64
 7   lon              380 non-null    float64
 8   price            380 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 26.8+ KB


Закодируем нечисловые признаки (`model`, `transmission`) с помощью one-hot-кодирования.

In [None]:
# One-hot encode every object column; drop_first=True keeps k-1 dummy columns
# for a feature with k levels (the first level becomes the implicit baseline).
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,engine_power,age_in_days,km,previous_owners,lat,lon,price,model_pop,model_sport,model_star,transmission_manual
0,69,4474,56779,2,45.071079,7.46403,4490,True,False,False,True
1,69,2708,160000,1,45.069679,7.70492,4500,False,False,False,True
2,69,3470,170000,2,45.514599,9.28434,4500,False,False,False,False
3,69,3288,132000,2,41.903221,12.49565,4700,False,True,False,True
4,69,3712,124490,2,45.532661,9.03892,4790,False,True,False,True


Сформируем обучающую и тестовую выборки.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then hold out 20% of the rows
# for testing with a fixed seed so the split is reproducible.
features = df.drop(columns=['price'])
target = df['price']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

Выполним масштабирование признаков с помощью `StandardScaler`.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test rows leaks into preprocessing.
scaler = StandardScaler()
# Keep the original row index: without `index=` the frames get a fresh
# RangeIndex and no longer align with y_train / y_test in pandas operations.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

## Модель MyLinearRegression

In [None]:
import time
from sklearn.metrics import mean_absolute_error
# Per-model metrics: [R^2, MAE, fit time (s), predict time (s)].
results_dict = dict()
# Per-model fitted coefficients, used as a feature-importance proxy
# (features are standardized, so magnitudes are comparable).
importance_of_features = dict()

# Each entry: a display name plus the keyword arguments for MyLinearRegression.
parameters = [{'name': 'MyLR matrix', 'weight_calc':'matrix'},
              {'name': 'MyLR matrix l2', 'weight_calc': 'matrix', 'regularization': 'l2', 'lambda_2': 10},
              {'name': 'MyLR gd', 'weight_calc':'gd'},
              {'name': 'MyLR gd l1', 'weight_calc':'gd', 'regularization': 'l1', 'lambda_1': 10},
              {'name': 'MyLR gd l2', 'weight_calc':'gd', 'regularization': 'l2', 'lambda_2': 0.1},
              {'name': 'MyLR gd l1l2', 'weight_calc':'gd', 'regularization': 'l1l2', 'lambda_1': 0.1, 'lambda_2': 0.1},
              {'name': 'MyLR sgd', 'weight_calc':'sgd'},
              {'name': 'MyLR sgd l1', 'weight_calc':'sgd', 'regularization': 'l1', 'lambda_1': 1},
              {'name': 'MyLR sgd l2', 'weight_calc':'sgd', 'regularization': 'l2', 'lambda_2': 0.01},
              {'name': 'MyLR sgd l1l2', 'weight_calc':'sgd', 'regularization': 'l1l2', 'lambda_1': 0.1, 'lambda_2': 0.01}]

for param in parameters:
    name_model = param['name']
    args = {k: v for k, v in param.items() if k != 'name'}

    # perf_counter() is a monotonic high-resolution clock; time.time() is too
    # coarse for sub-millisecond intervals and can be adjusted by the system.
    start_time_fit = time.perf_counter()
    MyLR = MyLinearRegression(**args).fit(X_train, y_train)
    end_time_fit = time.perf_counter()

    start_time_pred = time.perf_counter()
    y_pred = MyLR.predict(X_test.values)
    end_time_pred = time.perf_counter()

    results_dict[name_model] = np.round(
        [
            MyLR.score(X_test.values, y_test.values),
            mean_absolute_error(y_test, y_pred),
            end_time_fit - start_time_fit,
            end_time_pred - start_time_pred,
        ],
        4,
    )
    importance_of_features[name_model] = np.round(MyLR.coefs_, 3)

## Библиотечные реализации из sklearn

Обучение LinearRegression из sklearn

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso, SGDRegressor
from sklearn.metrics import r2_score
import time

# Baseline: ordinary least squares from sklearn.
# perf_counter() is used because time.time() is too coarse for these
# sub-millisecond fit/predict intervals.
start_time_fit = time.perf_counter()
sklearn_LR = LinearRegression().fit(X_train, y_train)
end_time_fit = time.perf_counter()

start_time_pred = time.perf_counter()
y_pred = sklearn_LR.predict(X_test)
end_time_pred = time.perf_counter()

# Stored as [R^2, MAE, fit time (s), predict time (s)].
results_dict['sklearn LR'] = np.round([r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), end_time_fit-start_time_fit, end_time_pred-start_time_pred], 4)
importance_of_features['sklearn LR'] = np.round(sklearn_LR.coef_, 3)

Обучение Ridge из sklearn

In [None]:
# Ridge with alpha=10, the same strength as lambda_2 in the 'MyLR matrix l2' run.
# perf_counter() is used because time.time() is too coarse for these
# sub-millisecond fit/predict intervals.
start_time_fit = time.perf_counter()
sklearn_Ridge = Ridge(alpha=10, random_state=42).fit(X_train, y_train)
end_time_fit = time.perf_counter()

start_time_pred = time.perf_counter()
y_pred = sklearn_Ridge.predict(X_test)
end_time_pred = time.perf_counter()

# Stored as [R^2, MAE, fit time (s), predict time (s)].
results_dict['sklearn Ridge'] = np.round([r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), end_time_fit-start_time_fit, end_time_pred-start_time_pred], 4)
importance_of_features['sklearn Ridge'] = np.round(sklearn_Ridge.coef_, 3)

Обучение Lasso из sklearn

In [None]:
# Lasso (L1-penalized regression) with alpha=10.
# perf_counter() is used because time.time() is too coarse for these
# sub-millisecond fit/predict intervals.
start_time_fit = time.perf_counter()
sklearn_Lasso = Lasso(alpha=10, random_state=42).fit(X_train, y_train)
end_time_fit = time.perf_counter()

start_time_pred = time.perf_counter()
# Predict on the DataFrame (not `.values`): the model was fitted with feature
# names, and passing a bare array triggers sklearn's feature-name warning.
y_pred = sklearn_Lasso.predict(X_test)
end_time_pred = time.perf_counter()

# Stored as [R^2, MAE, fit time (s), predict time (s)].
results_dict['sklearn Lasso'] = np.round([r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), end_time_fit-start_time_fit, end_time_pred-start_time_pred], 4)
importance_of_features['sklearn Lasso'] = np.round(sklearn_Lasso.coef_,3)



Обучение ElasticNet из sklearn

In [None]:
# Elastic net: L1 and L2 penalties mixed equally (l1_ratio=0.5), alpha=0.1.
# perf_counter() is used because time.time() is too coarse for these
# sub-millisecond fit/predict intervals.
start_time_fit = time.perf_counter()
sklearn_ElasticNet = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42).fit(X_train, y_train)
end_time_fit = time.perf_counter()

start_time_pred = time.perf_counter()
y_pred = sklearn_ElasticNet.predict(X_test)
end_time_pred = time.perf_counter()

# Stored as [R^2, MAE, fit time (s), predict time (s)].
results_dict['sklearn ElasticNet'] = np.round([r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), end_time_fit-start_time_fit, end_time_pred-start_time_pred], 4)
importance_of_features['sklearn ElasticNet'] = np.round(sklearn_ElasticNet.coef_,3)

Обучение SGDRegressor из sklearn

In [None]:
# Linear model trained by stochastic gradient descent.
# penalty='elasticnet' is set explicitly: SGDRegressor only uses l1_ratio with
# that penalty, and the default penalty ('l2') silently ignores l1_ratio=0.5.
# perf_counter() is used because time.time() is too coarse for short intervals.
start_time_fit = time.perf_counter()
sklearn_SGDRegressor = SGDRegressor(penalty='elasticnet', alpha=0.05, l1_ratio=0.5, random_state=42).fit(X_train, y_train)
end_time_fit = time.perf_counter()

start_time_pred = time.perf_counter()
y_pred = sklearn_SGDRegressor.predict(X_test)
end_time_pred = time.perf_counter()

# Stored as [R^2, MAE, fit time (s), predict time (s)].
results_dict['sklearn SGDRegressor'] = np.round([r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), end_time_fit-start_time_fit, end_time_pred-start_time_pred], 4)
importance_of_features['sklearn SGDRegressor'] = np.round(sklearn_SGDRegressor.coef_,3)

## Сравнение результатов

In [None]:
# One row per model; the column order mirrors the list stored for each model
# in results_dict.
metric_columns = ['r2_score', 'MAE', 'train_time', 'pred_time']
resullts_table = pd.DataFrame.from_dict(
    results_dict,
    orient='index',
    columns=metric_columns,
)
resullts_table

Unnamed: 0,r2_score,MAE,train_time,pred_time
MyLR matrix,0.8945,559.2244,0.0007,0.0
MyLR matrix l2,0.8927,561.8715,0.0004,0.0
MyLR gd,0.8944,559.4996,0.0653,0.0001
MyLR gd l1,0.8935,560.4104,0.0613,0.0001
MyLR gd l2,0.8866,575.2567,0.0454,0.0001
MyLR gd l1l2,0.8866,575.2943,0.0535,0.0
MyLR sgd,0.8943,560.2905,0.7098,0.0
MyLR sgd l1,0.8942,560.3863,0.6028,0.0
MyLR sgd l2,0.8938,560.4657,0.6363,0.0001
MyLR sgd l1l2,0.8938,560.4824,0.7899,0.0


In [None]:
# One row per model, one column per feature, holding the fitted coefficients.
# The name is rebound from dict to DataFrame, so guard the conversion: this
# keeps the cell safe to re-run instead of failing on an already-built table.
if isinstance(importance_of_features, dict):
    importance_of_features = pd.DataFrame.from_dict(importance_of_features, orient='index', columns=X_train.columns)
importance_of_features

Unnamed: 0,engine_power,age_in_days,km,previous_owners,lat,lon,model_pop,model_sport,model_star,transmission_manual
MyLR matrix,39.824,-1533.838,-556.685,50.413,194.474,252.261,46.635,54.641,35.701,-3.125
MyLR matrix l2,29.641,-1457.151,-586.439,50.083,166.267,222.38,35.296,43.546,40.796,-5.652
MyLR gd,39.804,-1531.712,-558.592,50.498,192.658,250.528,46.389,54.404,35.842,-3.301
MyLR gd l1,33.677,-1528.96,-550.809,43.651,167.664,224.219,37.894,47.221,32.014,-0.093
MyLR gd l2,12.006,-1334.061,-622.277,48.466,125.954,178.223,17.618,25.705,48.746,-8.963
MyLR gd l1l2,11.95,-1334.029,-622.218,48.407,125.787,178.047,17.543,25.641,48.707,-8.909
MyLR sgd,41.29,-1529.793,-556.645,54.422,192.985,248.331,44.609,56.165,33.49,-0.474
MyLR sgd l1,40.687,-1529.506,-555.884,53.76,190.448,245.656,43.778,55.449,33.19,-0.176
MyLR sgd l2,38.075,-1505.131,-566.753,54.341,183.723,238.612,40.956,52.491,35.172,-1.306
MyLR sgd l1l2,38.015,-1505.102,-566.679,54.276,183.482,238.358,40.873,52.42,35.141,-1.275
