## Задача 1

Реализовать класс для работы с линейной регрессией

Используя датасет про автомобили (целевой признак — price), сравнить (качество, скорость обучения и предсказания, важность признаков) модели `MyLinearRegression` с различными гиперпараметрами, сделать выводы. На этом же датасете сравнить модель `MyLinearRegression` с библиотечной реализацией из `sklearn`, составить таблицу(ы) (графики) результатов сравнения (качество, скорость обучения и предсказания, важность признаков).

In [20]:
import pandas as pd
import numpy as np
import warnings

class MyLinearRegression:
    """
    Parameters
    ----------
    regularization : {None, 'l1', 'l2', 'l1l2'}, default=None
        Какую регуляризацию добавить к модели. Если значение `None`, то без регуляризации.

    weight_calc : {'matrix', 'gd', 'sgd'}, default='matrix'
        Каким образом вычислять вектор весов: матрично ('matrix'), градиентным спуском ('gd') или стохастическим градиентным спуском ('sgd'). При этом, при 'l1' или 'l1l2' нельзя использовать параметр 'matrix'.

    Attributes
    ----------
    coefs_ : Вектор коэффициентов размера (p, 1), где p — количество признаков.
    intercept_ : Значение коэффициента, отвечающего за смещение
    """

    def __init__(self, regularization=None, weight_calc='matrix', lambda_1=None, lambda_2=None, batch_size=20):
        if regularization not in [None, 'l1', 'l2', 'l1l2']:
            raise TypeError(f"Параметр regularization не может принимать значение '{regularization}'")
        if weight_calc not in ['matrix', 'gd', 'sgd']:
            raise TypeError(f"Параметр weight_calc не может принимать значение '{weight_calc}'")
        if regularization in ['l1', 'l1l2'] and lambda_1 is None:
            raise TypeError(f"Значение коэффициента регуляризации l1 не задано")
        if regularization in ['l2', 'l1l2'] and lambda_2 is None:
            raise TypeError(f"Значение коэффициента регуляризации l2 не задано")
        if regularization in ['l1', 'l1l2'] and weight_calc == 'matrix':
            raise TypeError("При 'l1' или 'l1l2' нельзя использовать параметр 'matrix'")

        self.regularization = regularization
        self.weight_calc = weight_calc
        self.lambda_1 = lambda_1 or 0
        self.lambda_2 = lambda_2 or 0
        self.batch_size = batch_size
        self.coefs_ = None
        self.intercept_ = None
        self.X_mean = None

    def fit(self, X, y):
        X = X.values
        y = y.values.ravel()

        self.X_mean = np.mean(X, axis=0)
        X_centered = X - self.X_mean

        X_with_intercept = np.hstack([np.ones((X.shape[0], 1)), X_centered])


        if self.weight_calc == 'matrix':
            if self.regularization == 'l2':
                n_features = X.shape[1]
                lambda_mat = self.lambda_2 * np.eye(n_features + 1)
                lambda_mat[0, 0] = 0
                theta = np.linalg.inv(X_with_intercept.T @ X_with_intercept + lambda_mat) @ X_with_intercept.T @ y

            else:
                theta = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ X_with_intercept.T @ y


        elif self.weight_calc == 'gd':
            theta = self._gradient_descent(X_with_intercept, y, stochastic=False)


        elif self.weight_calc == 'sgd':
            theta = self._gradient_descent(X_with_intercept, y, stochastic=True)


        self.intercept_ = theta[0]
        self.coefs_ = theta[1:]

        return self

    def _gradient_descent(self, X, y, stochastic=False, learning_rate=0.01, max_iter=10_000, tol=1e-6):
        n_samples, n_features = X.shape
        theta = np.zeros(n_features)

        for i in range(max_iter):
            prev_theta = theta.copy()

            if stochastic:
                indices = np.random.choice(n_samples, min(self.batch_size, n_samples), replace=False)
                X_batch = X[indices]
                y_batch = y[indices]


            else:
                X_batch = X
                y_batch = y


            y_pred = X_batch @ theta

            grad = (1/len(y_batch)) * X_batch.T @ (y_pred - y_batch)

            if self.regularization == 'l1':
                grad[1:] += self.lambda_1 * np.sign(theta[1:])

            elif self.regularization == 'l2':
                grad[1:] += self.lambda_2 * theta[1:]

            elif self.regularization == 'l1l2':
                grad[1:] += self.lambda_1 * np.sign(theta[1:]) + self.lambda_2 * theta[1:]


            theta -= learning_rate * grad


            if np.linalg.norm(theta - prev_theta) < tol:
                print(f"Градиентный спуск сошелся за {i+1} итераций")
                break


        return theta


    def predict(self, X, ss=True):

        if ss and self.X_mean is not None:
            X_scaled = X - self.X_mean

        else:
            X_scaled = X

        predictions = self.intercept_ + X_scaled @ self.coefs_

        return predictions


    def score(self, X, y):
        y_pred = self.predict(X)
        ss_tot = np.sum((y - np.mean(y))**2)
        ss_res = np.sum((y - y_pred)**2)
        r2 = 1 - ss_res / ss_tot

        return r2


In [38]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import seaborn as sns


df = pd.read_csv('/content/drive/MyDrive/Учеба/Мага/Мат стат и машобуч (Бойцев+Волчек)/Лаба 4/Used_fiat_500_in_Italy_dataset.csv')
print(df.head())

X = df.drop('price', axis=1).select_dtypes(include=[np.number]).fillna(0)
y = df['price'].fillna(df['price'].mean())
print(X.head())


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


configs = [
    {'name': 'Matrix (no reg)', 'regularization': None, 'weight_calc': 'matrix'},
    {'name': 'Matrix L2', 'regularization': 'l2', 'weight_calc': 'matrix', 'lambda_2': 0.1},
    {'name': 'GD (no reg)', 'regularization': None, 'weight_calc': 'gd'},
    {'name': 'SGD (no reg)', 'regularization': None, 'weight_calc': 'sgd', 'batch_size': 32},
    {'name': 'L2 GD (λ=0.1)', 'regularization': 'l2', 'weight_calc': 'gd', 'lambda_2': 0.1},
    {'name': 'L1 GD (λ=0.1)', 'regularization': 'l1', 'weight_calc': 'gd', 'lambda_1': 0.1},
    {'name': 'L1L2 GD (λ=0.1)', 'regularization': 'l1l2', 'weight_calc': 'gd', 'lambda_1': 0.1, 'lambda_2': 0.1},
]

results = []


print("Тест самописной регрессии")
for config in configs:
    print(f"\nТестируем: {config['name']}")

    model_params = {k: v for k, v in config.items() if k != 'name'}

    model = MyLinearRegression(**model_params)
    start_time = time.time()
    model.fit(pd.DataFrame(X_train_scaled), pd.DataFrame(y_train))
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred_train = model.predict(X_train_scaled)
    train_pred_time = time.time() - start_time

    start_time = time.time()
    y_pred_test = model.predict(X_test_scaled)
    test_pred_time = time.time() - start_time

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    feature_importance = np.abs(model.coefs_)

    results.append({
        'Параметры': config['name'],
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Время обучения': train_time,
        'Время на предикт трейна': train_pred_time,
        'Время на предикт теста': test_pred_time,
        'Коэффициенты модели': feature_importance,
        'Type': 'MyLinearRegression'
    })


print("Тестирование sklearn")
sklearn_configs = [
    {'name': 'sklearn LinearRegression', 'model': LinearRegression()},
    {'name': 'sklearn Ridge (α=0.1)', 'model': Ridge(alpha=0.1)},
    {'name': 'sklearn Lasso (α=0.1)', 'model': Lasso(alpha=0.1)},
    {'name': 'sklearn ElasticNet (α=0.1)', 'model': ElasticNet(alpha=0.1, l1_ratio=0.5)},
]

for config in sklearn_configs:
    print(f"\nТестируем: {config['name']}")

    model = config['model']
    start_time = time.time()
    model.fit(X_train_scaled, y_train)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred_train = model.predict(X_train_scaled)
    train_pred_time = time.time() - start_time

    start_time = time.time()
    y_pred_test = model.predict(X_test_scaled)
    test_pred_time = time.time() - start_time

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    feature_importance = np.abs(model.coef_)

    results.append({
        'Параметры': config['name'],
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Время обучения': train_time,
        'Время на предикт трейна': train_pred_time,
        'Время на предикт теста': test_pred_time,
        'Коэффициенты модели': feature_importance,
        'Type': 'MyLinearRegression'
    })


results_df = pd.DataFrame(results)


    model  engine_power transmission  age_in_days      km  previous_owners  \
0     pop            69       manual         4474   56779                2   
1  lounge            69       manual         2708  160000                1   
2  lounge            69    automatic         3470  170000                2   
3   sport            69       manual         3288  132000                2   
4   sport            69       manual         3712  124490                2   

         lat       lon  price  
0  45.071079   7.46403   4490  
1  45.069679   7.70492   4500  
2  45.514599   9.28434   4500  
3  41.903221  12.49565   4700  
4  45.532661   9.03892   4790  
   engine_power  age_in_days      km  previous_owners        lat       lon
0            69         4474   56779                2  45.071079   7.46403
1            69         2708  160000                1  45.069679   7.70492
2            69         3470  170000                2  45.514599   9.28434
3            69         3288  132000   

In [23]:
results_df

Unnamed: 0,Параметры,Train_R2,Test_R2,Время обучения,Время на предикт трейна,Время на предикт теста,Коэффициенты модели,Type
0,Matrix (no reg),0.83882,0.921042,0.002985,3.6e-05,1.8e-05,"[56.19383222631059, 1640.008221500234, 505.360...",MyLinearRegression
1,Matrix L2,0.83882,0.9211,0.002938,3.5e-05,1.8e-05,"[56.04008727366819, 1638.9945349651439, 505.92...",MyLinearRegression
2,GD (no reg),0.83882,0.921042,0.297851,7.1e-05,2e-05,"[56.19381558348828, 1640.007962084021, 505.360...",MyLinearRegression
3,SGD (no reg),0.838434,0.918029,1.043785,4.3e-05,2.1e-05,"[61.47934615195239, 1655.531874737705, 526.149...",MyLinearRegression
4,L2 GD (λ=0.1),0.833016,0.92683,0.190999,7e-05,2.1e-05,"[18.383602619958925, 1408.815820643983, 608.91...",MyLinearRegression
5,L1 GD (λ=0.1),0.83882,0.921038,0.334919,6.7e-05,2e-05,"[56.066952454227334, 1640.0156952940038, 505.1...",MyLinearRegression
6,L1L2 GD (λ=0.1),0.833005,0.926821,0.225048,5e-05,2e-05,"[18.26778115523371, 1408.7806961277747, 608.78...",MyLinearRegression
7,sklearn LinearRegression,0.83882,0.921042,0.006079,0.000636,0.000415,"[56.19383222631124, 1640.008221500235, 505.360...",MyLinearRegression
8,sklearn Ridge (α=0.1),0.83882,0.9211,0.004076,0.000517,0.000288,"[56.04008727366892, 1638.994534965148, 505.920...",MyLinearRegression
9,sklearn Lasso (α=0.1),0.83882,0.921038,0.00351,0.000575,0.000353,"[56.068334320458405, 1640.0250284716735, 505.1...",MyLinearRegression


In [37]:
feature_names = X.columns

for idx in results_df.index:
    model_name = results_df.loc[idx, 'Параметры']
    importance = results_df.loc[idx, 'Коэффициенты модели']
    top_features = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)

    print(f"\n{model_name}:")
    print(top_features.to_string(index=False))


Matrix (no reg):
        Feature  Importance
    age_in_days 1640.008222
             km  505.360312
            lon  268.177163
            lat  227.507194
   engine_power   56.193832
previous_owners   26.413148

Matrix L2:
        Feature  Importance
    age_in_days 1509.909292
             km  570.068756
            lon  233.117950
            lat  187.618548
   engine_power   35.559165
previous_owners   24.449981

GD (no reg):
        Feature  Importance
    age_in_days 1640.007962
             km  505.360590
            lon  268.177483
            lat  227.507481
   engine_power   56.193816
previous_owners   26.413198

SGD (no reg):
        Feature  Importance
    age_in_days 1657.785194
             km  501.584829
            lon  244.682955
            lat  216.738443
   engine_power   48.933028
previous_owners   12.759790

L2 GD (λ=0.1):
        Feature  Importance
    age_in_days 1408.815821
             km  608.914721
            lon  204.364085
            lat  156.758816
p

## Задача 2

[Соревнование на Kaggle](https://kaggle.com/competitions/yadro-regression-2025)