In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#custom lib
import lm_funcs
from importlib import reload
reload(lm_funcs)
from lm_funcs import FeatureCreating
from lm_funcs import ModelsEval
from lm_funcs import S21LinearRegression
from lm_funcs import S21RidgeRegression
from lm_funcs import S21LassoRegression
from lm_funcs import S21ElasticNetRegression
from lm_funcs import S21MinMaxScaler
from lm_funcs import S21StandardScaler

# Data loading

In [2]:
# Load the train and test listings from JSON files under data/ (paths are relative
# to the notebook's working directory). Plain string literals are used because the
# paths contain no placeholders, so an f-string prefix was unnecessary.
df_train = pd.read_json("data/train.json")
df_test = pd.read_json("data/test.json")

# Feature creation

##### Don't use interest_level in the model because it is absent from the test data

In [3]:
# Ordinal-encode interest_level in place: low -> 0, medium -> 1, high -> 2.
# NOTE: the column is overwritten, so re-running this cell on the already-encoded
# values would no longer find the string keys in the mapping.
interest_level_codes = {'low': 0, 'medium': 1, 'high': 2}
df_train['interest_level'] = df_train['interest_level'].map(interest_level_codes)

In [4]:
# Peek at the raw 'features' column before cleaning; each row holds a list of amenity strings.
df_train['features']

4         [Dining Room, Pre-War, Laundry in Building, Di...
6         [Doorman, Elevator, Laundry in Building, Dishw...
9         [Doorman, Elevator, Laundry in Building, Laund...
10                                                       []
15        [Doorman, Elevator, Fitness Center, Laundry in...
                                ...                        
124000              [Elevator, Dishwasher, Hardwood Floors]
124002    [Common Outdoor Space, Cats Allowed, Dogs Allo...
124004    [Dining Room, Elevator, Pre-War, Laundry in Bu...
124008    [Pre-War, Laundry in Unit, Dishwasher, No Fee,...
124009    [Dining Room, Elevator, Laundry in Building, D...
Name: features, Length: 49352, dtype: object

In [5]:
# Overwrite the list-valued 'features' column with the result of column_clearing.
# Judging by the display in the next cell, each row becomes one comma-joined string
# with the spaces inside feature names removed.
df_train['features'] = FeatureCreating.column_clearing(df_train['features'])

In [6]:
# Display the cleaned column to verify the list -> comma-separated string conversion.
df_train['features']

4         DiningRoom,Pre-War,LaundryinBuilding,Dishwashe...
6         Doorman,Elevator,LaundryinBuilding,Dishwasher,...
9         Doorman,Elevator,LaundryinBuilding,LaundryinUn...
10                                                         
15         Doorman,Elevator,FitnessCenter,LaundryinBuilding
                                ...                        
124000                   Elevator,Dishwasher,HardwoodFloors
124002    CommonOutdoorSpace,CatsAllowed,DogsAllowed,NoF...
124004    DiningRoom,Elevator,Pre-War,LaundryinBuilding,...
124008    Pre-War,LaundryinUnit,Dishwasher,NoFee,Outdoor...
124009    DiningRoom,Elevator,LaundryinBuilding,Dishwash...
Name: features, Length: 49352, dtype: object

In [7]:
# Flatten every listing's comma-separated feature string into one list of tokens.
# Empty strings (listings without features) are skipped so they do not contribute
# an empty token. Iterating the column directly avoids iterrows(), which builds a
# full Series per row just to read a single field.
all_features = [
    token
    for feature_string in df_train['features']
    if len(feature_string) > 0
    for token in feature_string.split(",")
]

In [8]:
# Report how many distinct feature tokens occur across the training listings.
print(f"Number of unique values:{len(set(all_features))}")

Number of unique values:1546


In [9]:
# Tally the feature tokens, print a caption, and leave the 20 most frequent
# (token, count) pairs as the cell's displayed value.
feature_counts = Counter(all_features)
print("The 20 most popular feature with counts:")
feature_counts.most_common(20)

The 20 most popular feature with counts:


[('Elevator', 25915),
 ('CatsAllowed', 23540),
 ('HardwoodFloors', 23527),
 ('DogsAllowed', 22035),
 ('Doorman', 20898),
 ('Dishwasher', 20426),
 ('NoFee', 18062),
 ('LaundryinBuilding', 16344),
 ('FitnessCenter', 13252),
 ('Pre-War', 9148),
 ('LaundryinUnit', 8738),
 ('RoofDeck', 6542),
 ('OutdoorSpace', 5268),
 ('DiningRoom', 5136),
 ('HighSpeedInternet', 4299),
 ('Balcony', 2992),
 ('SwimmingPool', 2730),
 ('LaundryInBuilding', 2593),
 ('NewConstruction', 2559),
 ('Terrace', 2283)]

In [10]:
# Keep only the names of the 20 most frequent feature tokens (counts are dropped).
# NOTE(review): tokens are case-sensitive, so 'LaundryinBuilding' and
# 'LaundryInBuilding' both appear in the top 20 as separate features.
top_20_features = [name for name, _ in Counter(all_features).most_common(20)]

In [11]:
# Add one column per top-20 feature to df_train. The return value is not used, and
# df_train is later indexed by these names, so the helper evidently mutates the
# frame in place. Presumably the columns are 0/1 indicators — confirm in lm_funcs.
FeatureCreating.columns_creating(top_20_features, df_train)

##### repeat same for test part

In [12]:
# Apply the same cleaning to the test listings, then create the feature columns
# using the top-20 list derived from the TRAIN data so both frames share columns.
df_test['features'] = FeatureCreating.column_clearing(df_test['features'])
FeatureCreating.columns_creating(top_20_features, df_test)

##### split into X and y, choosing the features for X

In [13]:
# Predictors: the 20 feature columns plus the two room counts; target: price.
# The target is selected with a list so y_* stay single-column DataFrames.
feature_list = [*top_20_features, 'bathrooms', 'bedrooms']
X_train, y_train = df_train[feature_list], df_train[['price']]
X_test, y_test = df_test[feature_list], df_test[['price']]

# Models implementation — Linear regression

##### initializing class with evaluation dfs

In [14]:
# Create the evaluation collector; every model below records its train/test metrics in it.
models_eval = ModelsEval()

##### fit models

In [15]:
# Custom linear regression with its default optimizer, recorded under the "batch" label.
reg = S21LinearRegression().fit(X_train, y_train)
models_eval.insert_eval_in_DF(
    "s21_linear_regression_batch",
    y_train, reg.predict(X_train),
    y_test, reg.predict(X_test),
)

In [16]:
# Custom linear regression with an explicitly passed optimizer.
# NOTE(review): this run is labelled "s21_linear_regression_sgd" but the optimizer
# passed is BGDOptimizer, and its metrics in the results table match the "batch"
# row exactly — confirm whether a stochastic optimizer was intended here.
reg = S21LinearRegression(optimizer=lm_funcs.BGDOptimizer()).fit(X_train, y_train)
models_eval.insert_eval_in_DF("s21_linear_regression_sgd", y_train, reg.predict(X_train), y_test, reg.predict(X_test))

In [17]:
# Custom linear regression solved with the normal-equation optimizer.
reg = S21LinearRegression(
    optimizer=lm_funcs.NormalEquationOptimizer()
).fit(X_train, y_train)
models_eval.insert_eval_in_DF(
    "s21_linear_regression_normal",
    y_train, reg.predict(X_train),
    y_test, reg.predict(X_test),
)

In [18]:
# scikit-learn reference model for comparison with the custom implementations.
reg = LinearRegression().fit(X_train, y_train)
models_eval.insert_eval_in_DF(
    "linear_regression",
    y_train, reg.predict(X_train),
    y_test, reg.predict(X_test),
)

In [19]:
# Print the MAE / RMSE / R2 tables for the linear-regression variants recorded so far.
models_eval.show_results()

MAE Results:
                          model  train   test
0   s21_linear_regression_batch 1167.7 1092.1
1     s21_linear_regression_sgd 1167.7 1092.1
2  s21_linear_regression_normal 1155.1 1085.6
3             linear_regression 1155.1 1085.6

RMSE Results:
                          model   train   test
0   s21_linear_regression_batch 22034.7 9644.6
1     s21_linear_regression_sgd 22034.7 9644.6
2  s21_linear_regression_normal 21995.1 9618.8
3             linear_regression 21995.1 9618.8

R2 Results (in %):
                          model  train  test
0   s21_linear_regression_batch    0.3   1.4
1     s21_linear_regression_sgd    0.3   1.4
2  s21_linear_regression_normal    0.6   1.9
3             linear_regression    0.6   1.9


# Regularized models implementation — Ridge, Lasso, ElasticNet

In [20]:
# Fit and record the custom regularized models. Each entry pairs the result label
# with a factory, so every model is still constructed immediately before it is fit
# and the order (ridge, ridge-normal, lasso, elastic net) is unchanged.
s21_regularized_models = (
    ("s21_ridge", lambda: S21RidgeRegression()),
    ("s21_ridge_normal", lambda: S21RidgeRegression(optimizer=lm_funcs.NormalEquationOptimizer())),
    ("s21_lasso", lambda: S21LassoRegression()),
    ("s21_elastic_net", lambda: S21ElasticNetRegression()),
)
for model_label, make_model in s21_regularized_models:
    reg = make_model().fit(X_train, y_train)
    models_eval.insert_eval_in_DF(model_label, y_train, reg.predict(X_train), y_test, reg.predict(X_test))

In [21]:
# Same three regularized models from scikit-learn with default hyperparameters,
# fitted and recorded in the original order.
for model_label, estimator_cls in (("ridge", Ridge), ("lasso", Lasso), ("elastic_net", ElasticNet)):
    reg = estimator_cls().fit(X_train, y_train)
    models_eval.insert_eval_in_DF(model_label, y_train, reg.predict(X_train), y_test, reg.predict(X_test))

In [22]:
# Print the metric tables again, now including the regularized models.
models_eval.show_results()

MAE Results:
                           model  train   test
0    s21_linear_regression_batch 1167.7 1092.1
1      s21_linear_regression_sgd 1167.7 1092.1
2   s21_linear_regression_normal 1155.1 1085.6
3              linear_regression 1155.1 1085.6
4                      s21_ridge 1222.9 1147.2
5               s21_ridge_normal 1155.1 1085.5
6                      s21_lasso 1167.8 1092.2
7                s21_elastic_net 1192.4 1116.8
8                          ridge 1155.1 1085.5
9                          lasso 1151.4 1081.8
10                   elastic_net 1121.6 1048.5

RMSE Results:
                           model   train   test
0    s21_linear_regression_batch 22034.7 9644.6
1      s21_linear_regression_sgd 22034.7 9644.6
2   s21_linear_regression_normal 21995.1 9618.8
3              linear_regression 21995.1 9618.8
4                      s21_ridge 22043.2 9661.5
5               s21_ridge_normal 21995.1 9618.8
6                      s21_lasso 22034.7 9644.7
7                s21_ela

# Feature normalization

In [23]:
# Custom min-max scaler: fit_transform on train, then transform test with the same scaler.
scaler = S21MinMaxScaler()
X_train_s21minmax, X_test_s21minmax = scaler.fit_transform(X_train), scaler.transform(X_test)

# Custom standard scaler, same train-then-test pattern.
scaler = S21StandardScaler()
X_train_s21standard, X_test_s21standard = scaler.fit_transform(X_train), scaler.transform(X_test)

In [24]:
# scikit-learn min-max scaler: fit_transform on train, then transform test with the same scaler.
scaler = MinMaxScaler()
X_train_minmax, X_test_minmax = scaler.fit_transform(X_train), scaler.transform(X_test)

# scikit-learn standard scaler, same train-then-test pattern.
scaler = StandardScaler()
X_train_standard, X_test_standard = scaler.fit_transform(X_train), scaler.transform(X_test)

In [25]:
# Check that the custom scalers reproduce scikit-learn's output exactly: each
# sklearn-scaled array is compared element-for-element with its S21 counterpart.
print("Whether scikit learn identical to custom classes of normalization?")
scaled_pairs = (
    (X_train_minmax, X_train_s21minmax),
    (X_test_s21minmax, X_test_minmax),
    (X_train_standard, X_train_s21standard),
    (X_test_standard, X_test_s21standard),
)
# all() already yields a bool, so the former `== 1` comparison was redundant.
print("Yes" if all(np.array_equal(left, right) for left, right in scaled_pairs) else "No")

Whether scikit learn identical to custom classes of normalization?
Yes


# Fit models with normalization

In [26]:
# Re-run the model set on the scikit-learn-scaled inputs (not the S21-scaled copies).
# The last argument is the tag that shows up as the suffix of the recorded model
# names in the results table (e.g. "ridge_minmax", "lasso_standard").
models_eval.run_models(X_train_minmax,X_test_minmax, y_train, y_test, "minmax")
models_eval.run_models(X_train_standard,X_test_standard, y_train, y_test, "standard")

In [27]:
# Print the metric tables, now including the models fitted on normalized features.
models_eval.show_results()

MAE Results:
                           model  train   test
0    s21_linear_regression_batch 1167.7 1092.1
1      s21_linear_regression_sgd 1167.7 1092.1
2   s21_linear_regression_normal 1155.1 1085.6
3              linear_regression 1155.1 1085.6
4                      s21_ridge 1222.9 1147.2
5               s21_ridge_normal 1155.1 1085.5
6                      s21_lasso 1167.8 1092.2
7                s21_elastic_net 1192.4 1116.8
8                          ridge 1155.1 1085.5
9                          lasso 1151.4 1081.8
10                   elastic_net 1121.6 1048.5
11      linear_regression_minmax 1155.1 1085.6
12                  ridge_minmax 1155.0 1085.4
13                  lasso_minmax 1150.7 1080.9
14            elastic_net_minmax 1447.1 1368.4
15    linear_regression_standard 1155.1 1085.6
16                ridge_standard 1155.1 1085.6
17                lasso_standard 1153.5 1083.9
18          elastic_net_standard 1074.1 1001.2

RMSE Results:
                           model

# Overfit models

In [28]:
# Deliberately over-parameterize: expand the two room-count columns into all
# polynomial terms up to degree 10. The expansion is fit on train and reused on test.
room_columns = ['bathrooms', 'bedrooms']
poly = PolynomialFeatures(degree=10)
X_train_poly = poly.fit_transform(X_train[room_columns])
X_test_poly = poly.transform(X_test[room_columns])

In [29]:
# Evaluate the model set on the degree-10 polynomial features; results are presumably
# recorded with a "_poly" suffix, as with the other run_models tags — confirm.
# The warnings printed below this cell originate from scikit-learn's coordinate-descent solver.
models_eval.run_models(X_train_poly,X_test_poly, y_train, y_test, "poly")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [30]:
# Search for the best regularized model / alpha on the polynomial features and keep
# the summary that find_best_alpha returns.
# NOTE(review): the test split is passed into the search; if alpha is selected on
# test metrics, the reported test scores are optimistically biased — confirm against
# find_best_alpha's implementation and consider a separate validation split.
best_model_info = models_eval.find_best_alpha(X_train_poly,X_test_poly, y_train, y_test)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [31]:
# Show the selected model summary: model name, alpha, the estimator and its rmse / mae / r2.
print(best_model_info)

{'model_name': 'ElasticNet', 'alpha': 1.0, 'model': ElasticNet(), 'rmse': 12859307559963.525, 'mae': 47062669282.27575, 'r2': -1.7527743710103035e+18}


In [32]:
# Print the metric tables, now including the polynomial-feature runs.
models_eval.show_results()

MAE Results:
                           model  train                test
0    s21_linear_regression_batch 1167.7              1092.1
1      s21_linear_regression_sgd 1167.7              1092.1
2   s21_linear_regression_normal 1155.1              1085.6
3              linear_regression 1155.1              1085.6
4                      s21_ridge 1222.9              1147.2
5               s21_ridge_normal 1155.1              1085.5
6                      s21_lasso 1167.8              1092.2
7                s21_elastic_net 1192.4              1116.8
8                          ridge 1155.1              1085.5
9                          lasso 1151.4              1081.8
10                   elastic_net 1121.6              1048.5
11      linear_regression_minmax 1155.1              1085.6
12                  ridge_minmax 1155.0              1085.4
13                  lasso_minmax 1150.7              1080.9
14            elastic_net_minmax 1447.1              1368.4
15    linear_regression_sta

# Naive models

In [33]:
# Naive constant baselines. The constant (mean / median price) is estimated on the
# TRAIN target only and reused for the test predictions. Previously the test
# predictions were built from the mean/median of y_test itself, which leaks the test
# target into the baseline and understates its test error.
train_prices = y_train.to_numpy()
naive_mean = float(np.mean(train_prices))
naive_median = float(np.median(train_prices))
models_eval.insert_eval_in_DF("naive_mean", y_train, np.full(len(y_train), naive_mean), y_test, np.full(len(y_test), naive_mean))
models_eval.insert_eval_in_DF("naive_median", y_train, np.full(len(y_train), naive_median), y_test, np.full(len(y_test), naive_median))

# Compare results

In [34]:
# Print the full metric tables, including the naive baselines, for comparison.
models_eval.show_results()

MAE Results:
                           model  train                test
0    s21_linear_regression_batch 1167.7              1092.1
1      s21_linear_regression_sgd 1167.7              1092.1
2   s21_linear_regression_normal 1155.1              1085.6
3              linear_regression 1155.1              1085.6
4                      s21_ridge 1222.9              1147.2
5               s21_ridge_normal 1155.1              1085.5
6                      s21_lasso 1167.8              1092.2
7                s21_elastic_net 1192.4              1116.8
8                          ridge 1155.1              1085.5
9                          lasso 1151.4              1081.8
10                   elastic_net 1121.6              1048.5
11      linear_regression_minmax 1155.1              1085.6
12                  ridge_minmax 1155.0              1085.4
13                  lasso_minmax 1150.7              1080.9
14            elastic_net_minmax 1447.1              1368.4
15    linear_regression_sta

In [35]:
# Print the top five models per metric together with the train/test gap ("diff" column).
models_eval.show_best_models()

MAE Results:
                   model  train   test  diff
18  elastic_net_standard 1074.1 1001.2  72.9
10           elastic_net 1121.6 1048.5  73.1
13          lasso_minmax 1150.7 1080.9  69.8
9                  lasso 1151.4 1081.8  69.7
17        lasso_standard 1153.5 1083.9  69.6

RMSE Results:
                   model   train   test    diff
18  elastic_net_standard 22001.2 9600.4 12400.8
10           elastic_net 22015.3 9611.4 12403.9
13          lasso_minmax 21995.1 9616.9 12378.1
12          ridge_minmax 21995.1 9617.9 12377.2
9                  lasso 21995.1 9618.6 12376.5

R2 Results (in %):
                   model  train  test  diff
18  elastic_net_standard    0.6   2.3   1.7
10           elastic_net    0.5   2.1   1.6
13          lasso_minmax    0.6   2.0   1.3
12          ridge_minmax    0.6   1.9   1.3
9                  lasso    0.6   1.9   1.3


Одновременно и лучшая, и наиболее стабильная модель — это ElasticNet на стандартизированных данных

# Additional task

## logs

In [36]:
# Fit the scikit-learn linear models on log(price) and score them on the original
# price scale by exponentiating the predictions before evaluation.
y_train_log = np.log(y_train)
# Suffix appended to each recorded model name. This was previously stored in a
# variable called `type`, which shadowed the builtin type() for the rest of the session.
target_suffix = "log"
for base_name, estimator_cls in (
    ("linear_regression", LinearRegression),
    ("ridge", Ridge),
    ("lasso", Lasso),
    ("elastic_net", ElasticNet),
):
    reg = estimator_cls().fit(X_train, y_train_log)
    models_eval.insert_eval_in_DF(
        base_name + "_" + target_suffix,
        y_train, np.exp(reg.predict(X_train)),
        y_test, np.exp(reg.predict(X_test)),
    )

In [37]:
# Print the metric tables, now including the log-target models.
models_eval.show_results()

MAE Results:
                           model  train                test
0    s21_linear_regression_batch 1167.7              1092.1
1      s21_linear_regression_sgd 1167.7              1092.1
2   s21_linear_regression_normal 1155.1              1085.6
3              linear_regression 1155.1              1085.6
4                      s21_ridge 1222.9              1147.2
5               s21_ridge_normal 1155.1              1085.5
6                      s21_lasso 1167.8              1092.2
7                s21_elastic_net 1192.4              1116.8
8                          ridge 1155.1              1085.5
9                          lasso 1151.4              1081.8
10                   elastic_net 1121.6              1048.5
11      linear_regression_minmax 1155.1              1085.6
12                  ridge_minmax 1155.0              1085.4
13                  lasso_minmax 1150.7              1080.9
14            elastic_net_minmax 1447.1              1368.4
15    linear_regression_sta

Log-преобразование улучшает фит модели на тренировочных данных, но дает сильную ошибку на тесте (что, вероятно, связано с неучтёнными выбросами), что делает модель нестабильной

## outliers

In [38]:
# Build a train set without price outliers (IQR rule, per the helper's name),
# then take the same predictors and target from it. The test set is left untouched.
df_train_clean = FeatureCreating().remove_outliers_iqr(df_train, "price")
X_train_clean, y_train_clean = df_train_clean[feature_list], df_train_clean[['price']]

In [39]:
# Naive constant baselines on the outlier-free train set. The constant is estimated
# on the cleaned TRAIN target only and reused for the test predictions. Previously
# the test predictions used the mean/median of y_test itself, which leaks the test
# target into the baseline.
clean_train_prices = y_train_clean.to_numpy()
naive_mean_clean = float(np.mean(clean_train_prices))
naive_median_clean = float(np.median(clean_train_prices))
models_eval.insert_eval_in_DF("naive_mean_clean", y_train_clean, np.full(len(y_train_clean), naive_mean_clean), y_test, np.full(len(y_test), naive_mean_clean))
models_eval.insert_eval_in_DF("naive_median_clean", y_train_clean, np.full(len(y_train_clean), naive_median_clean), y_test, np.full(len(y_test), naive_median_clean))

In [40]:
# Fit the model set on the outlier-free train data and evaluate on the unmodified
# test set; results appear with the "_clean" suffix (e.g. "ridge_clean").
models_eval.run_models(X_train_clean, X_test, y_train_clean, y_test, "clean")

In [41]:
# Print the metric tables, now including the models trained on cleaned data.
models_eval.show_results()

MAE Results:
                           model  train                test
0    s21_linear_regression_batch 1167.7              1092.1
1      s21_linear_regression_sgd 1167.7              1092.1
2   s21_linear_regression_normal 1155.1              1085.6
3              linear_regression 1155.1              1085.6
4                      s21_ridge 1222.9              1147.2
5               s21_ridge_normal 1155.1              1085.5
6                      s21_lasso 1167.8              1092.2
7                s21_elastic_net 1192.4              1116.8
8                          ridge 1155.1              1085.5
9                          lasso 1151.4              1081.8
10                   elastic_net 1121.6              1048.5
11      linear_regression_minmax 1155.1              1085.6
12                  ridge_minmax 1155.0              1085.4
13                  lasso_minmax 1150.7              1080.9
14            elastic_net_minmax 1447.1              1368.4
15    linear_regression_sta

In [42]:
# Print the final top-five ranking per metric with the train/test gap ("diff" column).
models_eval.show_best_models()

MAE Results:
                      model  train   test  diff
32  linear_regression_clean  595.2  932.2 337.0
33              ridge_clean  595.2  932.2 337.0
34              lasso_clean  595.4  933.3 337.8
18     elastic_net_standard 1074.1 1001.2  72.9
10              elastic_net 1121.6 1048.5  73.1

RMSE Results:
                      model   train   test    diff
18     elastic_net_standard 22001.2 9600.4 12400.8
10              elastic_net 22015.3 9611.4 12403.9
13             lasso_minmax 21995.1 9616.9 12378.1
32  linear_regression_clean   782.9 9617.3  8834.4
33              ridge_clean   782.9 9617.3  8834.4

R2 Results (in %):
                      model  train  test  diff
18     elastic_net_standard    0.6   2.3   1.7
10              elastic_net    0.5   2.1   1.6
13             lasso_minmax    0.6   2.0   1.3
32  linear_regression_clean   52.1   2.0  50.1
33              ridge_clean   52.1   2.0  50.1


Очистка данных от выбросов — даже только на train — значительно улучшает качество фита, но обобщающая способность остается на том же уровне. Для достижения лучшего эффекта стоит применить трансформацию и на тестовые данные  
В результате лучший результат на train (в терминах MAE и $R^2$) и большую стабильность в терминах MAE и RMSE показывают модели с удалением выбросов на train