# libs import

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import root_mean_squared_error
from sklearn.inspection import permutation_importance
import shap
from scipy.stats import loguniform, uniform
import optuna
import time

#custom lib
import lm_funcs
from importlib import reload
reload(lm_funcs)
from lm_funcs import FeatureCreating
from lm_funcs import ModelsEval
from lm_funcs import Splits
from lm_funcs import CrossValidation
from lm_funcs import Comparasion
from lm_funcs import FeatureSelection
from lm_funcs import GridSearchCV, RandomSearchCV


  from .autonotebook import tqdm as notebook_tqdm


# Data loading

In [2]:
df_train = pd.read_json(f"data/train.json")

# Feature creation

In [3]:
df_train['interest_level'] = df_train['interest_level'].map({'low': 0, 'medium': 1, 'high': 2})

In [4]:
df_train['features'] = FeatureCreating.column_clearing(df_train['features'])

In [5]:
all_features = []
for index, row in df_train.iterrows():
    features = row['features']
    if len(features) > 0:
        all_features.extend(features.split(","))

In [6]:
print(f"Number of unique values:{len(set(all_features))}")

Number of unique values:1546


In [7]:
print("The 20 most popular feature with counts:")
Counter(all_features).most_common(20)

The 20 most popular feature with counts:


[('Elevator', 25915),
 ('CatsAllowed', 23540),
 ('HardwoodFloors', 23527),
 ('DogsAllowed', 22035),
 ('Doorman', 20898),
 ('Dishwasher', 20426),
 ('NoFee', 18062),
 ('LaundryinBuilding', 16344),
 ('FitnessCenter', 13252),
 ('Pre-War', 9148),
 ('LaundryinUnit', 8738),
 ('RoofDeck', 6542),
 ('OutdoorSpace', 5268),
 ('DiningRoom', 5136),
 ('HighSpeedInternet', 4299),
 ('Balcony', 2992),
 ('SwimmingPool', 2730),
 ('LaundryInBuilding', 2593),
 ('NewConstruction', 2559),
 ('Terrace', 2283)]

In [8]:
top_20_features = [feature for feature, count in Counter(all_features).most_common(20)]

In [9]:
FeatureCreating.columns_creating(top_20_features, df_train)

##### features

In [10]:
feature_list = top_20_features + ['bathrooms', 'bedrooms', 'interest_level']
random_state = 21

# Splits

split on test and train

In [11]:
train, test = Splits().random_split_two(df_train)

In [12]:
print(train.shape)
print(test.shape)

(39482, 35)
(9870, 35)


split on train, validation, test

In [13]:
train, validation, test = Splits().random_split_three(df_train)

In [14]:
print(train.shape)
print(validation.shape)
print(test.shape)

(29612, 35)
(9870, 35)
(9870, 35)


split on date on train and test

In [15]:
train, test = Splits().date_split_two(df_train, date_col= 'created', 
                                      date_split=df_train.loc[1002,'created'])

In [16]:
print(train.shape)
print(test.shape)

(46363, 35)
(2989, 35)


split on date on train, validation and test

In [17]:
train, validation, test = Splits().date_split_three(df_train, date_col= 'created', validation_date = df_train.loc[506,'created'], test_date = df_train.loc[1002,'created'])

In [18]:
print(train.shape)
print(validation.shape)
print(test.shape)

(39553, 35)
(6810, 35)
(2989, 35)


# Cross-validation

### K-Fold

In [19]:
list_indices = CrossValidation().k_fold(df_train)
sk_list_indices = list(KFold().split(df_train))

In [20]:
Comparasion.loop_for_compare_index(list_indices, sk_list_indices)

Unnamed: 0,jaccard,overlap,fold
0,1.0,1.0,1
1,1.0,1.0,2
2,1.0,1.0,3
3,1.0,1.0,4
4,1.0,1.0,5


In [21]:
Comparasion.loop_for_compare_distributions(list_indices, sk_list_indices, df_train)

Unnamed: 0,same_distribution_share,all_same,fold
0,1.0,True,1
1,1.0,True,2
2,1.0,True,3
3,1.0,True,4
4,1.0,True,5


### Grouped K-Fold

In [22]:
list_indices = CrossValidation().grouped_k_fold(df_train, group_field="bedrooms")
sk_list_indices = list(GroupKFold().split(df_train, groups=df_train.bedrooms.values))

In [23]:
Comparasion.loop_for_compare_index(list_indices, sk_list_indices)

Unnamed: 0,jaccard,overlap,fold
0,1.0,1.0,1
1,1.0,1.0,2
2,1.0,1.0,3
3,1.0,1.0,4
4,1.0,1.0,5


In [24]:
Comparasion.loop_for_compare_distributions(list_indices, sk_list_indices, df_train)

Unnamed: 0,same_distribution_share,all_same,fold
0,1.0,True,1
1,1.0,True,2
2,1.0,True,3
3,1.0,True,4
4,1.0,True,5


### Stratified K-Fold

In [25]:
list_indices = CrossValidation().stratified_k_fold(df_train, stratify_field="bedrooms")
sk_list_indices = list(StratifiedKFold().split(df_train, df_train['bedrooms']))



In [26]:
Comparasion.loop_for_compare_index(list_indices, sk_list_indices)

Unnamed: 0,jaccard,overlap,fold
0,0.999899,1.0,1
1,0.999797,0.999924,2
2,0.999823,0.999899,3
3,0.999848,0.999899,4
4,0.999924,0.999924,5


In [27]:
Comparasion.loop_for_compare_distributions(list_indices, sk_list_indices, df_train)

Unnamed: 0,same_distribution_share,all_same,fold
0,1.0,True,1
1,1.0,True,2
2,1.0,True,3
3,1.0,True,4
4,1.0,True,5


### Time series split

In [28]:
list_indices = CrossValidation().time_series_split(df_train, date_field="created")
sk_list_indices = list(TimeSeriesSplit().split(df_train.sort_values(by = 'created')))

In [29]:
Comparasion.loop_for_compare_index(list_indices, sk_list_indices)

Unnamed: 0,jaccard,overlap,fold
0,0.999757,1.0,1
1,0.999878,1.0,2
2,0.999919,1.0,3
3,0.999939,1.0,4
4,0.999951,1.0,5


In [30]:
Comparasion.loop_for_compare_distributions(list_indices, sk_list_indices, df_train)

Unnamed: 0,same_distribution_share,all_same,fold
0,1.0,True,1
1,1.0,True,2
2,1.0,True,3
3,1.0,True,4
4,1.0,True,5


### In this task target variable is continuous, time doesn't matter and there`no groups - so simple K-Fold CV is suitable

# Feature selection

### Split to train, validation and test

In [31]:
df_train[['bathrooms', 'bedrooms']] = df_train[['bathrooms', 'bedrooms']].astype(float)

In [32]:
train, validation, test = Splits().random_split_three(df_train, test_size=0.2, validation_size=0.2, random_state=21)

In [33]:
X_train, y_train = train[feature_list], train["price"]
X_validation, y_validation = validation[feature_list], validation["price"]
X_test, y_test = test[feature_list], test["price"]

In [34]:
scaler = StandardScaler()
X_train.loc[:,['bathrooms', 'bedrooms']] = scaler.fit_transform(X_train[['bathrooms', 'bedrooms']])
X_validation.loc[:,['bathrooms', 'bedrooms']]  = scaler.transform(X_validation[['bathrooms', 'bedrooms']])
X_test.loc[:,['bathrooms', 'bedrooms']] = scaler.transform(X_test[['bathrooms', 'bedrooms']])

### models fiting

In [35]:
models_eval_fs = ModelsEval()

In [36]:
models_eval_fs.fit_model_with_evaluation("on_all_feature", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, 0.0, feature_list)

In [37]:
start_time = time.time()
reg_lasso = Lasso().fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time

lasso_coef = pd.DataFrame({
    'feature':X_train.columns,
    'coef':reg_lasso.coef_})
lasso_coef = lasso_coef.sort_values(by = 'coef', key = lambda x: x.abs(), ascending = False)

In [38]:
lasso_coef.head(10)

Unnamed: 0,feature,coef
4,Doorman,1340.85
20,bathrooms,1054.12
21,bedrooms,683.55
0,Elevator,659.98
17,LaundryInBuilding,-612.75
7,LaundryinBuilding,-582.28
10,LaundryinUnit,527.74
22,interest_level,-506.85
8,FitnessCenter,-435.42
19,Terrace,416.04


In [39]:
models_eval_fs.fit_model_with_evaluation("10_fs_from_lasso", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, lasso_coef.head(10).feature)

In [40]:
start_time = time.time()
selected_10_nan_corr = FeatureSelection.select_features_by_nan_corr(X_train, y_train, n_features=10)
end_time = time.time()
training_time = end_time - start_time

Не удалось набрать 10 признаков при текущих условиях: 
-max_nan_ratio = 0.10; 
-min_corr = 0.10. 
Ослабляем фильтры и повторяем...
Не удалось набрать 10 признаков при текущих условиях: 
-max_nan_ratio = 0.15; 
-min_corr = 0.05. 
Ослабляем фильтры и повторяем...


In [41]:
selected_10_nan_corr

['bedrooms',
 'bathrooms',
 'interest_level',
 'Doorman',
 'LaundryinUnit',
 'DiningRoom',
 'Elevator',
 'Terrace',
 'DogsAllowed',
 'FitnessCenter']

In [42]:
models_eval_fs.fit_model_with_evaluation("10_fs_nan_corr", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, selected_10_nan_corr)

In [43]:
start_time = time.time()
selected_10_permutation = FeatureSelection.permutation_importance(Lasso().fit(X_train, y_train), 
                                                                  X_validation, y_validation, n_features=10,
                                                                  random_state=random_state)
end_time = time.time()
training_time = end_time - start_time

In [44]:
selected_10_permutation

Unnamed: 0,Feature,Importance
0,bathrooms,322.59
1,bedrooms,240.19
2,Doorman,196.2
3,interest_level,48.2
4,LaundryinBuilding,44.55
5,Elevator,39.8
6,LaundryinUnit,32.72
7,FitnessCenter,29.0
8,LaundryInBuilding,11.18
9,DogsAllowed,6.56


In [45]:
pd.DataFrame(permutation_importance(Lasso().fit(X_train, y_train), X_validation, 
                       y_validation, n_repeats=10, scoring='neg_mean_absolute_error')['importances_mean'], index=X_validation.columns, columns = ['score']).sort_values(by='score', ascending=False).head(10)

Unnamed: 0,score
bathrooms,326.59
bedrooms,237.82
Doorman,202.35
interest_level,46.68
LaundryinBuilding,41.32
Elevator,39.31
LaundryinUnit,31.98
FitnessCenter,27.39
LaundryInBuilding,10.67
DogsAllowed,5.8


In [46]:
models_eval_fs.fit_model_with_evaluation("10_fs_permutation", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, selected_10_permutation.Feature)

In [47]:
start_time = time.time()
explainer = shap.LinearExplainer(reg_lasso, X_validation)
shap_values = explainer.shap_values(X_validation)
importance_df = pd.DataFrame({
    'Feature': X_validation.columns,
    'Importance': np.abs(shap_values).mean(axis=0)
})
selected_10_shap = importance_df.sort_values(by='Importance', ascending=False).head(10)
end_time = time.time()
training_time = end_time - start_time


In [48]:
selected_10_shap

Unnamed: 0,Feature,Importance
20,bathrooms,777.6
4,Doorman,653.5
21,bedrooms,576.64
0,Elevator,329.21
22,interest_level,257.8
7,LaundryinBuilding,256.5
3,DogsAllowed,197.3
10,LaundryinUnit,158.98
8,FitnessCenter,158.58
6,NoFee,139.32


In [49]:
models_eval_fs.fit_model_with_evaluation("10_fs_shap", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, selected_10_shap.Feature)

In [50]:
models_eval_fs.show_results()

MAE Results:
               model   train     val    test
0     on_all_feature 1120.47 1212.92 1094.56
1   10_fs_from_lasso 1095.73 1186.13 1063.27
2     10_fs_nan_corr 1088.24 1174.59 1062.05
3  10_fs_permutation 1104.27 1193.03 1075.75
4         10_fs_shap 1109.78 1198.87 1081.11

RMSE Results:
               model    train      val     test
0     on_all_feature 26124.74 15898.46 10869.79
1   10_fs_from_lasso 26126.62 15902.32 10870.91
2     10_fs_nan_corr 26127.95 15905.17 10871.42
3  10_fs_permutation 26126.01 15900.25 10870.76
4         10_fs_shap 26125.89 15900.12 10870.04

R2 Results (in %):
               model  train  val  test
0     on_all_feature   0.48 1.26  2.50
1   10_fs_from_lasso   0.47 1.21  2.48
2     10_fs_nan_corr   0.46 1.17  2.47
3  10_fs_permutation   0.48 1.23  2.48
4         10_fs_shap   0.48 1.23  2.49

Find features timing (in sec):
               model  find_fs_time
0     on_all_feature          0.00
1   10_fs_from_lasso          0.07
2     10_fs_nan_corr   

In [51]:
models_eval_fs.rank_models()

Unnamed: 0,model,total_score,train_quality_score_norm,stability_score_rmse_norm,time_score_norm
0,on_all_feature,1.0,1.0,1.0,1.0
1,10_fs_shap,0.65,0.64,0.43,0.99
2,10_fs_permutation,0.59,0.6,0.8,0.26
3,10_fs_from_lasso,0.54,0.41,0.52,0.91
4,10_fs_nan_corr,0.0,0.0,0.0,0.0


In [52]:
choosing_10 = selected_10_permutation.Feature

# Hyperparameter optimization

In [53]:
models_eval_ho = ModelsEval()

In [54]:
models_eval_ho.fit_model_with_evaluation("base_ElasticNet", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, 0.0, choosing_10, model_class = ElasticNet)

In [55]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
}

In [56]:
start_time = time.time()
grid = GridSearchCV(ElasticNet(), param_grid, cv=3, minimize=True)
grid.fit(X_train[choosing_10].values, y_train.values)
print("Best params (Grid):", grid.get_best_params())
print("Best score (Grid):", grid.get_best_score())
end_time = time.time()
training_time = end_time - start_time

models_eval_ho.fit_model_with_evaluation("best_GridSCV_49", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, choosing_10, model_class = ElasticNet,
                                         model_params= grid.get_best_params())


Best params (Grid): {'alpha': 0.5, 'l1_ratio': 0.75}
Best score (Grid): 16254.246118521276


In [57]:
param_distributions = {
    'alpha': loguniform(1e-4, 1e2),
    'l1_ratio': uniform(0, 1)
}

In [58]:
start_time = time.time()
rand_search = RandomSearchCV(ElasticNet(), param_distributions, n_iter=49,
                               cv=3, minimize=True)
rand_search.fit(X_train[choosing_10].values, y_train.values)
print("Best params (Random):", rand_search.get_best_params())
print("Best score (Random):", rand_search.get_best_score())
end_time = time.time()
training_time = end_time - start_time

models_eval_ho.fit_model_with_evaluation("best_RandomSCV_49", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, choosing_10, model_class = ElasticNet,
                                         model_params= rand_search.get_best_params())

Best params (Random): {'alpha': np.float64(5.16317), 'l1_ratio': np.float64(0.9666)}
Best score (Random): 16252.482425935224


In [59]:
def objective(trial):
    params = {
    'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log=True),
    'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0)
}

    model = ElasticNet(**params, random_state=random_state)

    model.fit(X_train[choosing_10], y_train)
    y_pred = model.predict(X_validation[choosing_10])

    return root_mean_squared_error(y_validation, y_pred)

In [60]:
optuna.logging.set_verbosity(optuna.logging.WARNING) 

In [61]:
start_time = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=49)
print("Best params:", study.best_params)
print("Best score:", study.best_value)
end_time = time.time()
training_time = end_time - start_time

models_eval_ho.fit_model_with_evaluation("best_optuna_49", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, choosing_10, model_class = ElasticNet,
                                         model_params= study.best_params)

Best params: {'alpha': 0.000290300841751118, 'l1_ratio': 0.9997603177859448}
Best score: 15900.144185227575


In [62]:
def objective_cv(trial):
    params = {
    'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log=True), 
    'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0)
}

    model = ElasticNet(**params, random_state=random_state)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X_train[choosing_10]):
        X_tr, X_val = X_train.values[train_idx], X_train.values[val_idx]
        y_tr, y_val = y_train.values[train_idx], y_train.values[val_idx]
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        score = root_mean_squared_error(y_val, y_pred)
        scores.append(score)

    return np.mean(scores)

In [63]:
start_time = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective_cv, n_trials=49)
print("Best params:", study.best_params)
print("Best score:", study.best_value)
end_time = time.time()
training_time = end_time - start_time

models_eval_ho.fit_model_with_evaluation("best_optunaCV_49", X_train, y_train, X_validation, 
                                         y_validation, X_test, y_test, training_time, choosing_10, model_class = ElasticNet,
                                         model_params= study.best_params)

Best params: {'alpha': 0.2976084511893204, 'l1_ratio': 0.5340617563700224}
Best score: 13092.561397246322


In [64]:
models_eval_ho.show_results()

MAE Results:
               model   train     val    test
0    base_ElasticNet 1030.00 1114.05  988.06
1    best_GridSCV_49 1022.14 1109.79  988.23
2  best_RandomSCV_49 1014.75 1101.81  979.16
3     best_optuna_49 1106.75 1195.56 1078.22
4   best_optunaCV_49 1019.78 1107.29  985.48

RMSE Results:
               model    train      val     test
0    base_ElasticNet 26134.81 15918.57 10883.07
1    best_GridSCV_49 26128.43 15907.68 10871.47
2  best_RandomSCV_49 26129.58 15910.07 10873.06
3     best_optuna_49 26126.01 15900.14 10870.92
4   best_optunaCV_49 26128.69 15908.23 10871.84

R2 Results (in %):
               model  train  val  test
0    base_ElasticNet   0.41 1.01  2.26
1    best_GridSCV_49   0.46 1.14  2.47
2  best_RandomSCV_49   0.45 1.11  2.44
3     best_optuna_49   0.48 1.23  2.48
4   best_optunaCV_49   0.45 1.13  2.46

Find features timing (in sec):
               model  find_fs_time
0    base_ElasticNet          0.00
1    best_GridSCV_49          0.79
2  best_RandomSCV_49   

In [65]:
models_eval_ho.rank_models()

Unnamed: 0,model,total_score,train_quality_score_norm,stability_score_rmse_norm,time_score_norm
0,best_optuna_49,0.77,1.0,0.36,0.83
1,best_GridSCV_49,0.54,0.73,0.0,0.88
2,base_ElasticNet,0.5,0.0,1.0,1.0
3,best_RandomSCV_49,0.49,0.59,0.08,0.82
4,best_optunaCV_49,0.35,0.7,0.02,0.0
