# Regresja

In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [3]:
cleaned_data = pd.read_excel('clean_data_relevant.xlsx')

In [4]:
from sklearn.model_selection import train_test_split

X = cleaned_data.drop(columns='PRICE')
y = cleaned_data['PRICE']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
trained_models = []

## Modele

In [6]:
linear = LinearRegression()

param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [3, 4, 10,15,20,50,80],
    'positive': [True, False]
}

 
grid_search = GridSearchCV(estimator=linear, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error',  verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=True; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=True; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=True; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=True; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=True; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=False; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=False; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=False; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=False; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=3, positive=False; total time=   0.0s
[CV] END copy_X=True, fit_intercept=True, n_jobs=4, positive=True; total 

In [7]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters found:  {'n_estimators': 25}
R-squared:  0.6105831365689202


In [8]:

xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [ 3, 5, 10]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_xgb_model_2 = grid_search.best_estimator_
y_pred = best_xgb_model_2.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters found:  {'max_depth': 3, 'n_estimators': 50}
R-squared:  0.6573922792540261


In [9]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [ 3, 5, 10],
    'learning_rate': [0.05, 0.1, 0.15]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_3 = grid_search.best_estimator_
y_pred = best_xgb_model_3.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found:  {'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 50}
R-squared:  0.6901093033929724


In [10]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [ 3, 5, 10],
    'learning_rate': [0.1, 0.15],
    'max_leaves':[25, 50, 200,0]
}

 
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_4 = grid_search.best_estimator_
y_pred = best_xgb_model_4.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found:  {'learning_rate': 0.15, 'max_depth': 5, 'max_leaves': 50, 'n_estimators': 50}
R-squared:  0.6901093033929724


In [11]:
xgb_model = XGBRegressor()

param_grid = {
     'n_estimators': [25, 50, 75],
    'max_depth': [ 3, 5, 10],
    'learning_rate': [0.1, 0.15],
    'max_leaves':[25, 50, 200,0],
    'tree_method': ['exact', 'approx', 'hist']
}

 
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_5 = grid_search.best_estimator_
y_pred = best_xgb_model_5.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found:  {'learning_rate': 0.15, 'max_depth': 3, 'max_leaves': 25, 'n_estimators': 75, 'tree_method': 'exact'}
R-squared:  0.6438331498084009


In [12]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [ 5,8, 10],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_leaves':[25, 50,200,0],
    'tree_method': ['exact', 'approx', 'hist'], 
    'gamma': [0, 0.1, 0.2]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_xgb_model_6 = grid_search.best_estimator_
y_pred = best_xgb_model_6.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
[CV] END ....................................n_estimators=25; total time=   0.2s
[CV] END .......................max_depth=3, n_estimators=25; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=75; total time=   0.1s
[CV] END ......................max_depth=10, n_estimators=25; total time=   0.9s
[CV] END ...learning_rate=0.05, max_depth=3, n_estimators=50; total time=   0.0s
[CV] END ...learning_rate=0.05, max_depth=5, n_estimators=25; total time=   0.1s
[CV] END ...learning_rate=0.05, max_depth=5, n_estimators=50; total time=   0.2s
[CV] END ..learning_rate=0.05, max_depth=10, n_estimators=50; total time=   2.0s
[CV] END ..learning_rate=0.05, max_depth=10, n_estimators=50; total time=   1.4s
[CV] END ..learning_rate=0.15, max_depth=10, n_estimators=75; total time=   1.9s
[CV] END learning_rate=0.1, max_depth=3, max_leaves=25, n_estimators=75; total time=   0.1s
[CV] END learning_rate=0.1, max_de

In [13]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [ 5,8, 10],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_leaves':[25, 50,200,0],
    'tree_method': ['exact', 'approx', 'hist'], 
    'gamma': [0.1, 0.2]
}

grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=200)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_7 = grid_search.best_estimator_
y_pred = best_xgb_model_7.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END gamma=0.1, learning_rate=0.1, max_depth=8, max_leaves=0, n_estimators=75, tree_method=hist; total time=   1.3s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=25, n_estimators=25, tree_method=hist; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=25, n_estimators=50, tree_method=exact; total time=   1.2s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=50, n_estimators=25, tree_method=approx; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=50, n_estimators=25, tree_method=hist; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=50, n_estimators=50, tree_method=approx; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=50, n_estimators=75, tree_method=exact; total time=   1.7s
[CV] END gamma=0.1, learning_rate=0.1, max_depth=10, max_leaves=200, n_estimat

In [15]:
xgb_model = XGBRegressor()

param_grid = {
    'n_estimators': [1000, 750, 3000],
    'max_depth': [ 5],
    'learning_rate': [0.05, 0.1],
    'max_leaves':[0],
    'tree_method': [ 'approx']
}

grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=200)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_7 = grid_search.best_estimator_
y_pred = best_xgb_model_7.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=25, n_estimators=25, tree_method=approx; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=25, n_estimators=25, tree_method=approx; total time=   0.1s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=25, n_estimators=50, tree_method=exact; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=25, n_estimators=50, tree_method=approx; total time=   0.3s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=25, n_estimators=75, tree_method=exact; total time=   0.7s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=50, n_estimators=25, tree_method=exact; total time=   0.2s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=50, n_estimators=50, tree_method=exact; total time=   0.4s
[CV] END gamma=0.1, learning_rate=0.15, max_depth=5, max_leaves=50, n_estima

In [None]:
xgb_model = XGBRegressor()

param_grid = {
    'max_depth':[3,4, 6],
    'learning_rate':[0.1, 0.15, 0.2],
    'n_estimators':[1000, 500, 2500],
    'colsample_bytree':[0.7, 0.4],
    'subsample':[0.7, 0.3],
    'reg_alpha':[0.5, 3],
    'reg_lambda':[1.0,0.2],
    'num_parallel_tree':[1, 7] 
}

grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=200)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_xgb_model_7 = grid_search.best_estimator_
y_pred = best_xgb_model_7.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)


In [17]:
rf_model = RandomForestRegressor()

param_grid = {
    'n_estimators': [20,50, 70],
    'max_depth': [None, 10, 20, 30],
    'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

[CV] END gamma=0.2, learning_rate=0.15, max_depth=8, max_leaves=200, n_estimators=50, tree_method=hist; total time=   1.0s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=8, max_leaves=200, n_estimators=75, tree_method=approx; total time=   1.3s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=8, max_leaves=0, n_estimators=50, tree_method=exact; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=8, max_leaves=0, n_estimators=75, tree_method=exact; total time=   1.1s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=10, max_leaves=25, n_estimators=25, tree_method=exact; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=10, max_leaves=25, n_estimators=50, tree_method=hist; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=10, max_leaves=25, n_estimators=75, tree_method=hist; total time=   0.4s
[CV] END gamma=0.2, learning_rate=0.15, max_depth=10, max_leaves=25, n_estimators=75, tree_method=hist; total time=   0.3s
[CV] END gamma=

In [18]:

rf_model = RandomForestRegressor()

param_grid = {
    'n_estimators': [20,50, 70],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap':[True , False]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_rf_model_2 = grid_search.best_estimator_
y_pred = best_rf_model_2.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

Best parameters found:  {'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 70}
R-squared:  0.6984534884953917


In [19]:
rf_model = RandomForestRegressor()
 
param_grid = {
    'n_estimators': [20, 50],
    'max_depth': [None,5, 10],
    'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap':[True , False]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
 
best_rf_model_3 = grid_search.best_estimator_
y_pred = best_rf_model_3.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: ", r2)

KeyboardInterrupt: 

Ważne cechy

In [None]:
import lime
import shap

In [None]:
feat_importances = pd.DataFrame(best_xgb_model_6.feature_importances_, index=X_train.columns, columns=["Importance"])
feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
feat_importances.plot(kind='bar', figsize=(8,6))

shap.initjs()
# Create the explainer
explainer = shap.Explainer(best_xgb_model_6,  feature_names = X_train.columns)
# Evaluate SHAP values
shap_values = explainer.shap_values(X_test)
shap_values = explainer.shap_values(X_test[:30])
# check values for 30 - instances
shap.summary_plot(shap_values, X_test[:30])

In [None]:
explainer2 = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns.values.tolist(),
                                                  class_names=['median_house_value'], verbose=True, mode='regression')

# Choose the 5th instance and use it to predict the results
instance = X_train.iloc[4].values.reshape(1, -1)

# Generate explanation
explanation2 = explainer2.explain_instance(X_test.values[30], best_xgb_model_6.predict, num_features=8)

# Show the explanation
explanation2.as_pyplot_figure()
# plot graph

In [None]:
shap_values = explainer.shap_values(X_test)
print("Variable Importance Plot - Global Interpretation")
#graph summary
shap.summary_plot(shap_values, X_test, plot_type="bar")