In [None]:
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Notebook-wide pandas display settings: no limit on the number of rendered
# columns or rows, and floats formatted with two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Read the tab-separated dataset and remove the columns listed below in one
# chained expression, so `df` is bound exactly once in this cell.
df = (
    pd.read_csv("merge_dataset.csv", sep="\t")
    .drop(
        columns=[
            "Miasto",
            "URL",
            "Pietro",
            "formatted_address",
            "result_confidence",
            "suburb",
            "building_category",
            "result_type",
            "Dataset",
        ]
    )
)

In [None]:
# Baseline frame: remove "Rynek" and "Stan wykonczenia" and replace every
# missing value with 0 (chained, so `df` itself is left untouched).
df1 = df.drop(columns=["Rynek", "Stan wykonczenia"]).fillna(0)

# Select the target by name and build the features from every other column,
# so the target can never end up inside the feature matrix regardless of the
# column order in the file.
# NOTE(review): later cells read the target with `.iloc[:, 0]`, i.e. they
# assume "Cena" is the first column; under that assumption this split is
# identical to the previous positional `df1.iloc[:, 1:]`.
x_train, x_test, y_train, y_test = train_test_split(
    df1.drop(columns=["Cena"]),
    df1["Cena"],
    test_size=0.2,
    random_state=42,
)

### Baseline models

In [None]:
def _print_split_metrics(model, title, features, target):
    """Print R2, MSE, RMSE, MAE and MAPE of ``model`` on a single data split."""
    print(title)
    y_predicted = model.predict(features)
    mse = mean_squared_error(target, y_predicted)
    print("R2: ", r2_score(target, y_predicted))
    print("Mean squared error: ",  mse)
    # RMSE is derived from the MSE instead of passing `squared=False`: that
    # keyword was deprecated and later removed from scikit-learn, while the
    # square root works on every version (same value for a single target).
    print("Root mean squared error: ",  float(np.sqrt(mse)))
    print("Mean absolute error: ",  mean_absolute_error(target, y_predicted))
    print("Mean absolute percentage error: ",  mean_absolute_percentage_error(target, y_predicted))


def print_metrics(model, x_train, x_test, y_train, y_test):
    """Print the parameters of a fitted model and its train/test regression metrics.

    Parameters
    ----------
    model : object exposing ``get_params()`` and ``predict(X)``
        A fitted estimator or a fitted search wrapper.
    x_train, x_test : feature data accepted by ``model.predict``
    y_train, y_test : true target values matching ``x_train`` / ``x_test``

    Returns
    -------
    None
        Everything is written to standard output: the result of
        ``model.get_params()``, then a "Train dataset" section and a
        "Test dataset" section, each listing R2, MSE, RMSE, MAE and MAPE.
    """
    print( model.get_params())
    _print_split_metrics(model, "Train dataset", x_train, y_train)
    _print_split_metrics(model, "Test dataset", x_test, y_test)

In [None]:
# Baseline: random forest with default hyper-parameters, fitted on the
# zero-filled baseline split and scored on both splits.
model = RandomForestRegressor()
model.fit(x_train, y_train)
print_metrics(model, x_train, x_test, y_train, y_test)

In [None]:
# Baseline: gradient boosting with default hyper-parameters, fitted on the
# zero-filled baseline split and scored on both splits.
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
print_metrics(model, x_train, x_test, y_train, y_test)

In [None]:
# Baseline: ordinary least squares. Prints R2 and MSE for the train split,
# then R2 and MSE for the test split; each split is predicted once.
model = LinearRegression()
model.fit(x_train, y_train)
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    split_pred = model.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

In [None]:
# Baseline: multi-layer perceptron with default hyper-parameters, fitted on
# the zero-filled baseline split and scored on both splits.
model = MLPRegressor()
model.fit(x_train, y_train)
print_metrics(model, x_train, x_test, y_train, y_test)

In [None]:
# Baseline: support vector regression with default hyper-parameters. Prints
# R2 and MSE for the train split, then for the test split.
model = SVR()
model.fit(x_train, y_train)
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    split_pred = model.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

### Outlier detection and data scaling

In [None]:
# Start this section from a fresh copy of the file: read the tab-separated
# dataset again and remove the same columns as the first load cell.
df = (
    pd.read_csv("merge_dataset.csv", sep="\t")
    .drop(
        columns=[
            "Miasto",
            "URL",
            "Pietro",
            "formatted_address",
            "result_confidence",
            "suburb",
            "building_category",
            "result_type",
            "Dataset",
        ]
    )
)

In [None]:
# Bin edges of an equal-frequency split of "Cena" into at most 30 bins
# (duplicate edges are dropped); the edges are shown as the cell output.
t_suma = pd.qcut(x=df["Cena"], q=30, retbins=True, duplicates="drop")[1]
t_suma

In [None]:
# Histogram of "Cena" before any outlier filtering in this section.
ax = sns.histplot(df['Cena'], color='r', bins=100)
ax.set_title('Sale Price Distribution', fontsize=16)
ax.set_xlabel('Sale Price', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Keep only rows whose price lies strictly between 250000 and 4000000.
price_in_range = df["Cena"].gt(250000) & df["Cena"].lt(4000000)
df = df[price_in_range]

In [None]:
# Histogram of "Cena" after the price filter applied in the previous cell.
ax = sns.histplot(df['Cena'], color='r', bins=100)
ax.set_title('Sale Price Distribution', fontsize=16)
ax.set_xlabel('Sale Price', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Scatter of price (x) against "Powierzchnia" (y) before the area filter.
ax = sns.scatterplot(data=df, x="Cena", y="Powierzchnia", color='orange', edgecolor='b', s=15)
ax.set_title('{} / Sale Price'.format("Powierzchnia"), fontsize=16)
ax.set_ylabel('Powierzchnia', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Keep only rows whose "Powierzchnia" is strictly below 300.
df = df.loc[df["Powierzchnia"].lt(300)]

In [None]:
# Scatter of price (x) against "Powierzchnia" (y) after the area filter.
ax = sns.scatterplot(data=df, x="Cena", y="Powierzchnia", color='orange', edgecolor='b', s=15)
ax.set_title('{} / Sale Price'.format("Powierzchnia"), fontsize=16)
ax.set_ylabel('Powierzchnia', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Annotated correlation heatmap of the feature columns.
# NOTE(review): `x_train` is still the baseline split created before the
# outlier filters of this section — confirm that this is the intended frame.
fig, ax = plt.subplots(figsize=(7, 7))
correlations = x_train.corr()
sns.heatmap(
    correlations,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    vmax=1,
    square=True,
    linewidths=.5,
    ax=ax,
)
ax.set_ylim(len(x_train.columns))
plt.xticks(rotation=90, fontsize=8)
plt.show()

In [None]:
# 80/20 split of the filtered frame, then the same price and area limits are
# applied to each part. `df` was already restricted with these thresholds in
# the cells above, so on a top-to-bottom run the masks keep every row.
train, test = train_test_split(df, test_size=0.2, random_state=42)

train_outlier = train[
    (train["Cena"] > 250000)
    & (train["Cena"] < 4000000)
    & (train["Powierzchnia"] < 300)
]
test_outlier = test[
    (test["Cena"] > 250000)
    & (test["Cena"] < 4000000)
    & (test["Powierzchnia"] < 300)
]

In [None]:
# Imputers: column mean for float columns, most frequent value for object and
# integer columns.
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
binary_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# drop="first" leaves out one dummy column per categorical feature.
# NOTE(review): no `handle_unknown` is set, so a category that first appears
# at transform time is expected to raise — confirm this is acceptable.
categorical_encoder = OneHotEncoder(drop="first")

# Object columns are imputed and then one-hot encoded; the other two groups
# are only imputed.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", categorical_imputer),
        ("encoder", categorical_encoder)
    ]
)
binary_pipeline = binary_imputer
numeric_pipeline = numeric_imputer

# Columns are routed by dtype; the output column order is num, cat, bin.
# sparse_threshold=0 forces a dense array: the encoder emits a sparse block,
# and with the default threshold the combined output may come back as a scipy
# sparse matrix, which the later `pd.DataFrame(..., columns=features)` calls
# cannot consume.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, make_column_selector(dtype_include=float)),
        ("cat", categorical_pipeline, make_column_selector(dtype_include=object)),
        ("bin", binary_pipeline, make_column_selector(dtype_include=int))
    ],
    sparse_threshold=0,
)

In [None]:
# Fit the column transformer on the training features (every column except
# the first) and collect output column names in the transformer order
# num -> cat -> bin.
preprocessor = preprocessing.fit(train_outlier.iloc[:, 1:])
fitted_transformers = preprocessing.named_transformers_
num_features = list(fitted_transformers["num"].feature_names_in_)
cat_features = list(fitted_transformers["cat"].named_steps["encoder"].get_feature_names_out())
bin_features = list(fitted_transformers["bin"].feature_names_in_)
features = [*num_features, *cat_features, *bin_features]

In [None]:
# Targets are the first column of each split; this rebinds the `y_train` /
# `y_test` names used by the baseline section.
y_train = train_outlier.iloc[:, 0]
y_test = test_outlier.iloc[:, 0]

# Features are every remaining column, passed through the fitted transformer
# and wrapped in a frame labelled with the collected feature names.
train_preprocessed = pd.DataFrame(
    preprocessor.transform(train_outlier.iloc[:, 1:]),
    columns=features,
)
test_preprocessed = pd.DataFrame(
    preprocessor.transform(test_outlier.iloc[:, 1:]),
    columns=features,
)

### Random Forest Regressor

In [None]:
# Random forest with default hyper-parameters on the preprocessed data.
model = RandomForestRegressor()
model.fit(train_preprocessed, y_train)
print_metrics(model, train_preprocessed, test_preprocessed, y_train, y_test)

In [None]:
# Search space for the random forest.
# * max_features: 1.0 (use every feature at each split) replaces the "auto"
#   alias, which newer scikit-learn releases no longer accept for regressors.
# * criterion: the former `None` entry is not a valid criterion; every
#   candidate that sampled it failed to fit and wasted a search iteration.
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_features': [1.0, 'sqrt'],
    'criterion': ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    'max_depth': [int(x) for x in np.linspace(10, 200, num=5)],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'bootstrap': [True, False],
}

# 30 random candidates, each scored with 2-fold cross-validation; the sampling
# of candidates is seeded, the forests themselves are not.
rf_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_grid,
    n_iter=30,
    cv=2,
    verbose=2,
    random_state=42
).fit(train_preprocessed, y_train)

In [None]:
# Load a previously saved search object and show the parameters of its best
# estimator.
# NOTE(review): no cell visible here writes "models/grid_random_forest.pkl";
# the file has to exist on disk already — confirm where it is produced.
# NOTE(review): joblib.load unpickles the file; only load files you trust.
m1 = joblib.load("models/grid_random_forest.pkl")
m1.best_estimator_.get_params()

In [None]:
# Score the loaded search object on the preprocessed train and test splits.
print_metrics(m1, train_preprocessed, test_preprocessed, y_train, y_test)

In [None]:
# Show the parameters of the best random-forest candidate found by the search.
rf_search.best_estimator_.get_params()

In [None]:
# Score the fitted search on both splits. The first printed line is
# `get_params()` of the search object itself, not of its best estimator.
print_metrics(rf_search, train_preprocessed, test_preprocessed, y_train, y_test)

### Gradient Boosting Regressor

In [None]:
# Gradient boosting with default hyper-parameters on the preprocessed data.
model = GradientBoostingRegressor()
model.fit(train_preprocessed, y_train)
print_metrics(model, train_preprocessed, test_preprocessed, y_train, y_test)

In [None]:
# Search space for gradient boosting; the integer grids are evenly spaced
# values truncated to int.
random_grid = dict(
    loss=["squared_error", "absolute_error", "huber", "quantile"],
    n_estimators=np.linspace(start=10, stop=2000, num=10).astype(int).tolist(),
    learning_rate=[1e-05, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 30],
    max_depth=np.linspace(start=1, stop=200, num=5).astype(int).tolist(),
    subsample=[0.001, 0.01, 0.1, 0.5, 0.75, 1],
    random_state=[1],
)

# 30 random candidates, each scored with 2-fold cross-validation.
gbr_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(),
    param_distributions=random_grid,
    n_iter=30,
    cv=2,
    verbose=2,
    random_state=42,
)
gbr_search.fit(train_preprocessed, y_train)

In [None]:
# Show the parameters of the best gradient-boosting candidate found by the search.
gbr_search.best_estimator_.get_params()

In [None]:
# Score the fitted gradient-boosting search on both preprocessed splits.
print_metrics(gbr_search, train_preprocessed, test_preprocessed, y_train, y_test)

### MLP Regressor

In [None]:
# Multi-layer perceptron with default hyper-parameters on the preprocessed data.
model = MLPRegressor()
model.fit(train_preprocessed, y_train)
print_metrics(model, train_preprocessed, test_preprocessed, y_train, y_test)

In [None]:
# Search space for the MLP; `max_iter` is 20 evenly spaced values between 50
# and 1000, truncated to int.
random_grid = dict(
    hidden_layer_sizes=[(20,), (50,), (100,), (200,), (500,), (1000,), (2500,)],
    activation=["identity", "logistic", "tanh", "relu"],
    solver=["lbfgs", "sgd", "adam"],
    alpha=[1e-07, 1e-06, 0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1],
    learning_rate=["constant", "invscaling", "adaptive"],
    max_iter=np.linspace(start=50, stop=1000, num=20).astype(int).tolist(),
    momentum=[0.001, 0.01, 0.1, 0.5, 0.75, 0.9, 1],
    random_state=[1],
)

# 30 random candidates, each scored with 2-fold cross-validation.
mlp_search = RandomizedSearchCV(
    estimator=MLPRegressor(),
    param_distributions=random_grid,
    n_iter=30,
    cv=2,
    verbose=2,
    random_state=42,
)
mlp_search.fit(train_preprocessed, y_train)

In [None]:
# Load a previously saved MLP search object.
# NOTE(review): this path is only written by a `joblib.dump` cell further
# down, so on a fresh run the file must already exist from an earlier session.
# NOTE(review): joblib.load unpickles the file; only load files you trust.
model1 = joblib.load("models/grid_mlp.pkl")

In [None]:
# Show the parameters of the best estimator stored in the loaded search object.
model1.best_estimator_.get_params()

In [None]:
# Persist the fitted MLP search, overwriting any existing file at this path.
joblib.dump(mlp_search, 'models/grid_mlp.pkl')

In [None]:
# Show the parameters of the best MLP candidate found by the search.
mlp_search.best_estimator_.get_params()

In [None]:
# Score the fitted MLP search on both preprocessed splits.
print_metrics(mlp_search, train_preprocessed, test_preprocessed, y_train, y_test)

### Linear regression 

In [None]:
# Ordinary least squares on the preprocessed data. Prints R2 and MSE for the
# train split, then for the test split; each split is predicted once.
model = LinearRegression()
model.fit(train_preprocessed, y_train)
for split_x, split_y in ((train_preprocessed, y_train), (test_preprocessed, y_test)):
    split_pred = model.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

In [None]:
# Bayesian ridge regression with default hyper-parameters on the preprocessed
# data. Prints R2 and MSE for the train split, then for the test split.
model = BayesianRidge()
model.fit(train_preprocessed, y_train)
for split_x, split_y in ((train_preprocessed, y_train), (test_preprocessed, y_test)):
    split_pred = model.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

In [None]:
# BayesianRidge's iteration-limit parameter is called `n_iter` in older
# scikit-learn releases and `max_iter` in newer ones; use whichever name the
# installed version exposes so the grid is valid on both.
br_iter_param = "max_iter" if "max_iter" in BayesianRidge().get_params() else "n_iter"

# Search space for Bayesian ridge regression. (The previous version had a
# stray second comma after the first entry, which is a syntax error.)
random_grid = {
    br_iter_param: [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)],
    'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 1, 2, 5],
    'alpha_1': [0.0000001, 0.000001, 0.0000, 0.0001, 0.001,0.01, 0.1, 0.5, 1],
    'alpha_2': [0.0000001, 0.000001, 0.0000, 0.0001, 0.001,0.01, 0.1, 0.5, 1],
    'lambda_1': [0.0000001, 0.000001, 0.0000, 0.0001, 0.001,0.01, 0.1, 0.5, 1],
    'lambda_2': [0.0000001, 0.000001, 0.0000, 0.0001, 0.001,0.01, 0.1, 0.5, 1],
}

# The search wrapper must be RandomizedSearchCV: BayesianRidge itself does not
# accept `estimator`, `param_distributions` or `cv`. 20 random candidates are
# scored with 2-fold cross-validation, matching the other searches.
br_search = RandomizedSearchCV(
    estimator = BayesianRidge(),
    param_distributions = random_grid,
    n_iter = 20,
    cv = 2,
    verbose=2,
    random_state=42
).fit(train_preprocessed, y_train)

In [None]:
# R2 and MSE of the fitted Bayesian-ridge search: train split first, then the
# test split; each split is predicted once.
for split_x, split_y in ((train_preprocessed, y_train), (test_preprocessed, y_test)):
    split_pred = br_search.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

### Feature selection

In [None]:
# Recursive feature elimination sweep: for each target feature count, a
# random forest ranks the features, then a linear regression is fitted on the
# selected columns and its train/test MSE is recorded.
Trr=[]
Tss=[]

m=10
feature_counts = range(1, m)
for i in feature_counts:
    lm = RandomForestRegressor()
    rfe = RFE(lm,n_features_to_select=i)
    rfe = rfe.fit(train_preprocessed, y_train)

    # Boolean mask of the columns kept for this feature count.
    print(rfe.support_)
    selected_train = train_preprocessed.loc[:,rfe.support_]
    selected_test = test_preprocessed.loc[:,rfe.support_]

    LR = LinearRegression()
    LR.fit(selected_train, y_train)

    pred1 = LR.predict(selected_train)
    pred2 = LR.predict(selected_test)

    Trr.append(mean_squared_error(y_train, pred1))
    Tss.append(mean_squared_error(y_test, pred2))
    print(i)

# The recorded values are mean squared errors, so the curves are labelled as
# MSE, and the x axis shows the number of selected features instead of the
# list position.
plt.plot(feature_counts, Trr, label='Train MSE')
plt.plot(feature_counts, Tss, label='Test MSE')
plt.xlabel('Number of selected features')
plt.ylabel('Mean squared error')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Recursive feature elimination down to 5 features, removing one feature per
# step, with a default random forest as the ranking estimator.
selector = RFE(
    estimator=RandomForestRegressor(),
    n_features_to_select=5,
    step=1,
).fit(train_preprocessed, y_train)

In [None]:
# R2 and MSE of the fitted RFE selector: train split first, then the test
# split; each split is predicted once.
for split_x, split_y in ((train_preprocessed, y_train), (test_preprocessed, y_test)):
    split_pred = selector.predict(split_x)
    r2 = r2_score(split_y, split_pred)
    mse = mean_squared_error(split_y, split_pred)
    print(r2)
    print(mse)

In [None]:
from sklearn.feature_selection import SelectKBest, RFE, f_regression
import numpy as np

# Univariate F-test score of every preprocessed feature against the target
# (k='all' keeps all features, so this only ranks them).
bestfeatures = SelectKBest(score_func=f_regression, k='all')
fit = bestfeatures.fit(train_preprocessed, y_train)
print(fit.scores_)

# Two-column table: feature name next to its score rounded to 2 decimals.
dfscores = pd.DataFrame(np.around(fit.scores_, 2))
dfcolumns = pd.DataFrame(train_preprocessed.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1).set_axis(['Specs', 'Score'], axis=1)

# Print the 30 highest-scoring features, then display the first 100 rows.
print(featureScores.nlargest(30, 'Score'))
featureScores.head(100)