In [20]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

## Import Data

In [21]:
data = pd.read_csv('..\product_data\processed_data.csv')
data = data.iloc[:,2:]
X = data.drop(['price'], axis = 1)
Y = data[['price']]
print(X)
print(Y)

      floor_area  age  floor  roof  room_num  hall_num  \
0             75    0     14     0         1         1   
1            165   17      5     0         3         1   
2            145    5      7     0         3         1   
3            165   15     10     0         3         1   
4            145   12      1     0         3         1   
...          ...  ...    ...   ...       ...       ...   
4137         215    0      9     0         4         1   
4138         200    0     10     0         4         1   
4139         200    2      8     0         4         1   
4140         110    0     10     0         3         1   
4141         130    0     11     0         3         1   

      neighborhood_19MayısMahallesi  neighborhood_AcıbademMahallesi  \
0                               0.0                             0.0   
1                               0.0                             0.0   
2                               0.0                             0.0   
3                  

## Train-Test Split

In [22]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25)

## Feature Selection With Brute Force

In [23]:
regression = GradientBoostingRegressor(n_estimators = 1000, random_state=1, alpha = 0.85)

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
x_except_neighborhood = x_train.iloc[:,[x for x in range(6)]]
print(x_except_neighborhood)
regression = GradientBoostingRegressor(n_estimators = 250, random_state=0, alpha = 0.85)
efs_regression = EFS(regression, scoring = 'neg_mean_squared_error', min_features = 1, max_features = 6, print_progress = True, cv = 20)
efs_regression.fit(x_except_neighborhood, y_train.values.ravel())
print(f'{efs_regression.best_score_}')
print(f'{efs_regression.best_idx_}')
print(f'{efs_regression.best_feature_names_}')
print(pd.DataFrame.from_dict(efs_regression.get_metric_dict()).T)

As seen above, roof column which specifies whether apartment has a terrace effects model in a negative manner. Optimal feature selection would exclude roof column.

In [24]:
x_train = x_train.drop('roof', axis = 1)
x_test = x_test.drop('roof', axis = 1)

In [25]:
regression.fit(x_train, y_train.values.ravel())

GradientBoostingRegressor(alpha=0.85, n_estimators=1000, random_state=1)

In [26]:
y_pred = regression.predict(x_test)

In [27]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

r2 = r2_score(y_test, y_pred)
adjusted_r2 = 1 - (1-r2)*((x_train.shape[0]-1)/(x_train.shape[0]-x_train.shape[1]-1))
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'R-squared: {r2}')
print(f'Adjusted R-squared: {adjusted_r2}')
print(f'Mean absolute percentage error: {mape}')

R-squared: 0.775967527037246
Adjusted R-squared: 0.7740757296039782
Mean absolute percentage error: 0.1547019827419966


In [11]:
data = data.drop(['roof'], axis = 1)

In [12]:
def encode_neighborhood(neighborhood: str):
    A = []
    for i in data.columns[6:]:
        if neighborhood in i:
            A = A + [1]
        else:
            A = A + [0]
    return A

In [13]:
import numpy as np
def predict_price(floor_area: int, age: int, floor: int, room_num : int , hall_num: int, neighborhood: str):
    A = [floor_area, age, floor,room_num, hall_num] + encode_neighborhood(neighborhood)
    A = np.array(A).reshape(1,-1)
    prediction = regression.predict(A)
    return prediction

In [14]:
print(predict_price(130,7,5,3,1,'Bostancı'))

[4094303.04460011]


In [15]:
print(x_test.columns)

Index(['floor_area', 'age', 'floor', 'room_num', 'hall_num',
       'neighborhood_19MayısMahallesi', 'neighborhood_AcıbademMahallesi',
       'neighborhood_BostancıMahallesi', 'neighborhood_CaddebostanMahallesi',
       'neighborhood_CaferağaMahallesi', 'neighborhood_DumlupınarMahallesi',
       'neighborhood_ErenköyMahallesi', 'neighborhood_EğitimMahallesi',
       'neighborhood_FenerbahçeMahallesi', 'neighborhood_FeneryoluMahallesi',
       'neighborhood_FikirtepeMahallesi', 'neighborhood_GöztepeMahallesi',
       'neighborhood_HasanpaşaMahallesi', 'neighborhood_KozyatağıMahallesi',
       'neighborhood_KoşuyoluMahallesi', 'neighborhood_MerdivenköyMahallesi',
       'neighborhood_OsmanağaMahallesi', 'neighborhood_RasimpaşaMahallesi',
       'neighborhood_SahrayıCeditMahallesi', 'neighborhood_SuadiyeMahallesi',
       'neighborhood_ZühtüpaşaMahallesi'],
      dtype='object')


In [16]:
trial = pd.read_csv(r'C:\Users\atiak\Desktop\hepsi_emlak\product_data\processed_data.csv')
trial = trial.iloc[:,2:]

In [17]:
trial.to_csv(r'C:\Users\atiak\Desktop\data.csv')

In [19]:
import joblib
filename = 'gradient_tree_model.joblib'
joblib.dump(regression, filename)

['gradient_tree_model.joblib']