In [8]:
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the house-price workbook; the first spreadsheet column becomes the index.
df = pd.read_excel(io="house_price_Test.xlsx", index_col=0)

In [4]:
# Keep only the rows whose District is "Çankaya", then discard the two
# location columns (Neighborhood and District).
filtered_df = (
    df.loc[df["District"] == "Çankaya"]
    .drop(columns=["Neighborhood", "District"])
)

# Feature matrix: every remaining column except the last one.
# NOTE(review): presumably the last column is the target ("Price", which a
# later cell uses for Y) — confirm, otherwise the target leaks into X.
X = filtered_df.iloc[:, :-1].values

In [5]:
# Display the feature matrix (NumPy abbreviates large arrays in the repr).
X

array([[  8., 340.,  18., ...,   1.,   0.,   0.],
       [  4., 120.,  26., ...,   1.,   0.,   0.],
       [  5., 147.,  30., ...,   0.,   1.,   0.],
       ...,
       [  6., 180.,  25., ...,   1.,   0.,   0.],
       [  4., 155.,  23., ...,   1.,   0.,   0.],
       [  4., 125.,  40., ...,   0.,   1.,   0.]])

In [6]:

# Target vector: the "Price" column divided by 1e6 (i.e. expressed in millions).
# It is read from filtered_df rather than df so that Y has exactly one entry
# per row of X; taking it from the unfiltered frame gives a different number of
# samples and makes train_test_split reject the X/Y pair.
Y = filtered_df["Price"].values / 1_000_000

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

City                    0
District                0
Neighborhood            0
Oda + Salon Sayısı      0
Brüt / Net              0
Bina Yaşı               0
Kat Sayısı              1
Eşya Durumu           127
Banyo Sayısı            6
Yapının Durumu        637
Aidat                 809
Rent                    0
dtype: int64

In [None]:
# Disabled experiment: target-encode the Neighborhood and District columns.
# NOTE(review): the indices below are looked up in df.columns, but X is built
# from filtered_df after both columns were dropped, so they would not line up
# with X_train / X_test if this cell were re-enabled — rework before uncommenting.
#Neighborhood_index = df.columns.get_loc('Neighborhood')
#District_index = df.columns.get_loc('District')
#encoder = ce.TargetEncoder(cols=[Neighborhood_index,District_index])
#encoder.fit(X_train, y_train)
#X_train_encoded = encoder.transform(X_train).values
#X_test_encoded = encoder.transform(X_test).values

In [55]:
# Standardise the features using statistics from the training split only, so
# nothing about the test split influences the scaling.
feature_mean = np.mean(X_train, axis=0)
feature_std = np.std(X_train, axis=0)

# A column that is constant in the training split has zero standard deviation;
# divide by 1 there instead, which avoids NaN/inf and leaves the centred value.
feature_std = np.where(feature_std == 0, 1.0, feature_std)

X_train_norm = (X_train - feature_mean) / feature_std
X_test_norm = (X_test - feature_mean) / feature_std
# NOTE(review): the model cell further down is fitted on the unscaled X_train,
# not on X_train_norm — confirm whether these scaled arrays are still needed.

In [None]:
# Gradient-boosted tree regressor trained with squared-error loss.
# L1/L2 regularisation and early stopping are not configured here, so they stay
# at the library defaults. `verbose` is not a constructor argument of
# XGBRegressor (it is forwarded as an unused booster parameter and triggers a
# warning); per-round progress is requested through fit(..., eval_set=...,
# verbose=...) instead.
xgb = xg.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.01,     # small step size, paired with many boosting rounds
    n_estimators=1000,      # number of boosting rounds (trees)
    max_depth=12,           # maximum depth of each tree
    subsample=0.5,          # fraction of rows sampled for each tree
    colsample_bytree=0.2,   # fraction of columns sampled for each tree
    eval_metric='rmse',     # metric reported for any eval_set given to fit()
)

In [None]:
# Train the regressor on the training split (features as-is, targets in millions).
xgb.fit(X=X_train, y=y_train)

In [None]:
# Predict on both splits so training and held-out error can be compared.
y_pred_train, y_pred_test = (
    xgb.predict(features) for features in (X_train, X_test)
)

In [None]:
# Mean squared error for each split (targets were divided by 1e6, so the error
# is on that rescaled target).
mse_train, mse_test = MSE(y_train, y_pred_train), MSE(y_test, y_pred_test)
print(f'Train MSE: {mse_train}', f'Test MSE: {mse_test}', sep='\n')

In [None]:
# Bounds for the plot: the 1st and 99th percentiles of the actual test targets.
lower_threshold, upper_threshold = np.percentile(y_test, [1, 99])

# Clamp both the actual and the predicted values into that range so that a few
# extreme points do not stretch the axes.
y_test_clipped, y_pred_clipped = (
    np.clip(values, lower_threshold, upper_threshold)
    for values in (y_test, y_pred_test)
)

# Actual (x) versus predicted (y) on the test split.
plt.scatter(y_test_clipped, y_pred_clipped, c='crimson')
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [56]:
# Actual versus predicted test targets on a log(1 + y) scale, which compresses
# the large values so the bulk of the points is easier to compare.
# The test-split predictions are stored in y_pred_test; no variable named
# y_pred is assigned in the cells above, so referring to it raised NameError
# on a fresh kernel.
y_test_log = np.log1p(y_test)
y_pred_log = np.log1p(y_pred_test)

plt.scatter(y_test_log, y_pred_log, c='crimson')
plt.xlabel('log(y_test + 1)')
plt.ylabel('log(y_pred + 1)')
plt.show()

City                    0
District                0
Neighborhood            0
Oda + Salon Sayısı      0
Brüt / Net              0
Bina Yaşı               0
Kat Sayısı              0
Eşya Durumu           127
Banyo Sayısı            6
Yapının Durumu        637
Aidat                 809
Rent                    0
dtype: int64

In [57]:
# Coefficient of determination on the held-out split.
# Uses y_pred_test (the test-split predictions); the name y_pred is never
# assigned in the cells above. r2_score is imported in the notebook's import cell.
r2 = r2_score(y_test, y_pred_test)
print("R^2:", r2)
