In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
data = pd.read_csv('data_cleaned.csv')

In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          356 non-null    object
 1   district      356 non-null    object
 2   neighborhood  356 non-null    object
 3   room          356 non-null    int64 
 4   living_room   356 non-null    int64 
 5   area          356 non-null    int64 
 6   age           356 non-null    int64 
 7   floor         356 non-null    int64 
 8   price         356 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 25.2+ KB
None


In [5]:
data['city'] = data['city'].astype('category')
data['district'] = data['district'].astype('category')
data['neighborhood'] = data['neighborhood'].astype('category')
data['room'] = data['room'].astype('int')
data['living_room'] = data['living_room'].astype('int')
data['area'] = data['area'].astype('int')
data['age'] = data['age'].astype('int')
data['floor'] = data['floor'].astype('int')
data['price'] = data['price'].astype('int')

In [6]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          356 non-null    category
 1   district      356 non-null    category
 2   neighborhood  356 non-null    category
 3   room          356 non-null    int64   
 4   living_room   356 non-null    int64   
 5   area          356 non-null    int64   
 6   age           356 non-null    int64   
 7   floor         356 non-null    int64   
 8   price         356 non-null    int64   
dtypes: category(3), int64(6)
memory usage: 34.6 KB
None


In [7]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [8]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [10]:
X = data.drop('price', axis=1)
y = data['price']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [13]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preparation', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [15]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 77985657.77293152
RMSE: 8830.94886028288
R^2: 0.5116839133690279


In [16]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

346
[-4.39075955e+03  0.00000000e+00  8.55166244e+03 -2.67997633e+03
  3.69642482e+02 -1.11816155e+03 -3.28105435e+03 -1.21465194e+03
  4.56322087e+03 -4.74725053e+01 -3.59004190e+03  1.09645249e+04
  4.67958252e+03 -3.49636267e+03  8.52458897e+02  5.13126378e+03
 -4.99742530e+03 -1.27896457e+03 -3.40434246e+03 -2.48924535e+03
 -9.24951840e+03 -2.01781849e+03 -4.04703432e+03  1.39639934e+04
  3.66894985e+03 -7.98574201e+03 -1.09159690e+03  3.38103527e+02
  4.66796461e+03 -1.59037263e+03 -1.30291880e+03  1.29557923e+04
  3.23172420e+03 -3.60913711e+03 -1.59542363e+03  2.99147683e+03
 -3.12136367e+03 -7.48040723e+03  5.03410636e+03 -1.30961250e+03
 -2.44325341e+01 -1.96392704e+03 -3.44813348e+03 -3.12136367e+03
  7.18814242e+02  1.46645135e+03 -7.13138169e+02  5.13126378e+03
  8.74602298e+03 -6.66229455e+03  6.32911383e+03  3.22355586e+03
  2.08569269e+03  4.18061312e+03  6.18779705e+03  1.28743842e+04
 -3.78878549e+03 -2.03976546e+03 -3.60664937e+03  7.10128660e+03
 -1.34519943e+04  1.3

In [17]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room -4390.759547285128
living_room 0.0
area 8551.662440572067
age -2679.9763289126317
floor 369.642481528895


In [18]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
adana -1118.1615521541116
aksaray -3281.05434896373
amasya -1214.6519351742631
ankara 4563.220872683214
antalya -47.47250529848559
aydin -3590.0418994475144
balikesir 10964.524944753008
bolu 4679.582520883026
burdur -3496.3626726500033
bursa 852.4588974929633
canakkale 5131.263776615076
corum -4997.425298063434
denizli -1278.9645728724824
duzce -3404.342456247201
eskisehir -2489.245349861578
gaziantep -9249.518396539344
hatay -2017.8184908786093
isparta -4047.0343195063288
istanbul 13963.993431637979
izmir 3668.949851416694
kayseri -7985.7420068659785
kirikkale -1091.596901850089
kirklareli 338.1035274310592
kocaeli 4667.964610854075
konya -1590.372627869528
mersin -1302.918798556544
mugla 12955.792309403221
rize 3231.724204801477
samsun -3609.137114974994
tekirdag -1595.4236307755584
tokat 2991.476834253185
trabzon -3121.3636747930177
zonguldak -7480.407228881726
adalar -1118.1615521541116
alanya -3281.05434896373
altieylul -1214.6519351742631
altindag 4563.220872

In [20]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[33918.24235414]


In [22]:
print(data[(data['city'] == 'manisa') & (data['district'] == 'yunusemre') & (data['neighborhood'] == 'guzelyurt')])

Empty DataFrame
Columns: [city, district, neighborhood, room, living_room, area, age, floor, price]
Index: []


In [23]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [24]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.5116839133690279
0.6333018826780294
0.700380543726921
