## Verilerin lenear regresyon ile tahmin edilmesi

In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [71]:
df=pd.read_csv('data/clean_data.csv')

In [72]:
df['city']=df['city'].astype('category')
df['district']=df['district'].astype('category')
df['neighborhood']=df['neighborhood'].astype('category')
df['type']=df['type'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2278 entries, 0 to 2277
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          2278 non-null   category
 1   district      2278 non-null   category
 2   neighborhood  2278 non-null   category
 3   square_meter  2278 non-null   float64 
 4   gold_gr       2278 non-null   float64 
 5   type          2278 non-null   category
dtypes: category(4), float64(2)
memory usage: 59.5 KB


In [73]:

categorical_features=['city','district','neighborhood','type']
numerical_features=['square_meter']

In [74]:
df.sample(5)

Unnamed: 0,city,district,neighborhood,square_meter,gold_gr,type
279,İstanbul,Silivri,Fener Mah.,221.0,1.187497,1
1838,İstanbul,Başakşehir,Bahçeşehir 2. Kısım Mah.,395.0,9.268341,3
204,İstanbul,Beykoz,Riva Mah.,710.0,6.653329,3
1649,İstanbul,Esenyurt,Osmangazi Mah.,255.0,14.922882,3
813,İstanbul,Silivri,Akören Mah.,4378.0,1.108973,2


In [75]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [76]:
X=df.drop('gold_gr',axis=1)
y=df['gold_gr']

In [77]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [79]:
model.fit(X_train, y_train)

In [80]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 5.383414013869721
RMSE: 2.3202185271800846
R^2: 0.6720818495810821


In [81]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

385
[-2.76612092e-02  5.07968730e-12  5.33185325e+00 -3.66974772e+00
  3.18038666e+00 -6.74298915e-01  3.49789565e+00 -6.45168662e+00
  7.52601855e+00 -5.17371770e+00 -2.25508075e+00  2.86610024e+00
  3.75021770e+00 -1.83813194e+00  1.55333296e+00  2.92205604e+00
  3.52143840e+00  3.85744630e+00  2.25756714e+00  1.65610218e+00
  6.50413547e+00  1.38616435e+00 -5.09072164e+00 -1.13783335e+00
 -2.61232371e+00 -4.14823450e+00 -2.21161571e+00 -7.51715426e-01
 -4.01475818e+00  1.19961137e+00 -6.14525124e+00  3.00798260e+00
 -5.88455591e+00  1.61781557e+00 -3.69365390e+00  1.17202765e-01
 -1.76919771e+00 -4.49949492e+00 -1.62325214e+00  1.82126702e+00
 -2.06914191e+00  2.63857292e+00  9.54462343e-01  9.86122470e-03
  3.75934443e+00 -1.97984958e+00 -2.26640282e+00 -3.23241932e-01
 -1.95011547e+00 -2.46600224e+00  1.59081161e+00 -5.82406338e-01
 -3.02872438e+00 -1.94948158e+00  4.15038320e+00  5.48308702e+00
  1.00318872e+00  5.47456773e+00  5.40432612e+00  2.97806203e+00
 -2.25888248e+00  5.6

In [82]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
square_meter -0.027661209178147534


In [83]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
İstanbul 5.079687302936924e-12
Adalar 5.079687302936924e-12
Arnavutköy 5.331853247391234
Ataşehir -3.6697477192595294
Avcılar 3.1803866602271964
Bahçelievler -0.6742989154301606
Bayrampaşa 3.49789565378177
Bağcılar -6.451686622519867
Başakşehir 7.526018551388165
Beykoz -5.173717695101215
Beylikdüzü -2.2550807525615832
Beyoğlu 2.86610023956954
Büyükçekmece 3.750217700370376
Esenyurt -1.8381319428860243
Eyüpsultan 1.553332963287751
Fatih 2.9220560375962377
Gaziosmanpaşa 3.521438402998315
Kartal 3.8574463048743257
Kağıthane 2.2575671379863773
Küçükçekmece 1.6561021784520515
Maltepe 6.504135465164894
Pendik 1.3861643501997596
Sancaktepe -5.090721638315794
Sarıyer -1.137833350387768
Silivri -2.612323706838312
Sultanbeyli -4.148234499431321
Sultangazi -2.2116157054704133
Tuzla -0.7517154264732556
Zeytinburnu -4.014758181268478
Çatalca 1.1996113732522833
Çekmeköy -6.145251236631494
Ümraniye 3.007982598305531
Üsküdar -5.884555912281927
Şile 1.6178155710950266
Şişli -3.6936

In [85]:
new_data = pd.DataFrame({
    'city': ['İstanbul'],
    'district': ['Silivri'],
    'neighborhood': ['İsmetpaşa Mah.'],
    'type': [2],
    'square_meter': [1275]
})

print(model.predict(new_data))

[0.45108385]


In [86]:
print(df[(df['city'] == 'İstanbul') & (df['district'] == 'Silivri') & (df['neighborhood'] == 'İsmetpaşa Mah.')& (df['type'] == 2)])

          city district    neighborhood  square_meter   gold_gr type
16    İstanbul  Silivri  İsmetpaşa Mah.         604.0  0.425808    2
100   İstanbul  Silivri  İsmetpaşa Mah.         447.0  0.880661    2
233   İstanbul  Silivri  İsmetpaşa Mah.        3845.0  0.168929    2
328   İstanbul  Silivri  İsmetpaşa Mah.        1125.0  0.314924    2
451   İstanbul  Silivri  İsmetpaşa Mah.         220.0  1.013961    2
498   İstanbul  Silivri  İsmetpaşa Mah.         270.0  0.631792    2
532   İstanbul  Silivri  İsmetpaşa Mah.        1004.0  0.258777    2
537   İstanbul  Silivri  İsmetpaşa Mah.         725.0  0.588221    2
560   İstanbul  Silivri  İsmetpaşa Mah.         595.0  0.419017    2
640   İstanbul  Silivri  İsmetpaşa Mah.         650.0  0.383562    2
651   İstanbul  Silivri  İsmetpaşa Mah.        1226.0  0.556555    2
727   İstanbul  Silivri  İsmetpaşa Mah.        1032.0  0.813758    2
822   İstanbul  Silivri  İsmetpaşa Mah.         246.0  0.368052    2
843   İstanbul  Silivri  İsmetpaşa

In [87]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [88]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.6720818495810821
1.0
0.7976844106035988
