## Verilerin lineer regresyon ile tahmin edilmesi

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/clean_data.csv')

In [3]:
# Convert the district, neighborhood and type columns to the pandas category
# dtype, then print the frame summary.
for column_name in ('district', 'neighborhood', 'type'):
    df[column_name] = df[column_name].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2338 entries, 0 to 2337
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   district      2338 non-null   category
 1   neighborhood  2338 non-null   category
 2   square_meter  2338 non-null   float64 
 3   gold_gr       2338 non-null   float64 
 4   type          2338 non-null   category
dtypes: category(3), float64(2)
memory usage: 66.8 KB


In [4]:

# Column groups used by the preprocessing step: the numeric input that is
# scaled, and the inputs that are one-hot encoded.
numerical_features = ['square_meter']
categorical_features = ['district', 'neighborhood', 'type']

In [5]:
# Peek at five random rows; the fixed seed keeps the displayed sample identical
# on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,district,neighborhood,square_meter,gold_gr,type
381,Çatalca,Çanakça,1177.0,1220.33151,2
2004,Sultanbeyli,Yavuz Selim,175.0,918.529094,0
1742,Silivri,Seymen,838.0,406.77717,2
1038,Silivri,Yeni,1260.0,984.138315,2
676,Arnavutköy,Mustafa Kemal Paşa,200.0,2427.541176,3


In [6]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Categories unseen during fit are ignored at transform time
# (handle_unknown='ignore'). The numeric step stays first and the categorical
# step second, so the output column order is unchanged.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
full_pipeline = ColumnTransformer([numeric_step, categorical_step])

In [7]:
# Separate the target column from the model inputs (every other column).
y = df['gold_gr']
X = df.drop(columns=['gold_gr'])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Chain the preprocessing step and the linear regressor into one estimator so
# both are fitted and applied together.
model = Pipeline(steps=[
    ('preparation', full_pipeline),
    ('model', LinearRegression()),
])

In [10]:
# Fit the preprocessing step and the regressor on the training split.
model.fit(X=X_train, y=y_train)

In [11]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target

# One metric per line, in the order MSE, RMSE, R^2.
print(f"MSE: {mse}", f"RMSE: {rmse}", f"R^2: {r2}", sep="\n")

MSE: 4779784.007142674
RMSE: 2186.2717139328024
R^2: 0.41743019480970545


In [12]:
# Take the fitted coefficient vector from the pipeline's regression step, then
# print how many coefficients there are followed by their values.
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances), feature_importances, sep="\n")

430
[ 1.13782124e+03  1.19846559e+03 -8.06230479e+02  4.12663514e+03
  1.55698207e+03 -2.55568912e+03  2.33039592e+02  3.88967402e+02
 -2.05182327e+03 -1.12789061e+03  2.70264298e+03 -3.02441262e+02
  4.15342797e+02  8.59382177e+00  2.08208671e+03  6.85348309e+02
 -6.03922453e+02 -2.84066677e+02  3.03357487e+03  6.46338608e+02
 -1.30220184e+03 -1.12991842e+03  1.47430549e+03 -5.75867942e+03
 -1.92186406e+03  2.39205380e+02  1.87640883e+03 -1.57359461e+03
 -1.04667054e+03 -4.37608436e+02 -1.05674234e+03  1.63979733e+03
 -1.44960734e+03  1.78007177e+03  3.77859452e+02 -4.70025463e+02
 -1.12382079e+03  5.37130980e+02  2.50808277e+03 -3.44975710e+03
 -9.96506361e+02  4.81607540e+02 -1.28989878e+03 -4.65005727e+02
  1.39563880e+03 -6.62060246e+02 -1.81602288e+03  2.10328924e+02
 -1.25742968e+03 -1.42759510e+03  4.69154227e+03  7.85843557e+03
  1.57276082e+02 -1.15901196e+03  1.00058424e+04  2.33039592e+02
  7.77261185e+02 -4.07162465e+03 -3.50570932e+02 -1.62163060e+03
  1.73483809e+03  8.4

In [13]:
print("Numerical Features")
# The numeric step is listed first in the ColumnTransformer, so the leading
# coefficients pair up with numerical_features in order.
for feature_name, coefficient in zip(numerical_features, feature_importances):
    print(feature_name, coefficient)

Numerical Features
square_meter 1137.8212364293709


In [14]:
print("Categorical Features")
# transformers_[1] is the fitted ('cat', OneHotEncoder, columns) entry of the
# ColumnTransformer; its categories_ holds one array of categories per
# categorical feature, in the order of categorical_features.
encoder = model.named_steps['preparation'].transformers_[1][1]

# The one-hot columns follow the numeric columns, with one contiguous run of
# columns per categorical feature. The offset therefore has to advance past
# each feature's categories; restarting at the first one-hot column for every
# feature would pair neighborhood/type categories with district coefficients.
offset = len(numerical_features)
for categories in encoder.categories_:
    for position, category in enumerate(categories):
        print(category, feature_importances[offset + position])
    offset += len(categories)

Categorical Features
Adalar 1198.4655887666945
Arnavutköy -806.2304787693563
Ataşehir 4126.635135293238
Avcılar 1556.9820698110045
Bahçelievler -2555.689115811511
Bayrampaşa 233.0395915536774
Bağcılar 388.96740150999614
Başakşehir -2051.8232725699304
Beykoz -1127.890610706442
Beylikdüzü 2702.642984333838
Beyoğlu -302.4412620209239
Büyükçekmece 415.3427972535298
Esenler 8.593821770071122
Esenyurt 2082.086710371948
Eyüpsultan 685.3483091677409
Fatih -603.9224529751568
Gaziosmanpaşa -284.06667710301065
Güngören 3033.574870471353
Kadıköy 646.3386081267386
Kartal -1302.2018394767815
Kağıthane -1129.9184214447605
Küçükçekmece 1474.3054924771654
Maltepe -5758.67942053203
Pendik -1921.8640562757041
Sancaktepe 239.20537953755627
Sarıyer 1876.4088320644332
Silivri -1573.5946142450302
Sultanbeyli -1046.6705378726747
Sultangazi -437.6084364603505
Tuzla -1056.742339611261
Zeytinburnu 1639.79733433502
Çatalca -1449.6073363409205
Çekmeköy 1780.0717710785652
Ümraniye 377.85945182980925
Üsküdar -470.02

In [32]:
# Build a one-row frame describing a single listing and print the model's
# prediction for it.
new_data = pd.DataFrame([{
    'district': 'Silivri',
    'neighborhood': 'İsmetpaşa',
    'type': 2,
    'square_meter': 500,
}])

print(model.predict(new_data))

[-346.95182024]


In [33]:
# Print the dataset rows with district 'Silivri', neighborhood 'İsmetpaşa' and type 2.
print(df.loc[
    (df['type'] == 2)
    & (df['neighborhood'] == 'İsmetpaşa')
    & (df['district'] == 'Silivri')
])

     district neighborhood  square_meter      gold_gr type
16    Silivri    İsmetpaşa         604.0   257.188146    2
103   Silivri    İsmetpaşa         447.0   393.655326    2
245   Silivri    İsmetpaşa        3845.0   649.531288    2
341   Silivri    İsmetpaşa        1125.0   354.289793    2
464   Silivri    İsmetpaşa         220.0   223.071351    2
516   Silivri    İsmetpaşa         270.0   170.583975    2
553   Silivri    İsmetpaşa        1004.0   259.812515    2
559   Silivri    İsmetpaşa         725.0   426.459936    2
581   Silivri    İsmetpaşa         595.0   249.315040    2
664   Silivri    İsmetpaşa         650.0   249.315040    2
674   Silivri    İsmetpaşa        1226.0   682.335898    2
756   Silivri    İsmetpaşa        1032.0   839.798029    2
857   Silivri    İsmetpaşa         246.0    90.540725    2
880   Silivri    İsmetpaşa         250.0   111.535676    2
966   Silivri    İsmetpaşa         235.0   196.827663    2
968   Silivri    İsmetpaşa         375.0   107.599122   

In [87]:
def tolerance_r2(y_true, y_pred, tolerance):
    """Compute an R^2 score that forgives errors within an absolute tolerance.

    Residuals whose magnitude is at most ``tolerance`` are treated as zero
    before the residual sum of squares (SSR) is formed. The total sum of
    squares (SST) is computed from ``y_true`` as usual.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    tolerance : float
        Largest absolute error that still counts as a perfect prediction.

    Returns
    -------
    float
        ``1 - SSR / SST``. When ``y_true`` is constant (``SST == 0``) the
        result is 1.0 if every residual is within tolerance and 0.0
        otherwise, instead of NaN or -inf.
    """
    # Work on plain float arrays so values are paired by position; subtracting
    # two pandas Series would align them on their index labels instead, and
    # plain lists do not support subtraction at all.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    residuals = y_pred - y_true
    residuals = np.where(np.abs(residuals) <= tolerance, 0.0, residuals)
    ssr = np.sum(residuals ** 2)
    sst = np.sum((y_true - np.mean(y_true)) ** 2)
    if sst == 0:
        # Constant targets: avoid dividing by zero.
        return 1.0 if ssr == 0 else 0.0
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    """Compute an R^2 score that forgives errors within a relative tolerance.

    A residual is treated as zero when its magnitude is at most ``tolerance``
    times the magnitude of the corresponding true value. The residual sum of
    squares (SSR) is formed from what remains; the total sum of squares (SST)
    is computed from ``y_true`` as usual.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    tolerance : float
        Largest relative error (as a fraction, e.g. 0.5 for 50%) that still
        counts as a perfect prediction.

    Returns
    -------
    float
        ``1 - SSR / SST``. When ``y_true`` is constant (``SST == 0``) the
        result is 1.0 if every residual is within tolerance and 0.0
        otherwise, instead of NaN or -inf.
    """
    # Work on plain float arrays so values are paired by position; subtracting
    # two pandas Series would align them on their index labels instead, and
    # plain lists do not support subtraction at all.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    residuals = y_pred - y_true
    # Compare |error| against tolerance * |target| rather than dividing by the
    # signed target: a negative target would otherwise give a negative ratio
    # that always passes the check, and a zero target would divide by zero.
    within_tolerance = np.abs(residuals) <= tolerance * np.abs(y_true)
    residuals = np.where(within_tolerance, 0.0, residuals)
    ssr = np.sum(residuals ** 2)
    sst = np.sum((y_true - np.mean(y_true)) ** 2)
    if sst == 0:
        # Constant targets: avoid dividing by zero.
        return 1.0 if ssr == 0 else 0.0
    return 1 - (ssr / sst)

In [88]:
# Print the plain R^2, the absolute-tolerance R^2 (tolerance 10000) and the
# relative-tolerance R^2 (tolerance 50%), one per line.
print(
    r2_score(y_test, y_pred),
    tolerance_r2(y_test, y_pred, 10000),
    tolerance_percentage_r2(y_test, y_pred, 0.50),
    sep="\n",
)

0.6720818495810821
1.0
0.7976844106035988
