# Preprocessing

In [1]:
import pandas as pd

# Load the raw listing table from the project data folder and print it as
# read, before any cleaning is applied.
data = pd.read_csv(filepath_or_buffer=r'product_data/data.csv')
print(data)

      Unnamed: 0                                             price room_hall  \
0              0   \r\n              2.300.000\r\n              TL     1 + 1   
1              1   \r\n              4.690.000\r\n              TL     3 + 1   
2              2   \r\n              6.500.000\r\n              TL     3 + 1   
3              3   \r\n              6.300.000\r\n              TL     3 + 1   
4              4   \r\n              5.500.000\r\n              TL     3 + 1   
...          ...                                               ...       ...   
4441        4441   \r\n              9.500.000\r\n              TL     4 + 1   
4442        4442  \r\n              12.120.000\r\n              TL     4 + 1   
4443        4443   \r\n              4.300.000\r\n              TL     4 + 1   
4444        4444  \r\n              10.400.000\r\n              TL     3 + 1   
4445        4445   \r\n              3.650.000\r\n              TL     3 + 1   

                                floor_a

In [2]:
# function for reducing a text value (price, floor, age, ...) to a plain integer
def convert_numeric(x: str) -> int:
    """Drop every non-numeric character from ``x`` and return the rest as an int.

    Typical inputs are scraped strings such as ``'2.300.000TL'`` or
    ``'14.Kat'``. Separators are discarded, not interpreted, so ``'5.0'``
    yields ``50``.

    Parameters
    ----------
    x : str
        Text that may mix digits with separators, units or words.

    Returns
    -------
    int
        The integer spelled by the decimal digits of ``x`` in their original
        order, or ``0`` when ``x`` contains no decimal digit at all.
    """
    # str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
    # characters such as '²', which int() cannot parse and would raise on.
    digits = ''.join(ch for ch in x if ch.isdecimal())
    return int(digits) if digits else 0

In [3]:
# Strip all whitespace (spaces, tabs, carriage returns, newlines) from a string
def exclude_spaces(x:str):
    """Return ``x`` with every whitespace character removed.

    Whitespace anywhere in the string is dropped, not only at the ends, so
    ``'1 + 1'`` becomes ``'1+1'``.
    """
    kept_chars = [ch for ch in x if not ch.isspace()]
    return ''.join(kept_chars)
# Cast every column to str so the same text cleaner can run on every cell.
data = data.astype(str)
# Clean column by column with Series.map. This is equivalent to the
# element-wise DataFrame.applymap, which is deprecated in recent pandas
# releases and emits a FutureWarning there.
data = data.apply(lambda column: column.map(exclude_spaces))
print(data)

     Unnamed: 0         price room_hall floor_area        age   floor  \
0             0   2.300.000TL       1+1       75m2  SıfırBina  14.Kat   
1             1   4.690.000TL       3+1      165m2  17Yaşında   5.Kat   
2             2   6.500.000TL       3+1      145m2   5Yaşında   7.Kat   
3             3   6.300.000TL       3+1      165m2  15Yaşında  10.Kat   
4             4   5.500.000TL       3+1      145m2  12Yaşında   1.Kat   
...         ...           ...       ...        ...        ...     ...   
4441       4441   9.500.000TL       4+1      215m2  SıfırBina   9.Kat   
4442       4442  12.120.000TL       4+1      200m2  SıfırBina  10.Kat   
4443       4443   4.300.000TL       4+1      200m2   2Yaşında   8.Kat   
4444       4444  10.400.000TL       3+1      110m2  SıfırBina  10.Kat   
4445       4445   3.650.000TL       3+1      130m2  SıfırBina  11.Kat   

                       neighborhood  
0     Kadıköy,SahrayıCeditMahallesi  
1        Kadıköy,FeneryoluMahallesi  
2        

## Data Mapping and Imputation

In [4]:
# data mapping
# a studio flat is written as one room and no hall; other values are kept as is
data['room_hall'] = data['room_hall'].map({
    'Stüdyo':'1 + 0'}).fillna(data['room_hall'])

# 'SıfırBina' is encoded as age 0; other values are kept as is
data['age'] = data['age'].map({
    'SıfırBina': '0'}).fillna(data['age'])
# whether the apartment is on the roof / top floor or not.
# This must be derived before 'floor' is overwritten below.
data['roof'] = data['floor'].map({
    'ÇatıKatı':'1',
    'EnÜstKat':'1',
    }).fillna('0')

# Numeric view of the floor column, computed once and reused for imputation.
# Labels without any digit count as 0 here (see convert_numeric).
floor_numbers = data['floor'].apply(convert_numeric)
mean_floor = str(int(floor_numbers.mean()))
# Cast to int before str(): the median is a float, and a string such as '5.0'
# would later be read by convert_numeric as 50.
median_floor = str(int(floor_numbers.median()))

data['floor'] = data['floor'].map({
    'Zemin' : '0',
    'GirişKatı': '0',
    'YüksekGiriş':'0',
    'BahçeKatı':'0',
    # imputing floors without a concrete number by the (truncated) mean
    'AraKat':mean_floor,
    'ÇatıKatı':mean_floor,
    'EnÜstKat':mean_floor,
    # imputing empty values by the (truncated) median
    '':median_floor
    }).fillna(data['floor'])
print(data)

     Unnamed: 0         price room_hall floor_area        age   floor  \
0             0   2.300.000TL       1+1       75m2          0  14.Kat   
1             1   4.690.000TL       3+1      165m2  17Yaşında   5.Kat   
2             2   6.500.000TL       3+1      145m2   5Yaşında   7.Kat   
3             3   6.300.000TL       3+1      165m2  15Yaşında  10.Kat   
4             4   5.500.000TL       3+1      145m2  12Yaşında   1.Kat   
...         ...           ...       ...        ...        ...     ...   
4441       4441   9.500.000TL       4+1      215m2          0   9.Kat   
4442       4442  12.120.000TL       4+1      200m2          0  10.Kat   
4443       4443   4.300.000TL       4+1      200m2   2Yaşında   8.Kat   
4444       4444  10.400.000TL       3+1      110m2          0  10.Kat   
4445       4445   3.650.000TL       3+1      130m2          0  11.Kat   

                       neighborhood roof  
0     Kadıköy,SahrayıCeditMahallesi    0  
1        Kadıköy,FeneryoluMahallesi  

## Data Conversion

In [5]:
# strip the last two characters of floor_area (the 'm2' unit)
data['floor_area'] = data['floor_area'].str[:-2]
# strip the first eight characters of neighborhood (the 'Kadıköy,' prefix,
# which is constant for all data)
data['neighborhood'] = data['neighborhood'].str[8:]
# split 'rooms+halls' on '+' and keep each side as its own column
room_hall_parts = data['room_hall'].str.split('+', expand = True)
data['room_num'] = room_hall_parts.iloc[:, 0]
data['hall_num'] = room_hall_parts.iloc[:, 1]
data = data.drop(columns = ['room_hall'])
# reduce every column except neighborhood to plain integers
numeric_columns = ['price','room_num','hall_num','floor_area','age','floor','roof']
for column in numeric_columns:
    data[column] = data[column].transform(convert_numeric)

# one-hot encode the neighborhood column as float indicator columns.
# drop_first is left at its default (False) on purpose, so that every
# neighborhood keeps its own column and all of them are treated symmetrically.
data = pd.get_dummies(data, columns = ['neighborhood'], dtype = float)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446 entries, 0 to 4445
Data columns (total 29 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Unnamed: 0                          4446 non-null   object 
 1   price                               4446 non-null   int64  
 2   floor_area                          4446 non-null   int64  
 3   age                                 4446 non-null   int64  
 4   floor                               4446 non-null   int64  
 5   roof                                4446 non-null   int64  
 6   room_num                            4446 non-null   int64  
 7   hall_num                            4446 non-null   int64  
 8   neighborhood_19MayısMahallesi       4446 non-null   float64
 9   neighborhood_AcıbademMahallesi      4446 non-null   float64
 10  neighborhood_BostancıMahallesi      4446 non-null   float64
 11  neighborhood_CaddebostanMahallesi   4446 no

## Removing Outliers

In [6]:
# in order to remove outliers in data, we may use common-sense limits;
# a row is kept only when it satisfies every condition below
data = data.query(
    'price>=300000 and floor_area>30 and age<=35 and room_num<15 and hall_num<6'
)
# 'floor' is filtered with a boolean mask instead of query(): the column name
# overlaps with a name that query() treats specially (presumably the floor()
# math function -- confirm). A mask needs no temporary renamed column, and so
# avoids assigning a new column on an already-filtered frame.
data = data[data['floor'] < 15]
# removing newlines from column names.
data.columns = [x.replace('\r\n','') for x in data.columns]
# kept for later cells, which re-attach these labels after scaling
col_names = data.columns
print(col_names)
print(data)
# Raw string: same path as before, but without the invalid '\p' escape sequence.
# NOTE(review): the backslash separator is Windows-style; a later cell reads
# this exact path, so change both together if portability is needed.
data.to_csv(r'product_data\processed_data.csv')

Index(['Unnamed: 0', 'price', 'floor_area', 'age', 'floor', 'roof', 'room_num',
       'hall_num', 'neighborhood_19MayısMahallesi',
       'neighborhood_AcıbademMahallesi', 'neighborhood_BostancıMahallesi',
       'neighborhood_CaddebostanMahallesi', 'neighborhood_CaferağaMahallesi',
       'neighborhood_DumlupınarMahallesi', 'neighborhood_ErenköyMahallesi',
       'neighborhood_EğitimMahallesi', 'neighborhood_FenerbahçeMahallesi',
       'neighborhood_FeneryoluMahallesi', 'neighborhood_FikirtepeMahallesi',
       'neighborhood_GöztepeMahallesi', 'neighborhood_HasanpaşaMahallesi',
       'neighborhood_KozyatağıMahallesi', 'neighborhood_KoşuyoluMahallesi',
       'neighborhood_MerdivenköyMahallesi', 'neighborhood_OsmanağaMahallesi',
       'neighborhood_RasimpaşaMahallesi', 'neighborhood_SahrayıCeditMahallesi',
       'neighborhood_SuadiyeMahallesi', 'neighborhood_ZühtüpaşaMahallesi'],
      dtype='object')
     Unnamed: 0     price  floor_area  age  floor  roof  room_num  hall_num  \
0

## Standardize Data

In [7]:
from sklearn.preprocessing import StandardScaler
# Z-transformation: fit the scaler on every column of the table, transform
# the same table with it, then wrap the resulting array in a DataFrame that
# carries the saved column labels.
scaler_1 = StandardScaler()
scaler_1.fit(data)
data = pd.DataFrame(scaler_1.transform(data), columns = col_names)
print(data)

      Unnamed: 0     price  floor_area       age     floor      roof  \
0      -1.732800 -0.827611   -1.362651 -0.933851  2.265648 -0.146475   
1      -1.732026 -0.228577    0.053251  0.438987 -0.141615 -0.146475   
2      -1.731252  0.225086   -0.261394 -0.530075  0.393332 -0.146475   
3      -1.730478  0.174957    0.053251  0.277476  1.195753 -0.146475   
4      -1.729704 -0.025556   -0.261394  0.035211 -1.211510 -0.146475   
...          ...       ...         ...       ...       ...       ...   
4137    1.704476  0.977012    0.839863 -0.933851  0.928280 -0.146475   
4138    1.705250  1.633694    0.603880 -0.933851  1.195753 -0.146475   
4139    1.706024 -0.326327    0.603880 -0.772340  0.660806 -0.146475   
4140    1.706798  1.202589   -0.812022 -0.933851  1.195753 -0.146475   
4141    1.707572 -0.489244   -0.497377 -0.933851  1.463227 -0.146475   

      room_num  hall_num  neighborhood_19MayısMahallesi  \
0    -2.538093 -0.286277                      -0.201751   
1    -0.177527 -0

In [8]:
# Persist the standardized table. A raw string keeps the exact same path while
# avoiding the invalid '\s' escape sequence of a normal string literal.
data.to_csv(r'product_data\scaled_data.csv')

## Linear Correlation Coefficients

In [9]:
# Reload the cleaned (unscaled) table and skip its first two columns, so that
# the remaining table starts at 'price' (presumably the two skipped columns
# are the saved index and 'Unnamed: 0' -- confirm against the CSV).
data = pd.read_csv(r'product_data\processed_data.csv').iloc[:, 2:]
# Pearson correlation of every remaining column with price, as one column.
print(data.corr().loc[:, ['price']])

                                       price
price                               1.000000
floor_area                          0.669734
age                                -0.293310
floor                               0.360091
roof                                0.083652
room_num                            0.562458
hall_num                            0.361919
neighborhood_19MayısMahallesi      -0.086548
neighborhood_AcıbademMahallesi     -0.042281
neighborhood_BostancıMahallesi     -0.042898
neighborhood_CaddebostanMahallesi   0.282342
neighborhood_CaferağaMahallesi     -0.053030
neighborhood_DumlupınarMahallesi   -0.052912
neighborhood_ErenköyMahallesi      -0.106554
neighborhood_EğitimMahallesi       -0.047418
neighborhood_FenerbahçeMahallesi    0.216868
neighborhood_FeneryoluMahallesi    -0.117980
neighborhood_FikirtepeMahallesi    -0.052175
neighborhood_GöztepeMahallesi      -0.071279
neighborhood_HasanpaşaMahallesi    -0.082622
neighborhood_KozyatağıMahallesi    -0.088423
neighborho

# 