# Preparing the data
Combine the per-file CSVs into two datasets: one containing all rent prices and one containing all buy prices. Each row is tagged with the date taken from its source file name.

In [None]:
import pandas as pd
import glob
import os
def combine_monthly_files(files):
    """Read every CSV in *files* and stack the rows into one DataFrame.

    Each file's rows get a ``date`` column holding the stamp sliced from the
    end of the file name (e.g. ``..._2024_02.csv`` -> ``2024_02``).

    Args:
        files: Iterable of CSV file paths to combine.

    Returns:
        One DataFrame with the rows of all files (fresh 0..n-1 index), or an
        empty DataFrame when *files* is empty.
    """
    frames = []
    # glob returns paths in arbitrary order; sort so the output is reproducible.
    for file in sorted(files):
        df = pd.read_csv(file)
        base = os.path.basename(file)
        # The 7 characters before the 4-character extension, e.g. "2024_02";
        # transform later if needed.
        df['date'] = base[-11:-4]
        frames.append(df)
    if not frames:
        # pd.concat raises ValueError on an empty list; keep the old behaviour
        # of producing an empty frame when no file matched.
        return pd.DataFrame()
    # Concatenate once instead of inside the loop: avoids re-copying the
    # accumulated data on every iteration and avoids concatenating with an
    # empty seed frame.
    return pd.concat(frames, ignore_index=True)


buy_files = glob.glob(os.path.join('./dataset/', 'apartments_pl*'))
rent_files = glob.glob(os.path.join('./dataset/', 'apartments_rent*'))

buy_df = combine_monthly_files(buy_files)
rent_df = combine_monthly_files(rent_files)

# Persist the combined datasets so later cells can reload them directly.
buy_df.to_csv('buy.csv', index=False)
rent_df.to_csv('rent.csv', index=False)

# Experiment 0 - checking how points of interest correlate with prices

In [None]:
# Reload the combined datasets written by the preparation cell.
buy_df = pd.read_csv('buy.csv')
rent_df = pd.read_csv('rent.csv')

# Point-of-interest columns to compare against the listing price.
tested_features = [
    'poiCount',
    'schoolDistance',
    'clinicDistance',
    'postOfficeDistance',
    'kindergartenDistance',
    'restaurantDistance',
    'collegeDistance',
    'pharmacyDistance',
    'price',
]

# Pairwise correlations over the whole dataset; only the 'price' column
# of each matrix is printed.
correlation_buy = buy_df.loc[:, tested_features].corr()
correlation_rent = rent_df.loc[:, tested_features].corr()

print('Correlation matrix for buy \n', correlation_buy['price'], sep='')
print('Correlation matrix for rent \n', correlation_rent['price'], sep='')

# Same analysis restricted to rows whose city is 'wroclaw'.
buy_in_wroclaw = buy_df['city'].eq('wroclaw')
rent_in_wroclaw = rent_df['city'].eq('wroclaw')
wroclaw_buy_df = buy_df[buy_in_wroclaw]
wroclaw_rent_df = rent_df[rent_in_wroclaw]

correlation_buy = wroclaw_buy_df.loc[:, tested_features].corr()
correlation_rent = wroclaw_rent_df.loc[:, tested_features].corr()

print('Correlation matrix for buy in wroclaw \n', correlation_buy['price'], sep='')
print('Correlation matrix for rent in wroclaw \n', correlation_rent['price'], sep='')