# Preparing the data
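Combine the per-snapshot CSV files into two datasets: one that contains all rent prices and one that contains all buy prices. Each row is tagged with the snapshot date (e.g. `2024_02`) taken from its source file name.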

In [None]:
import pandas as pd
import glob
import os

DATASET_DIR = "./dataset/"
# Number of trailing characters of the file stem that hold the snapshot tag,
# e.g. "apartments_pl_2024_02.csv" -> "2024_02" (transform later if needed).
DATE_TAG_LEN = 7


def load_snapshots(paths):
    """Read every CSV in ``paths`` and stack them into one DataFrame.

    Each row gets a ``date`` column holding the last ``DATE_TAG_LEN``
    characters of its source file's stem (file name without extension).

    Files are processed in sorted path order so the row order of the result
    is reproducible; ``glob.glob`` itself returns matches in arbitrary order.
    The frames are collected in a list and concatenated once, instead of
    re-copying a growing DataFrame on every iteration.

    Parameters
    ----------
    paths : iterable of str
        Paths of the CSV files to combine.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index, or an empty
        DataFrame when ``paths`` is empty.
    """
    frames = []
    for path in sorted(paths):
        stem = os.path.splitext(os.path.basename(path))[0]
        frames.append(pd.read_csv(path).assign(date=stem[-DATE_TAG_LEN:]))

    # pd.concat raises on an empty list; keep the "no files -> empty frame" behavior.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


buy_files = glob.glob(os.path.join(DATASET_DIR, "apartments_pl*"))
rent_files = glob.glob(os.path.join(DATASET_DIR, "apartments_rent*"))

buy_df = load_snapshots(buy_files)
rent_df = load_snapshots(rent_files)

# Persist the combined datasets; the index carries no information, so it is not written.
buy_df.to_csv("buy.csv", index=False)
rent_df.to_csv("rent.csv", index=False)

In [None]:
# Reload from disk so this cell does not depend on the in-memory frames of the cell above.
buy_df = pd.read_csv("buy.csv")
rent_df = pd.read_csv("rent.csv")

# Columns whose correlation with "price" is inspected; "price" itself must be
# in the list so it appears as a column of the correlation matrix.
tested_features = [
    "poiCount",
    "schoolDistance",
    "clinicDistance",
    "postOfficeDistance",
    "kindergartenDistance",
    "restaurantDistance",
    "collegeDistance",
    "pharmacyDistance",
    "price",
]


def print_city_price_correlations(df, label, features):
    """Print, for each city in ``df``, the correlation of ``features`` with price.

    Cities are visited in order of first appearance (``sort=False``). Rows
    whose ``city`` is missing are skipped by ``groupby``, so no all-NaN block
    is printed for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``city`` column and every column in ``features``.
    label : str
        Dataset name inserted into the printed heading (e.g. ``"buy"``).
    features : list of str
        Columns to correlate; must include ``"price"``.
    """
    for city, city_df in df.groupby("city", sort=False):
        city_correlation = city_df[features].corr()
        print(f"Correlation matrix for {label} in {city}", city_correlation["price"], sep="\n")


# Overall matrices. The per-city pass below uses its own local name, so these
# still hold the whole-dataset result after the cell has run.
correlation_buy = buy_df[tested_features].corr()
correlation_rent = rent_df[tested_features].corr()

print("Correlation matrix for buy", correlation_buy["price"], sep="\n")
print("Correlation matrix for rent", correlation_rent["price"], sep="\n")

print_city_price_correlations(buy_df, "buy", tested_features)
print_city_price_correlations(rent_df, "rent", tested_features)

Correlation matrix for buy 
poiCount                0.184639
schoolDistance         -0.000032
clinicDistance         -0.107165
postOfficeDistance     -0.011941
kindergartenDistance   -0.016408
restaurantDistance     -0.105686
collegeDistance        -0.014567
pharmacyDistance       -0.013712
price                   1.000000
Name: price, dtype: float64
Correlation matrix for rent 
poiCount                0.180689
schoolDistance         -0.070784
clinicDistance         -0.146804
postOfficeDistance     -0.057638
kindergartenDistance   -0.065827
restaurantDistance     -0.155015
collegeDistance        -0.086960
pharmacyDistance       -0.069603
price                   1.000000
Name: price, dtype: float64
Correlation matrix for buy in szczecin 
poiCount               -0.005292
schoolDistance          0.088632
clinicDistance          0.050508
postOfficeDistance      0.024738
kindergartenDistance    0.011931
restaurantDistance     -0.015733
collegeDistance         0.057295
pharmacyDistance      