## Import libraries

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

Load the housing data into a pandas DataFrame

In [4]:
# Read the housing-sales dataset from the parquet file next to the notebook.
df = pd.read_parquet(path="DKHousingPrices.parquet")

In [5]:
# Report the earliest and latest sale date in the data, then preview the first rows.
sale_dates = df["date"]
print(sale_dates.min(), "to", sale_dates.max())
df.head()

1992-01-05 00:00:00 to 2024-10-26 00:00:00


Unnamed: 0,date,quarter,house_id,house_type,sales_type,year_build,purchase_price,%_change_between_offer_and_purchase,no_rooms,sqm,sqm_price,address,zip_code,city,area,region,nom_interest_rate%,dk_ann_infl_rate%,yield_on_mortgage_credit_bonds%
0,2024-10-26,2024Q4,0,Villa,regular_sale,1974,4350000,0.0,5,215.0,20232.558594,Kildevangen 5,8382,Hinnerup,East & mid jutland,Jutland,3.1,,
1,2024-10-26,2024Q4,2,Summerhouse,regular_sale,1956,450000,0.0,3,36.0,12500.0,Lykkestien 2,4400,Kalundborg,Other islands,Zealand,3.1,,
2,2024-10-26,2024Q4,1,Farm,regular_sale,1955,6600000,0.0,3,180.0,36666.667969,Sæderupvej 58,9260,Gistrup,North jutland,Jutland,3.1,,
3,2024-10-25,2024Q4,3,Apartment,family_sale,1945,1495000,0.0,2,64.0,23359.375,"Tage-Hansens Gade 5, 1. tv",8000,Aarhus C,East & mid jutland,Jutland,3.1,,
4,2024-10-25,2024Q4,4,Villa,regular_sale,1967,3375000,0.0,5,176.0,19176.136719,Chr.Winthers Vej 5,8600,Silkeborg,East & mid jutland,Jutland,3.1,,


In [6]:
# Keep only rows that have a value in every column listed below
# (a row is dropped if ANY of them is missing, not just `sqm`).
# NOTE(review): "city", "dk_ann_infl_rate%" and "yield_on_mortgage_credit_bonds%"
# are not among the feature columns selected for X later in the notebook, yet
# rows missing them are still removed here -- confirm that this is intended.
required_columns = [
    "city",
    "quarter",
    "house_type",
    "sales_type",
    "year_build",
    "no_rooms",
    "sqm",
    "area",
    "dk_ann_infl_rate%",
    "nom_interest_rate%",
    "yield_on_mortgage_credit_bonds%",
]
df = df.dropna(subset=required_columns)
df.head()

Unnamed: 0,date,quarter,house_id,house_type,sales_type,year_build,purchase_price,%_change_between_offer_and_purchase,no_rooms,sqm,sqm_price,address,zip_code,city,area,region,nom_interest_rate%,dk_ann_infl_rate%,yield_on_mortgage_credit_bonds%
1193,2024-09-30,2024Q3,1300,Apartment,regular_sale,1971,1765000,0.0,3,78.0,22628.205078,"Hedekæret 38, 1. th",2640,Hedehusene,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1194,2024-09-30,2024Q3,1307,Summerhouse,regular_sale,2009,590939,0.0,3,50.0,11818.780273,Violstien 11,2635,Ishøj,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1195,2024-09-30,2024Q3,1301,Apartment,regular_sale,1940,1750000,0.0,2,56.0,31250.0,"Buddingevej 72I, st. tv",2800,Kongens Lyngby,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1196,2024-09-30,2024Q3,1302,Summerhouse,family_sale,1950,1080000,0.0,4,72.0,15000.0,Frederiksvej 35,3730,Nexø,Bornholm,Bornholm,3.35,1.13,4.34
1197,2024-09-30,2024Q3,1303,Apartment,regular_sale,1974,2300000,0.0,1,50.0,46000.0,"Thyrasgade 4, 4. 508",2200,København N,"Capital, Copenhagen",Zealand,3.35,1.13,4.34


Convert `quarter` to an integer (`int64`) so it can be used as a numeric feature in the regression; for example, 2024Q3 becomes 218.

In [7]:
# Replace `quarter` with its int64 representation so the model can treat it as
# a numeric feature; in the previews, 2024Q3 maps to 218.
# NOTE(review): presumably `quarter` is a pandas Period column and the integer
# is the period ordinal -- confirm against the parquet schema.
quarter_as_int = df["quarter"].astype("int64")
df["quarter"] = quarter_as_int
df.head()


Unnamed: 0,date,quarter,house_id,house_type,sales_type,year_build,purchase_price,%_change_between_offer_and_purchase,no_rooms,sqm,sqm_price,address,zip_code,city,area,region,nom_interest_rate%,dk_ann_infl_rate%,yield_on_mortgage_credit_bonds%
1193,2024-09-30,218,1300,Apartment,regular_sale,1971,1765000,0.0,3,78.0,22628.205078,"Hedekæret 38, 1. th",2640,Hedehusene,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1194,2024-09-30,218,1307,Summerhouse,regular_sale,2009,590939,0.0,3,50.0,11818.780273,Violstien 11,2635,Ishøj,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1195,2024-09-30,218,1301,Apartment,regular_sale,1940,1750000,0.0,2,56.0,31250.0,"Buddingevej 72I, st. tv",2800,Kongens Lyngby,"Capital, Copenhagen",Zealand,3.35,1.13,4.34
1196,2024-09-30,218,1302,Summerhouse,family_sale,1950,1080000,0.0,4,72.0,15000.0,Frederiksvej 35,3730,Nexø,Bornholm,Bornholm,3.35,1.13,4.34
1197,2024-09-30,218,1303,Apartment,regular_sale,1974,2300000,0.0,1,50.0,46000.0,"Thyrasgade 4, 4. 508",2200,København N,"Capital, Copenhagen",Zealand,3.35,1.13,4.34


### Make dummy variables

In [8]:
# Build the design matrix and the target.
# The object-typed columns (house_type, sales_type, area) are expanded into
# one indicator column per level by get_dummies; the numeric columns pass
# through unchanged.
# NOTE(review): every dummy level is kept (drop_first is not set), so each
# indicator group is collinear with the model intercept -- fine for prediction,
# but confirm before interpreting individual coefficients.
feature_columns = [
    "quarter",
    "house_type",
    "sales_type",
    "year_build",
    "no_rooms",
    "sqm",
    "area",
    "nom_interest_rate%",
]
X = pd.get_dummies(df[feature_columns])
y = df["purchase_price"]

# Jupyter only renders the last expression of a cell, so a bare
# `X.shape, y.shape` in the middle of the cell is silently discarded.
# Print the shapes explicitly so they actually appear in the output.
print("X shape:", X.shape, "| y shape:", y.shape)
X.head()

Unnamed: 0,quarter,year_build,no_rooms,sqm,nom_interest_rate%,house_type_Apartment,house_type_Farm,house_type_Summerhouse,house_type_Townhouse,house_type_Villa,...,sales_type_other_sale,sales_type_regular_sale,area_Bornholm,"area_Capital, Copenhagen",area_East & mid jutland,area_Fyn & islands,area_North Zealand,area_North jutland,area_Other islands,area_South jutland
1193,218,1971,3,78.0,3.35,True,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False
1194,218,2009,3,50.0,3.35,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False
1195,218,1940,2,56.0,3.35,True,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False
1196,218,1950,4,72.0,3.35,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
1197,218,1974,1,50.0,3.35,True,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False


Make the training and testing split

In [9]:
# Randomly partition features and target into train and test portions.
# The seed is fixed so the split is reproducible; the test-set size is left
# at scikit-learn's default.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Baseline: ordinary least-squares regression with default settings,
# trained on the training portion only.
simple_model = LinearRegression()
simple_model.fit(X=X_train, y=y_train)

In [11]:
# Evaluate the fitted model on both portions of the data.
# For LinearRegression, `score` is the coefficient of determination (R^2);
# comparing the two values shows whether the model generalises or overfits.
train_score = simple_model.score(X_train, y_train)
test_score = simple_model.score(X_test, y_test)
print("Score on training set: {:.3f}".format(train_score))
print("Score on test set: {:.3f}".format(test_score))

Score on training set: 0.337
Score on test set: 0.336
