In [None]:
import sqlite3
import pandas as pd
import logging
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from catboost import CatBoostRegressor


date_now = dt.datetime.now()
date_now = date_now.strftime("%y%m%d")


# Konfiguration av loggning
logging.basicConfig(level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename=f"fastighet_ml_{date_now}.log", # Loggfil
    filemode="a") # Append-läge

conn = sqlite3.connect("fastigheter.db")
types = ["fritidshus", "gård", "hus", "kedjehus", "lägenhet", "parhus", "radhus", "tomt_mark", "villa", "övrigt"]

dataframes = {}

logging.info("Startar hämtning från SQLdatabas")

for typ in types:
    table_name = f"fastighetstyp_{typ}"
    try:
        df = pd.read_sql(f"SELECT * FROM {table_name}", conn)
        dataframes[typ] = df
        logging.info(f"Hämtade {len(df)} rader från tabellen '{table_name}'")
    except Exception as e:
        print(f"Kunde inte hämta tabellen '{table_name}': {e}")
        logging.error(f"Kunde inte hämta tabellen '{table_name}': {e}")


# Sätter Biarea till 0 om inget värde finns, då antas den vara 0
for typ, df in dataframes.items():
    if "Biarea" in df.columns:
        df["Biarea"] = df["Biarea"].where(df["Biarea"].notna(), 0)
        
logging.info("Sätter NaN-värden i Biarea till 0")

# Formaterar datum efter YYMMDD
for typ, df in dataframes.items():
    if "Datum" in df.columns:
        # Konvertera till datetime om det inte redan är det
        datum_antal = len(df["Datum"])
        df["Datum"] = pd.to_datetime(df["Datum"], errors="coerce")

        # Formatera till YYMMDD som sträng
        df["Datum"] = df["Datum"].dt.strftime("%y%m%d")

        logging.info(f"{typ}: Datum-kolumnen formaterad till YYMMDD, {datum_antal} st")
    else:
        logging.info(f"{typ}: Ingen 'Datum'-kolumn hittades, ingen formatering gjord")


In [2]:
# Slå ihop kedjehus och parhus till radhus
if "radhus" in dataframes:
    frames_to_add = []
    for subtyp in ["kedjehus", "parhus"]:
        if subtyp in dataframes:
            frames_to_add.append(dataframes[subtyp])

    if frames_to_add:  # bara om något hittades
        dataframes["radhus"] = pd.concat(
            [dataframes["radhus"]] + frames_to_add,
            axis=0,
            ignore_index=True)

        logging.info(
            f"Slagit ihop {' och '.join(['kedjehus','parhus'])} i 'radhus'. "
            f"Nytt antal rader: {len(dataframes['radhus'])}")

#Ta bort hus, den innehåller bara skräpdata och kedjehus och parhus som lagts till i radhus. Tar bort övrigt och tomt.
del dataframes["hus"]
del dataframes["kedjehus"]
del dataframes["parhus"]
del dataframes["övrigt"]
del dataframes["tomt_mark"]
logging.info("Raderar dataframes, 'hus', 'kedjehus', 'parhus', 'övrigt', 'tomt_mark'")

In [3]:
# Gör om nollvärden till NaN i boarea och rum
for typ, df in dataframes.items():
    if "Boarea" in df.columns:
        antal_noll = (df["Boarea"] == 0).sum()
        df["Boarea"] = df["Boarea"].replace(0, pd.NA)
        logging.info(f"{typ}: {antal_noll} rader med 0 i 'Boarea' ändrades till NaN")
    if "Rum" in df.columns:
        antal_noll = (df["Rum"] == 0).sum()
        df["Rum"] = df["Rum"].replace(0, pd.NA)
        logging.info(f"{typ}: {antal_noll} rader med 0 i 'Rum' ändrades till NaN")

# Tar bort värden som har NaN på både rum och boarea
for typ, df in dataframes.items():
    if "Boarea" in df.columns and "Rum" in df.columns:
        antal_tomma = df[df["Boarea"].isna() & df["Rum"].isna()].shape[0]
        df.dropna(subset=["Boarea", "Rum"], how="all", inplace=True)
        logging.info(f"{typ}: {antal_tomma} rader med NaN i både 'Boarea' och 'Rum' tas bort")
    else:
        logging.info(f"{typ}: Kolumnerna 'Boarea' och/eller 'Rum' saknas, inga rader borttagna")

# Tar bort rader med NaN i Tomtarea för villa
logging.info(f"Tar bort {(dataframes['villa']['Tomtarea'].isna()).sum()} rader med NaN-värden från 'Tomtarea' i 'villa'")
dataframes["villa"].dropna(subset=["Tomtarea"], inplace=True)

In [4]:
# Tar bort kolumner då de är irrelevanta eller saknar data
dataframes["fritidshus"].drop(columns="Våning", inplace=True) # på ursprungliga objektet
dataframes["gård"].drop(columns="Våning", inplace=True) 
dataframes["radhus"].drop(columns="Våning", inplace=True) 
dataframes["villa"].drop(columns="Våning", inplace=True) 
logging.info("Tar bort 'Våning' från 'fritidshus', 'gård', 'radhus' och 'villa'")

dataframes["lägenhet"].drop(columns="Tomtarea", inplace=True)
dataframes["lägenhet"].drop(columns="Biarea", inplace=True)
logging.info("Tar bort 'Tomtarea' och 'Biarea' från 'lägenhet'")

### En modell som uppskattar antal Rum baserat på Boarea och sen en som uppskattar Boarea på antal rum och fyller i NaN värden i Rum och Boarea med prediktioner


In [5]:
# Prediktera antal rum och ersätt alla NaN värden med prediktion
df_villa = dataframes["villa"]

# Separera kända och okända värden för 'Boarea'
known = df_villa[df_villa["Rum"].notna() & df_villa["Boarea"].notna()].copy()
unknown = df_villa[df_villa["Rum"].isna() & df_villa["Boarea"].notna()].copy()

logging.info(f"Antal rader med känt antal rum: {len(known)}")
logging.info(f"Antal rader med NaN i rum: {len(unknown)}")

# Features = endast boarea
X = known[["Boarea"]]
y = known["Rum"]

# Träna modell
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11)

model = RandomForestRegressor(n_estimators=200, random_state=11)
model.fit(X_train, y_train)

r2_score = model.score(X_test, y_test)
logging.info(f"Tränade RandomForestRegressor för 'Rum' baserat på 'Boarea'. R² på testdata: {r2_score:.3f}")

# Prediktera för okända
if not unknown.empty:
    predicted_rum = model.predict(unknown[["Boarea"]])
    # Avrunda till närmaste halvtal
    predicted_rum = np.round(predicted_rum * 2) / 2
    # Skriv tillbaka i dataframe
    df_villa.loc[df_villa["Rum"].isna() & df_villa["Boarea"].notna(), "Rum"] = predicted_rum
    logging.info(f"Fyllde i {len(predicted_rum)} NaN-värden i 'Rum' med predikterade värden (avrundat till halvtal).")
else:
    logging.info("Inga NaN-värden i 'Rum' att fylla i.")

# Uppdatera i dictionaryn
dataframes["villa"] = df_villa

# # Prediktera storlek i kvm och ersätt alla NaN värden med prediktion
df_villa_kvm = dataframes["villa"]

# Separera kända och okända värden för 'Boarea'
known_kvm = df_villa_kvm[df_villa_kvm["Boarea"].notna() & df_villa_kvm["Rum"].notna()].copy()
unknown_kvm = df_villa_kvm[df_villa_kvm["Boarea"].isna() & df_villa_kvm["Rum"].notna()].copy()

logging.info(f"Antal rader med känd boarea: {len(known_kvm)}")
logging.info(f"Antal rader med NaN i boarea: {len(unknown_kvm)}")

# Features = endast rum
X_kvm = known_kvm[["Rum"]]
y_kvm = known_kvm["Boarea"]

# Träna modell
X_train, X_test, y_train, y_test = train_test_split(
    X_kvm, y_kvm, test_size=0.2, random_state=11)

model_kvm = RandomForestRegressor(n_estimators=200, random_state=11)
model_kvm.fit(X_train, y_train)

r2_score_kvm = model_kvm.score(X_test, y_test)
logging.info(f"Tränade RandomForestRegressor för 'Boarea' baserat på 'Rum'. R² på testdata: {r2_score_kvm:.3f}")

# Prediktera för okända
if not unknown_kvm.empty:
    predicted_kvm = model_kvm.predict(unknown_kvm[["Rum"]])
    predicted_kvm = np.round(predicted_kvm) # Avrunda till närmaste heltal

# Skriv tillbaka i dataframe
    df_villa_kvm.loc[df_villa_kvm["Boarea"].isna() & df_villa_kvm["Rum"].notna(), "Boarea"] = predicted_kvm

    logging.info(f"Fyllde i {len(predicted_kvm)} NaN-värden i 'Boarea' med predikterade värden (avrundat till heltal).")
else:
    logging.info("Inga NaN-värden i 'Boarea' att fylla i för villa.")

# Uppdatera i dictionaryn
dataframes["villa"] = df_villa_kvm

### Avnänder modellen för villa och itererar över fritidshus, gård och radhus

In [6]:
for typ in ["fritidshus", "gård", "radhus"]:
    df = dataframes[typ]

    # Fyll i saknade 'Rum' baserat på 'Boarea'
    known_rum = df[df["Rum"].notna() & df["Boarea"].notna()].copy()
    unknown_rum = df[df["Rum"].isna() & df["Boarea"].notna()].copy()

    logging.info(f"{typ}: {len(known_rum)} rader med känd boarea")
    logging.info(f"{typ}: {len(unknown_rum)} rader med NaN i 'Rum'")

    X_rum = known_rum[["Boarea"]]
    y_rum = known_rum["Rum"]
    r2_rum = model.score(X_rum, y_rum)
    logging.info(f"{typ}: R² för prediktion av 'Rum' från 'Boarea': {r2_rum:.3f}")

    if not unknown_rum.empty:
        predicted_rum = model.predict(unknown_rum[["Boarea"]])
        predicted_rum = np.round(predicted_rum * 2) / 2
        df.loc[df["Rum"].isna() & df["Boarea"].notna(), "Rum"] = predicted_rum
        logging.info(f"{typ}: Fyllde i {len(predicted_rum)} saknade 'Rum'-värden")

    # Fyll i saknade 'Boarea' baserat på 'Rum'
    known_boarea = df[df["Boarea"].notna() & df["Rum"].notna()].copy()
    unknown_boarea = df[df["Boarea"].isna() & df["Rum"].notna()].copy()

    logging.info(f"{typ}: {len(known_boarea)} rader med känt antal rum")
    logging.info(f"{typ}: {len(unknown_boarea)} rader med NaN i 'Boarea'")

    X_boarea = known_boarea[["Rum"]]
    y_boarea = known_boarea["Boarea"]
    r2_boarea = model_kvm.score(X_boarea, y_boarea)
    logging.info(f"{typ}: R² för prediktion av 'Boarea' från 'Rum': {r2_boarea:.3f}")

    if not unknown_boarea.empty:
        predicted_boarea = model_kvm.predict(unknown_boarea[["Rum"]])
        predicted_boarea = np.round(predicted_boarea)
        df.loc[df["Boarea"].isna() & df["Rum"].notna(), "Boarea"] = predicted_boarea
        logging.info(f"{typ}: Fyllde i {len(predicted_boarea)} saknade 'Boarea'-värden")

    # Uppdaterar dataframen
    dataframes[typ] = df


In [7]:
# Städar upp bland datan
# Tar bort rader med NaN i 'Våning' i lägenheter, tillräckligt många datapunkter finns ändå
logging.info(f"lägenhet: Tar bort {(dataframes['lägenhet']['Våning'].isna()).sum()} "
"rader som har NaN i 'Våning'")
dataframes["lägenhet"].dropna(subset=["Våning"], inplace=True)

# Tar bort rader med NaN i 'Boarea' och 'Rum' i 'lägenhet'
logging.info(f"lägenhet: Tar bort {(dataframes['lägenhet']['Boarea'].isna()).sum()} "
"rader med NaN i 'Boarea'")
dataframes["lägenhet"].dropna(subset=["Boarea"], inplace=True)
logging.info(f"lägenhet: Tar bort {(dataframes['lägenhet']['Rum'].isna()).sum()} "
"rader med NaN i 'Rum'")
dataframes["lägenhet"].dropna(subset=["Rum"], inplace=True)

# Tar bort rader med NaN i 'Tomtarea' i 'gård'
logging.info(f"radhus: Tar bort {(dataframes['gård']['Tomtarea'].isna()).sum()} "
"rader som har NaN i 'Tomtarea'")
dataframes["gård"].dropna(subset=["Tomtarea"], inplace=True)

# Tar bort rader med NaN i 'Tomtarea' i 'radhus'
logging.info(f"radhus: Tar bort {(dataframes['radhus']['Tomtarea'].isna()).sum()} "
"rader som har NaN i 'Tomtarea'")
dataframes["radhus"].dropna(subset=["Tomtarea"], inplace=True)

# Ersätt ',' med '.' och konverterar 'Våning' till numerisk och avrundar
dataframes["lägenhet"]["Våning"] = dataframes["lägenhet"]["Våning"].astype(str).str.replace(",", ".", regex=False)
dataframes["lägenhet"]["Våning"] = pd.to_numeric(dataframes["lägenhet"]["Våning"], errors="coerce")
dataframes["lägenhet"]["Våning"] = dataframes["lägenhet"]["Våning"].round(0).astype("Int64")
logging.info("Konverterar 'Våning' i 'lägenhet' till numerisk och avrundar")

In [None]:
# Tar bort decimaler och konverterar till tal
for namn, df in dataframes.items():
    for kolumn in ["Tomtarea", "Boarea", "Biarea"]:
        if kolumn in df.columns:
            # Ersätt ',' med '.' och konvertera till numerisk
            df[kolumn] = df[kolumn].astype(str).str.replace(",", ".", regex=False)
            df[kolumn] = pd.to_numeric(df[kolumn], errors="coerce")
            
            # Avrunda och konvertera till Int64
            df[kolumn] = df[kolumn].round(0).astype("Int64")  # "Int64" tål NaN
            logging.info(f"Konverterar '{kolumn}' i '{namn}' till numerisk och avrundar")

    # Skriv tillbaka uppdaterad dataframe
    dataframes[namn] = df

In [9]:
# Itererar över alla df och tar bort alla fastigheter där priset avviker mer än 
# 50% från snittet på gatan för att få bort problem med försäljningar inom familj etc
for namn, df in dataframes.items():
    if "Adress" in df.columns and "Ort" in df.columns and "Pris" in df.columns:
        # Räkna ut medelpris per (Adress, Ort)
        mean_prices = df.groupby(["Adress", "Ort"])["Pris"].transform("mean")

        # Mask: vilka rader skiljer sig mer än 50% från snittet
        mask_outliers = (df["Pris"] > 1.5 * mean_prices) | (df["Pris"] < 0.5 * mean_prices)
        # Räkna hur många som tas bort
        antal_bort = mask_outliers.sum()
        # Droppa outliers
        df = df.loc[~mask_outliers].copy()
        # Logga resultat
        logging.info(f"I '{namn}' togs {antal_bort} rader bort då priset skiljde sig mer än 50% från snittet för samma Adress/Ort.")

        # Uppdatera i dictionary
        dataframes[namn] = df

In [10]:
#Cat boost med logaritmiska priser
from sklearn.metrics import r2_score, mean_squared_error # Deklarerar om

# Förbered data
lower_lg_villa = dataframes["villa"]["Pris"].quantile(0.05)
upper_lg_villa = dataframes["villa"]["Pris"].quantile(0.95)

# Filtrera bort outliers
df_filtered_lg_villa = dataframes["villa"][(dataframes["villa"]["Pris"] > lower_lg_villa) & (dataframes["villa"]["Pris"] < upper_lg_villa)].copy()

# Skapa log-transformerad target
df_filtered_lg_villa["log_pris"] = np.log1p(df_filtered_lg_villa["Pris"])
target = "log_pris"

# Ta bort kolumner som inte ska vara med
drop_cols_lg_villa = [col for col in ["Pris", "Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_filtered_lg_villa.columns]

X_lg_villa = df_filtered_lg_villa.drop(columns=drop_cols_lg_villa + [target])  # ta bort både Pris och log_pris
y_lg_villa = df_filtered_lg_villa[target]

# Identifiera kategoriska kolumner
cat_features_lg_villa = X_lg_villa.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_lg_villa}")

# Train/test split
X_lg_villa_train, X_lg_villa_test, y_lg_villa_train, y_lg_villa_test = train_test_split(
    X_lg_villa, y_lg_villa, test_size=0.2, random_state=11)

# CatBoost-modell
model = CatBoostRegressor(
    iterations=2000,
    depth=8,
    learning_rate=0.0422,
    loss_function="RMSE",
    cat_features=cat_features_lg_villa,
    verbose=100,
    bagging_temperature=5,
    l2_leaf_reg=5)

# Träna modellen
model.fit(X_lg_villa_train, y_lg_villa_train, eval_set=(X_lg_villa_test, y_lg_villa_test), use_best_model=True)

# Utvärdera
y_lg_villa_pred_train = np.expm1(model.predict(X_lg_villa_train))
y_lg_villa_pred_test  = np.expm1(model.predict(X_lg_villa_test))

r2_train_lg_villa = r2_score(np.expm1(y_lg_villa_train), y_lg_villa_pred_train)
r2_test_lg_villa  = r2_score(np.expm1(y_lg_villa_test), y_lg_villa_pred_test)

rmse_train_lg_villa = np.sqrt(mean_squared_error(np.expm1(y_lg_villa_train), y_lg_villa_pred_train))
rmse_test_lg_villa  = np.sqrt(mean_squared_error(np.expm1(y_lg_villa_test), y_lg_villa_pred_test))

logging.info(f"CatBoost tränad på villa-data med log(pris). "
    f"R² train: {r2_train_lg_villa:.3f}, test: {r2_test_lg_villa:.3f}, "
    f"RMSE train: {rmse_train_lg_villa:.0f}, test: {rmse_test_lg_villa:.0f}")

# Feature importance
feature_importances_lg_villa = model.get_feature_importance()
feature_names_lg_villa = np.array(X_lg_villa.columns)

sorted_idx_lg_villa = feature_importances_lg_villa.argsort()

# Logga topp 5 features
top_n_lg_villa = 5
top_idx_lg_villa = feature_importances_lg_villa.argsort()[::-1][:top_n_lg_villa]
top_features_lg_villa = [(feature_names_lg_villa[i], feature_importances_lg_villa[i]) for i in top_idx_lg_villa]

logging.info("Topp 5 viktigaste features för pris:")
for name, importance in top_features_lg_villa:
    logging.info(f"- {name}: {importance:.2f}")

0:	learn: 0.6513406	test: 0.6483214	best: 0.6483214 (0)	total: 253ms	remaining: 8m 25s
100:	learn: 0.3850571	test: 0.3688453	best: 0.3688453 (100)	total: 8.72s	remaining: 2m 44s
200:	learn: 0.3769856	test: 0.3623147	best: 0.3623147 (200)	total: 17.8s	remaining: 2m 39s
300:	learn: 0.3720300	test: 0.3592303	best: 0.3592303 (300)	total: 26.7s	remaining: 2m 30s
400:	learn: 0.3678998	test: 0.3571904	best: 0.3571904 (400)	total: 36.2s	remaining: 2m 24s
500:	learn: 0.3642940	test: 0.3555797	best: 0.3555797 (500)	total: 45.9s	remaining: 2m 17s
600:	learn: 0.3612624	test: 0.3544194	best: 0.3544194 (600)	total: 54.5s	remaining: 2m 6s
700:	learn: 0.3585055	test: 0.3536507	best: 0.3536488 (698)	total: 1m 3s	remaining: 1m 57s
800:	learn: 0.3556971	test: 0.3528562	best: 0.3528562 (800)	total: 1m 13s	remaining: 1m 49s
900:	learn: 0.3532176	test: 0.3524555	best: 0.3524555 (900)	total: 1m 22s	remaining: 1m 40s
1000:	learn: 0.3507780	test: 0.3520110	best: 0.3520094 (999)	total: 1m 32s	remaining: 1m 31s


In [None]:
# Cat boost med Grid searchade parametrar
from sklearn.metrics import r2_score, mean_squared_error # Deklarerar om

lower_cb_gs_villa = dataframes["villa"]["Pris"].quantile(0.05)
upper_cb_gs_villa = dataframes["villa"]["Pris"].quantile(0.95)

# Filtrera bort rader utanför percentilgränserna
df_filtered_cb_gs_villa = dataframes["villa"][(dataframes["villa"]["Pris"] > lower_cb_gs_villa) & (dataframes["villa"]["Pris"] < upper_cb_gs_villa)]

# Välj dataframe
df_cb_cb_gs_villa = df_filtered_cb_gs_villa.copy()

# Målvariabeln heter 'Pris'
target_cb_gs_villa = "Pris"

# Ta bort "Datum" och "Bostadstyp" om de finns
drop_cols_cb_gs_villa = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_cb_cb_gs_villa.columns]
X_cb_gs_villa = df_cb_cb_gs_villa.drop(columns=[target_cb_gs_villa] + drop_cols_cb_gs_villa)
y_cb_gs_villa = df_cb_cb_gs_villa[target_cb_gs_villa]

# Identifiera kategoriska kolumner
cat_features_cb_gs_villa = X_cb_gs_villa.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_cb_gs_villa}")

# Train/test split
X_cb_gs_villa_train, X_cb_gs_villa_test, y_cb_gs_villa_train, y_cb_gs_villa_test = train_test_split(
    X_cb_gs_villa, y_cb_gs_villa, test_size=0.2, random_state=11)

# CatBoost-modell
model_cb_gs_villa = CatBoostRegressor(
    iterations=2000, #5000
    depth=8,
    learning_rate=0.042222222222222223, #0,01 gav 0,818 o 0,725
    loss_function="RMSE",
    cat_features=cat_features_cb_gs_villa,
    verbose=100,
    bagging_temperature=5,
    l2_leaf_reg=5)

# Träna modellen
model_cb_gs_villa.fit(X_cb_gs_villa_train, y_cb_gs_villa_train, eval_set=(X_cb_gs_villa_test, y_cb_gs_villa_test), use_best_model=True)

# Utvärdera
r2_train_cb_gs_villa = model_cb_gs_villa.score(X_cb_gs_villa_train, y_cb_gs_villa_train)
r2_test_cb_gs_villa = model_cb_gs_villa.score(X_cb_gs_villa_test, y_cb_gs_villa_test)

# Prediktioner
y_pred_train = model_cb_gs_villa.predict(X_cb_gs_villa_train)
y_pred_test = model_cb_gs_villa.predict(X_cb_gs_villa_test)

# RMSE
rmse_train_cb_gs_villa = np.sqrt(mean_squared_error(y_cb_gs_villa_train, y_pred_train))
rmse_test_cb_gs_villa = np.sqrt(mean_squared_error(y_cb_gs_villa_test, y_pred_test))

# MAPE
mape_train_cb_gs_villa = mean_absolute_percentage_error(y_cb_gs_villa_train, y_pred_train) * 100
mape_test_cb_gs_villa = mean_absolute_percentage_error(y_cb_gs_villa_test, y_pred_test) * 100

logging.info(f"CatBoost tränad på villa-data. "
    f"R² train: {r2_train_cb_gs_villa:.3f}, test: {r2_test_cb_gs_villa:.3f}, "
    f"RMSE train: {rmse_train_cb_gs_villa:.0f}, test: {rmse_test_cb_gs_villa:.0f}")
logging.info(f"CatBoost MAPE train: {mape_train_cb_gs_villa:.2f}%, test: {mape_test_cb_gs_villa:.2f}%")

# Feature importance
feature_importances_cb_gs_villa = model_cb_gs_villa.get_feature_importance()
feature_names_cb_gs_villa = np.array(X_cb_gs_villa.columns)

# Sortera efter betydelse
sorted_idx_cb_gs_villa = feature_importances_cb_gs_villa.argsort()

# Logga topp 5 features
top_n_cb_gs_villa = 5
top_idx_cb_gs_villa = feature_importances_cb_gs_villa.argsort()[::-1][:top_n_cb_gs_villa]  # index på topp N
top_features_cb_gs_villa = [(feature_names_cb_gs_villa[i], feature_importances_cb_gs_villa[i]) for i in top_idx_cb_gs_villa]

logging.info("Topp 5 viktigaste features för pris:")
for name, importance in top_features_cb_gs_villa:
    logging.info(f"- {name}: {importance:.2f}")

# Spara modellen
joblib.dump(model_cb_gs_villa, "villa_cb_gs.pkl")

logging.info("Modellen är sparad som 'villa_cb_gs.pkl'")

0:	learn: 2027836.7392046	test: 2017802.8606597	best: 2017802.8606597 (0)	total: 92ms	remaining: 3m 3s
100:	learn: 1172561.9879582	test: 1123003.9150008	best: 1123003.9150008 (100)	total: 8.25s	remaining: 2m 35s
200:	learn: 1138684.3662493	test: 1092996.5449711	best: 1092996.5449711 (200)	total: 17.8s	remaining: 2m 39s
300:	learn: 1119839.4045053	test: 1081380.3116968	best: 1081380.3116968 (300)	total: 26.5s	remaining: 2m 29s
400:	learn: 1105188.9548111	test: 1074342.3464419	best: 1074342.3464419 (400)	total: 35.8s	remaining: 2m 22s
500:	learn: 1093438.5275343	test: 1069264.2627370	best: 1069264.2627370 (500)	total: 44.8s	remaining: 2m 14s
600:	learn: 1082314.0933192	test: 1065168.3603374	best: 1065168.3603374 (600)	total: 54.1s	remaining: 2m 6s
700:	learn: 1072329.5001840	test: 1062072.1986821	best: 1062072.1986821 (700)	total: 1m 4s	remaining: 1m 58s
800:	learn: 1064433.6241152	test: 1060168.6120755	best: 1060162.1682215 (798)	total: 1m 13s	remaining: 1m 50s
900:	learn: 1057178.32633

In [None]:
# Randomized search villa
from sklearn.metrics import mean_squared_error, r2_score

# Filtrera data (exempel med "villa")
lower_gs_villa = dataframes["villa"]["Pris"].quantile(0.05)
upper_gs_villa = dataframes["villa"]["Pris"].quantile(0.95)
df_gs_villa = dataframes["villa"][(dataframes["villa"]["Pris"]
    > lower_gs_villa) & (dataframes["villa"]["Pris"] < upper_gs_villa)].copy()

# target_gs_villa
target_gs_villa = "Pris"

# Drop-kolumner (om de finns)
drop_cols_gs_villa = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_gs_villa.columns]

# Features & target_gs_villa
X_gs_villa = df_gs_villa.drop(columns=[target_gs_villa] + drop_cols_gs_villa)
y_gs_villa = df_gs_villa[target_gs_villa]

# Kategoriska features
cat_features_gs_villa = X_gs_villa.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_gs_villa}")

# Train/test-split
X_gs_villa_train, X_gs_villa_test, y_gs_villa_train, y_gs_villa_test = train_test_split(X_gs_villa, y_gs_villa, test_size=0.2, random_state=11)

# Modell
cat_model = CatBoostRegressor(
    loss_function="RMSE",
    cat_features=cat_features_gs_villa,
    verbose=0,
    random_state=11)

# Parameterutrymme
param_dist = {
    "iterations": [500, 1000, 2000, 3000, 5000],
    "depth": [4, 6, 8, 10],
    "learning_rate": np.linspace(0.01, 0.3, 10),
    "l2_leaf_reg": [1, 3, 5, 7, 10],
    "bagging_temperature": [0, 0.5, 1, 2, 3, 5]}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error", # Optimera RMSE
    cv=3,
    verbose=2,
    random_state=11,
    n_jobs=-1)

logging.info("Startar RandomizedSearchCV för CatBoostRegressor...")

# Träna
random_search.fit(X_gs_villa_train, y_gs_villa_train)

# Resultat från CV
for i, params in enumerate(random_search.cv_results_["params"]):
    mean_score = random_search.cv_results_["mean_test_score"][i]
    std_score = random_search.cv_results_["std_test_score"][i]
    logging.info(f"Run {i+1}: Score={mean_score:.4f} (+/- {std_score:.4f}) | Params={params}")

# Bästa resultat
best_params = random_search.best_params_
best_score = random_search.best_score_

logging.info(f"Bästa parametrar: {best_params}")
logging.info(f"Bästa RMSE-score (cross val): {best_score:.4f}")

# Träna om bästa modellen
best_cat_model = random_search.best_estimator_
best_cat_model.fit(X_gs_villa_train, y_gs_villa_train, eval_set=(X_gs_villa_test, y_gs_villa_test), verbose=100)

# Utvärdera på testdatan
r2_test = best_cat_model.score(X_gs_villa_test, y_gs_villa_test)
logging.info(f"R² på testdata: {r2_test:.4f}")



Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# Random forest
# Filtrera ut extremvärden
lower_rf_villa = dataframes["villa"]["Pris"].quantile(0.05)
upper_rf_villa = dataframes["villa"]["Pris"].quantile(0.95)

df_filtered_rf_villa = dataframes["villa"][(dataframes["villa"]["Pris"]
    > lower_rf_villa) & (dataframes["villa"]["Pris"] < upper_rf_villa)].copy()

# Features & target_rf_villa
target_rf_villa = "Pris"

drop_cols = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea", "Adress"] if col in df_filtered_rf_villa.columns]
X_rf_villa = df_filtered_rf_villa.drop(columns=[target_rf_villa] + drop_cols)
y_rf_villa = df_filtered_rf_villa[target_rf_villa]

# Gör log-transform på target_rf_villa
y_rf_villa_log = np.log1p(y_rf_villa)  # log(Pris + 1) för att undvika log(0)

# Gör dummy-variabler för kategoriska features
X_rf_villa = pd.get_dummies(X_rf_villa, drop_first=True)

# Train/test split
X_rf_villa_train, X_rf_villa_test, y_rf_villa_train, y_rf_villa_test = train_test_split(
    X_rf_villa, y_rf_villa_log, test_size=0.2, random_state=11)

# Definiera modellen
rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=15,
    random_state=11,
    n_jobs=-1)

rf_model.fit(X_rf_villa_train, y_rf_villa_train)

# Utvärdera modellen
y_rf_villa_train_pred = rf_model.predict(X_rf_villa_train)
y_rf_villa_test_pred = rf_model.predict(X_rf_villa_test)

r2_train_rf_villa = r2_score(y_rf_villa_train, y_rf_villa_train_pred)
r2_test_rf_villa = r2_score(y_rf_villa_test, y_rf_villa_test_pred)

rmse_train_rf_villa = np.sqrt(mean_squared_error(y_rf_villa_train, y_rf_villa_train_pred))
rmse_test_rf_villa = np.sqrt(mean_squared_error(y_rf_villa_test, y_rf_villa_test_pred))

logging.info(f"RF på villa-data. R² train: {r2_train_rf_villa:.3f}, test: {r2_test_rf_villa:.3f}")
logging.info(f"RF på villa-data. RMSE train: {rmse_train_rf_villa:.3f}, test: {rmse_test_rf_villa:.3f}")


In [None]:
# LGBM model, villa
from sklearn.metrics import r2_score, mean_squared_error

# Filtrera ut extremvärden
lower_lgbm_villa = dataframes["villa"]["Pris"].quantile(0.05)
upper_lgbm_villa = dataframes["villa"]["Pris"].quantile(0.95)

df_filtered_lgbm_villa = dataframes["villa"][
    (dataframes["villa"]["Pris"] > lower_lgbm_villa) & (dataframes["villa"]["Pris"] < upper_lgbm_villa)].copy()

# Features & target
target_lgbm_villa = "Pris"
drop_cols_lgbm_villa = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_filtered_lgbm_villa.columns]

X_lgbm_villa = df_filtered_lgbm_villa.drop(columns=[target_lgbm_villa] + drop_cols_lgbm_villa)
y_lgbm_villa = df_filtered_lgbm_villa[target_lgbm_villa]

# Log-transformera target_lgbm_villa
y_lgbm_villa_log = np.log1p(y_lgbm_villa)


# Gör kategoriska kolumner rätt
# Konvertera textkolumner till "category" innan split
for col in X_lgbm_villa.select_dtypes(include="object").columns:
    X_lgbm_villa[col] = X_lgbm_villa[col].astype("category")

# Train/test split
X_lgbm_villa_train, X_lgbm_villa_test, y_lgbm_villa_train, y_lgbm_villa_test = train_test_split(
    X_lgbm_villa, y_lgbm_villa_log, test_size=0.2, random_state=11)

# Lista index för kategoriska variabler
cat_features_lgbm_villa = [X_lgbm_villa_train.columns.get_loc(col) for col in X_lgbm_villa_train.select_dtypes(include="category").columns]

logging.info(f"Kategoriska features: {[X_lgbm_villa_train.columns[i] for i in cat_features_lgbm_villa]}")


# LightGBM modell
lgbm_model = LGBMRegressor(
    n_estimators=2000,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=11,
    n_jobs=-1)

lgbm_model.fit(
    X_lgbm_villa_train, y_lgbm_villa_train,
    eval_set=[(X_lgbm_villa_test, y_lgbm_villa_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_villa,
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)])

# Utvärdering
# Prediktion på log-skalan
y_lgbm_villa_train_pred_log = lgbm_model.predict(X_lgbm_villa_train)
y_lgbm_villa_test_pred_log = lgbm_model.predict(X_lgbm_villa_test)

# R² på log-skala
r2_train_lgbm_villa_log_lgbm_villa = r2_score(y_lgbm_villa_train, y_lgbm_villa_train_pred_log)
r2_test_lgbm_villa_log_lgbm_villa = r2_score(y_lgbm_villa_test, y_lgbm_villa_test_pred_log)

logging.info(f"Log-skala R² -> train: {r2_train_lgbm_villa_log_lgbm_villa:.3f}, test: {r2_test_lgbm_villa_log_lgbm_villa:.3f}")

# Back-transform till SEK
y_lgbm_villa_train_pred = np.expm1(y_lgbm_villa_train_pred_log)
y_lgbm_villa_test_pred = np.expm1(y_lgbm_villa_test_pred_log)

y_lgbm_villa_train_true = np.expm1(y_lgbm_villa_train)
y_lgbm_villa_test_true = np.expm1(y_lgbm_villa_test)

# R² och RMSE och MAPE
r2_train_lgbm_villa = r2_score(y_lgbm_villa_train_true, y_lgbm_villa_train_pred)
r2_test_lgbm_villa = r2_score(y_lgbm_villa_test_true, y_lgbm_villa_test_pred)

rmse_train_lgbm_villa = np.sqrt(mean_squared_error(y_lgbm_villa_train_true, y_lgbm_villa_train_pred))
rmse_test_lgbm_villa = np.sqrt(mean_squared_error(y_lgbm_villa_test_true, y_lgbm_villa_test_pred))

logging.info(f"LGBM tränad på villa-data R² train: {r2_train_lgbm_villa:.3f}, test: {r2_test_lgbm_villa:.3f}")
logging.info(f"LGBM tränad på villa-data RMSE train: {rmse_train_lgbm_villa:,.0f}, test: {rmse_test_lgbm_villa:,.0f} kr")

# Spara modellen
joblib.dump((lgbm_model, X_lgbm_villa.columns), "villa_lgbm.pkl")

logging.info("Modellen är sparad som 'villa_lgbm.pkl'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11614
[LightGBM] [Info] Number of data points in the train set: 73949, number of used features: 7
[LightGBM] [Info] Start training from score 14.828778
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.37129	valid_0's l2: 0.137856
[200]	valid_0's rmse: 0.366457	valid_0's l2: 0.134291
[300]	valid_0's rmse: 0.365133	valid_0's l2: 0.133322
[400]	valid_0's rmse: 0.363996	valid_0's l2: 0.132493
[500]	valid_0's rmse: 0.363917	valid_0's l2: 0.132436
[600]	valid_0's rmse: 0.363579	valid_0's l2: 0.132189
[700]	valid_0's rmse: 0.363402	valid_0's l2: 0.132061
[800]	valid_0's rmse: 0.36338	valid_0's l2: 0.132045
[900]	valid_0's rmse: 0.363378	valid_0's l2: 0.132044
[1000]	valid_0's rmse: 0.363341	valid_0's l2: 0.132017
Early stopping, best iteration is:
[947]	valid_0's rmse:

   Nyckel  Våning  Rum  Boarea   Datum     Pris                Adress  \
0   31634       1  2.0      42  250630  1695000  Gamla Lasarettsgatan   
1   68573       2  3.0      79  250429  1765000        Gymnastikgatan   
3  142883       3  2.0      50  241120  2890000            Bruksgatan   
4   13020       2  2.0      55  250805  1712500   John Engellaus gata   
5   61959       7  3.0      79  250513  8600000     Slåttervallsgatan   

  Bostadstyp                  Område         Ort  Totalarea  
0   Lägenhet  Gamla Lasarettsområdet  Norrköping       42.5  
1   Lägenhet  Gamla Lasarettsområdet  Norrköping       79.0  
3   Lägenhet  Sandarna & Fixfabriken    Göteborg       50.5  
4   Lägenhet   Västra Munktellstaden  Eskilstuna       55.0  
5   Lägenhet   Norra Djurgårdsstaden   Stockholm       79.0  


In [16]:
# LGBM model för lägenhet med GS parametrar
from sklearn.metrics import r2_score, mean_squared_error

# Filtrera ut extremvärden
lower_lgbm_lagenhet = dataframes["lägenhet"]["Pris"].quantile(0.05)
upper_lgbm_lagenhet = dataframes["lägenhet"]["Pris"].quantile(0.95)

df_filtered_lgbm_lagenhet = dataframes["lägenhet"][
    (dataframes["lägenhet"]["Pris"] > lower_lgbm_lagenhet) & (dataframes["lägenhet"]["Pris"] < upper_lgbm_lagenhet)].copy()

# Features & target
target_lgbm_lagenhet = "Pris"
drop_cols_lgbm_lagenhet = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_filtered_lgbm_lagenhet.columns]

X_lgbm_lagenhet = df_filtered_lgbm_lagenhet.drop(columns=[target_lgbm_lagenhet] + drop_cols_lgbm_lagenhet)
y_lgbm_lagenhet = df_filtered_lgbm_lagenhet[target_lgbm_lagenhet]

# Log-transformera target_lgbm_lagenhet
y_lgbm_lagenhet_log = np.log1p(y_lgbm_lagenhet)


# Gör kategoriska kolumner rätt
# Konvertera textkolumner till "category" innan split
for col in X_lgbm_lagenhet.select_dtypes(include="object").columns:
    X_lgbm_lagenhet[col] = X_lgbm_lagenhet[col].astype("category")

# Train/test split
X_lgbm_lagenhet_train, X_lgbm_lagenhet_test, y_lgbm_lagenhet_train, y_lgbm_lagenhet_test = train_test_split(
    X_lgbm_lagenhet, y_lgbm_lagenhet_log, test_size=0.2, random_state=11)

# Lista index för kategoriska variabler
cat_features_lgbm_lagenhet = [X_lgbm_lagenhet_train.columns.get_loc(col) for col in X_lgbm_lagenhet_train.select_dtypes(include="category").columns]

logging.info(f"Kategoriska features: {[X_lgbm_lagenhet_train.columns[i] for i in cat_features_lgbm_lagenhet]}")


# LightGBM modell
lgbm_model_lagenhet = LGBMRegressor(
    n_estimators=2000,
    max_depth=10, # 12
    learning_rate=0.1, # 0.05
    subsample=0.6, #0.8
    colsample_bytree=0.6, # 0.8
    reg_lambda=10, # 1.0
    random_state=11,
    n_jobs=-1)

lgbm_model_lagenhet.fit(
    X_lgbm_lagenhet_train, y_lgbm_lagenhet_train,
    eval_set=[(X_lgbm_lagenhet_test, y_lgbm_lagenhet_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_lagenhet,
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)])

# Utvärdering
# Prediktion på log-skalan
y_lgbm_lagenhet_train_pred_log = lgbm_model_lagenhet.predict(X_lgbm_lagenhet_train)
y_lgbm_lagenhet_test_pred_log = lgbm_model_lagenhet.predict(X_lgbm_lagenhet_test)

# R² på log-skala
r2_train_lgbm_lagenhet_log_lgbm_lagenhet = r2_score(y_lgbm_lagenhet_train, y_lgbm_lagenhet_train_pred_log)
r2_test_lgbm_lagenhet_log_lgbm_lagenhet = r2_score(y_lgbm_lagenhet_test, y_lgbm_lagenhet_test_pred_log)

logging.info(f"Log-skala R² -> train: {r2_train_lgbm_lagenhet_log_lgbm_lagenhet:.3f}, test: {r2_test_lgbm_lagenhet_log_lgbm_lagenhet:.3f}")

# Back-transform till SEK
y_lgbm_lagenhet_train_pred = np.expm1(y_lgbm_lagenhet_train_pred_log)
y_lgbm_lagenhet_test_pred = np.expm1(y_lgbm_lagenhet_test_pred_log)

y_lgbm_lagenhet_train_true = np.expm1(y_lgbm_lagenhet_train)
y_lgbm_lagenhet_test_true = np.expm1(y_lgbm_lagenhet_test)

# R² och RMSE i SEK
r2_train_lgbm_lagenhet = r2_score(y_lgbm_lagenhet_train_true, y_lgbm_lagenhet_train_pred)
r2_test_lgbm_lagenhet = r2_score(y_lgbm_lagenhet_test_true, y_lgbm_lagenhet_test_pred)

rmse_train_lgbm_lagenhet = np.sqrt(mean_squared_error(y_lgbm_lagenhet_train_true, y_lgbm_lagenhet_train_pred))
rmse_test_lgbm_lagenhet = np.sqrt(mean_squared_error(y_lgbm_lagenhet_test_true, y_lgbm_lagenhet_test_pred))

logging.info(f"LGBM tränad på lägenhet-data R² train: {r2_train_lgbm_lagenhet:.3f}, test: {r2_test_lgbm_lagenhet:.3f}")
logging.info(f"LGBM tränad på lägenhet-data RMSE train: {rmse_train_lgbm_lagenhet:,.0f}, test: {rmse_test_lgbm_lagenhet:,.0f} kr")

# Spara modellen
joblib.dump((lgbm_model_lagenhet, X_lgbm_lagenhet.columns), "lagenhet_lgbm.pkl")

logging.info("Modellen är sparad som 'lagenhet_lgbm.pkl'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8208
[LightGBM] [Info] Number of data points in the train set: 82223, number of used features: 6
[LightGBM] [Info] Start training from score 14.668723
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.176914	valid_0's l2: 0.0312984
[200]	valid_0's rmse: 0.171549	valid_0's l2: 0.0294289
[300]	valid_0's rmse: 0.16886	valid_0's l2: 0.0285136
[400]	valid_0's rmse: 0.167038	valid_0's l2: 0.0279018
[500]	valid_0's rmse: 0.165845	valid_0's l2: 0.0275046
[600]	valid_0's rmse: 0.164865	valid_0's l2: 0.0271804
[700]	valid_0's rmse: 0.164294	valid_0's l2: 0.0269925
[800]	valid_0's rmse: 0.163812	valid_0's l2: 0.0268345
[900]	valid_0's rmse: 0.1633	valid_0's l2: 0.0266668
[1000]	valid_0's rmse: 0.162921	valid_0's l2: 0.0265431
[1100]	valid_0's rmse: 0.16261	valid_0's l2: 0.0

In [52]:
# Grid search lgbm lägenhet
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Definiera parameterutrymme
param_grid = {
    "n_estimators": [1000, 2000, 3000],
    "max_depth": [6, 8, 10, 12, -1],   # -1 = ingen begränsning
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [0, 1, 5, 10]}

# Basmodell
lgbm_base = LGBMRegressor(
    random_state=11,
    n_jobs=-1)

# Grid Search
grid_search = GridSearchCV(
    estimator=lgbm_base,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",  # Optimera RMSE
    cv=3,
    verbose=2,
    n_jobs=-1)

logging.info("Startar GridSearchCV för LGBM...")

grid_search.fit(
    X_lgbm_lagenhet_train,
    y_lgbm_lagenhet_train,
    categorical_feature=cat_features_lgbm_lagenhet)

# Bästa parametrar
best_params = grid_search.best_params_
best_score = grid_search.best_score_

logging.info(f"Bästa parametrar: {best_params}")
logging.info(f"Bästa CV-score (RMSE): {best_score:.4f}")

# Träna om bästa modellen på hela träningsdatan
best_lgbm_model = grid_search.best_estimator_
best_lgbm_model.fit(
    X_lgbm_lagenhet_train,
    y_lgbm_lagenhet_train,
    eval_set=[(X_lgbm_lagenhet_test, y_lgbm_lagenhet_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_lagenhet,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)])

# Utvärdering på test
y_pred_test_log = best_lgbm_model.predict(X_lgbm_lagenhet_test)
y_pred_test = np.expm1(y_pred_test_log)
y_true_test = np.expm1(y_lgbm_lagenhet_test)

r2_test = r2_score(y_true_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_true_test, y_pred_test))

logging.info(f"Bästa LGBM R² test: {r2_test:.3f}")
logging.info(f"Bästa LGBM RMSE test: {rmse_test:,.0f} kr")

# Spara bästa modell
joblib.dump((best_lgbm_model, X_lgbm_lagenhet.columns), "lagenhet_lgbm_best.pkl")
logging.info("Bästa modellen är sparad som 'lagenhet_lgbm_best.pkl'")


Fitting 3 folds for each of 1620 candidates, totalling 4860 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8208
[LightGBM] [Info] Number of data points in the train set: 82223, number of used features: 6
[LightGBM] [Info] Start training from score 14.668723
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8208
[LightGBM] [Info] Number of data points in the train set: 82223, number of used features: 6
[LightGBM] [Info] Start training from score 14.668723
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.176914	valid_0's l2: 0.0312984
[200]	valid_0's rmse: 0.171549	valid_0's l2: 0.0294289
[300]	valid_0's rmse: 0.16886	valid_0's l2: 0.0285136
[400]	valid_0's rmse: 0.1

In [None]:

# Cat boost med randomized-searchade parametrar, lägenhet
from sklearn.metrics import r2_score, mean_squared_error # Deklarerar om

lower_cb_lagenhet = dataframes["lägenhet"]["Pris"].quantile(0.05)
upper_cb_lagenhet = dataframes["lägenhet"]["Pris"].quantile(0.95)

# Filtrera bort rader utanför percentilgränserna
df_filtered_cb_lagenhet = dataframes["lägenhet"][(dataframes["lägenhet"]["Pris"] > lower_cb_lagenhet) & (dataframes["lägenhet"]["Pris"] < upper_cb_lagenhet)]

# Välj dataframe
df_cb_lagenhet = df_filtered_cb_lagenhet.copy()

# Målvariabeln heter 'Pris'
target_cb_lagenhet = "Pris"

# Ta bort "Datum" och "Bostadstyp" om de finns
drop_cols_cb_lagenhet = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_cb_lagenhet.columns]
X_cb_lagenhet = df_cb_lagenhet.drop(columns=[target_cb_lagenhet] + drop_cols_cb_lagenhet)
y_cb_lagenhet = df_cb_lagenhet[target_cb_lagenhet]

# Identifiera kategoriska kolumner
cat_features_cb_lagenhet = X_cb_lagenhet.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_cb_lagenhet}")

# Train/test split
X_cb_lagenhet_train, X_cb_lagenhet_test, y_cb_lagenhet_train, y_cb_lagenhet_test = train_test_split(
    X_cb_lagenhet, y_cb_lagenhet, test_size=0.2, random_state=11)

# CatBoost-modell
model_cb_lagenhet = CatBoostRegressor(
    iterations=2000,
    depth=10,
    learning_rate=0.1,
    loss_function="RMSE",
    cat_features=cat_features_cb_lagenhet,
    verbose=100,
    bagging_temperature=0.25,
    l2_leaf_reg=7)

# Träna modellen
model_cb_lagenhet.fit(X_cb_lagenhet_train, y_cb_lagenhet_train, eval_set=(X_cb_lagenhet_test, y_cb_lagenhet_test), use_best_model=True)

# Utvärdera
r2_train_cb_lagenhet = model_cb_lagenhet.score(X_cb_lagenhet_train, y_cb_lagenhet_train)
r2_test_cb_lagenhet = model_cb_lagenhet.score(X_cb_lagenhet_test, y_cb_lagenhet_test)

# Prediktioner
y_pred_train_cb_lagenhet = model_cb_lagenhet.predict(X_cb_lagenhet_train)
y_pred_test_cb_lagenhet = model_cb_lagenhet.predict(X_cb_lagenhet_test)

# RMSE
rmse_train_cb_lagenhet = np.sqrt(mean_squared_error(y_cb_lagenhet_train, y_pred_train_cb_lagenhet))
rmse_test_cb_lagenhet = np.sqrt(mean_squared_error(y_cb_lagenhet_test, y_pred_test_cb_lagenhet))

logging.info(f"CatBoost tränad på lägenhet-data. "
    f"R² train: {r2_train_cb_lagenhet:.3f}, test: {r2_test_cb_lagenhet:.3f}, "
    f"RMSE train: {rmse_train_cb_lagenhet:.0f}, test: {rmse_test_cb_lagenhet:.0f}")

# Feature importance
feature_importances_cb_lagenhet = model_cb_lagenhet.get_feature_importance()
feature_names_cb_lagenhet = np.array(X_cb_lagenhet.columns)

# Sortera efter betydelse
sorted_idx_cb_lagenhet = feature_importances_cb_lagenhet.argsort()

# Logga topp 5 features
top_n_cb_lagenhet = 5
top_idx_cb_lagenhet = feature_importances_cb_lagenhet.argsort()[::-1][:top_n_cb_lagenhet]  # index på topp N
top_features_cb_lagenhet = [(feature_names_cb_lagenhet[i], feature_importances_cb_lagenhet[i]) for i in top_idx_cb_lagenhet]

logging.info("Topp 5 viktigaste features för pris:")
for name, importance in top_features_cb_lagenhet:
    logging.info(f"- {name}: {importance:.2f}")

# Spara modellen
joblib.dump(model_cb_lagenhet, "lagenhet_cb.pkl")

logging.info("Modellen är sparad som 'lagenhet_cb.pkl'")

0:	learn: 1270382.2465881	test: 1286212.2441417	best: 1286212.2441417 (0)	total: 112ms	remaining: 3m 44s
100:	learn: 500145.5428937	test: 469316.8376135	best: 469316.8376135 (100)	total: 11s	remaining: 3m 26s
200:	learn: 466765.7845243	test: 445541.3509018	best: 445541.3509018 (200)	total: 21.7s	remaining: 3m 14s
300:	learn: 443837.5009824	test: 432740.6277687	best: 432740.6277687 (300)	total: 33.4s	remaining: 3m 8s
400:	learn: 428336.3092626	test: 425319.6955994	best: 425319.6955994 (400)	total: 44.2s	remaining: 2m 56s
500:	learn: 415508.8589952	test: 420140.9770264	best: 420140.9770264 (500)	total: 55s	remaining: 2m 44s
600:	learn: 404477.4632128	test: 416262.8730167	best: 416262.8730167 (600)	total: 1m 5s	remaining: 2m 33s
700:	learn: 393440.0157354	test: 412906.8572493	best: 412906.8572493 (700)	total: 1m 17s	remaining: 2m 22s
800:	learn: 384242.6190253	test: 410721.5143266	best: 410721.5143266 (800)	total: 1m 28s	remaining: 2m 12s
900:	learn: 375752.1184114	test: 408642.6880783	be

In [44]:
# Randomized search för CB lägenhet
# Definiera parameterutrymme
param_dist = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "l2_leaf_reg": [1, 3, 5, 7, 10],
    "bagging_temperature": [0.25, 1, 2, 5, 10],
    "iterations": [500, 1000, 2000]}  # early stopping gör att vi inte alltid kör klart

# Basmodell
cb_base = CatBoostRegressor(
    loss_function="RMSE",
    cat_features=cat_features_cb_lagenhet,
    verbose=0,  # Stäng av spam under sökning
    random_state=11)

# Randomized Search
random_search_cb = RandomizedSearchCV(
    estimator=cb_base,
    param_distributions=param_dist,
    n_iter=30,  # antal slumpade kombinationer
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=11)

logging.info("Startar RandomizedSearchCV för CatBoost...")

# Fit modellen
random_search_cb.fit(
    X_cb_lagenhet_train, y_cb_lagenhet_train,
    eval_set=(X_cb_lagenhet_test, y_cb_lagenhet_test),
    use_best_model=True)

# Bästa parametrar
best_params_cb = random_search_cb.best_params_
best_score_cb = random_search_cb.best_score_

logging.info(f"Bästa CatBoost-parametrar: {best_params_cb}")
logging.info(f"Bästa CV-score (RMSE): {best_score_cb:.4f}")

# Träna om bästa modellen
best_cb_model = random_search_cb.best_estimator_
best_cb_model.fit(
    X_cb_lagenhet_train, y_cb_lagenhet_train,
    eval_set=(X_cb_lagenhet_test, y_cb_lagenhet_test),
    use_best_model=True,
    verbose=100)

# Utvärdera på test
r2_train_cb = best_cb_model.score(X_cb_lagenhet_train, y_cb_lagenhet_train)
r2_test_cb = best_cb_model.score(X_cb_lagenhet_test, y_cb_lagenhet_test)

logging.info(f"Bästa CatBoost R² train: {r2_train_cb:.3f}, test: {r2_test_cb:.3f}")

Fitting 3 folds for each of 30 candidates, totalling 90 fits
0:	learn: 1248948.0809594	test: 1261540.1712568	best: 1261540.1712568 (0)	total: 133ms	remaining: 4m 26s
100:	learn: 497330.5301038	test: 465227.1123951	best: 465227.1123951 (100)	total: 11.7s	remaining: 3m 40s
200:	learn: 463656.6606406	test: 441045.4945333	best: 441045.4945333 (200)	total: 23.5s	remaining: 3m 30s
300:	learn: 443275.7575992	test: 429745.7049866	best: 429741.3288174 (299)	total: 34.7s	remaining: 3m 15s
400:	learn: 427454.2216338	test: 422777.5933362	best: 422777.5933362 (400)	total: 45.6s	remaining: 3m 1s
500:	learn: 415894.3526547	test: 418192.4985290	best: 418192.4985290 (500)	total: 56.8s	remaining: 2m 50s
600:	learn: 405059.9073004	test: 414960.6493344	best: 414960.6493344 (600)	total: 1m 7s	remaining: 2m 37s
700:	learn: 394538.1767363	test: 412228.2540381	best: 412228.2540381 (700)	total: 1m 19s	remaining: 2m 26s
800:	learn: 385736.3835858	test: 410143.1062782	best: 410124.4069893 (798)	total: 1m 30s	rem

In [18]:
# LGBM model för fritidshus med randomized searchade parametrar
from sklearn.metrics import r2_score, mean_squared_error

# Filtrera ut extremvärden
lower_lgbm_fritidshus = dataframes["fritidshus"]["Pris"].quantile(0.05)
upper_lgbm_fritidshus = dataframes["fritidshus"]["Pris"].quantile(0.95)

df_filtered_lgbm_fritidshus = dataframes["fritidshus"][
    (dataframes["fritidshus"]["Pris"] > lower_lgbm_fritidshus) & (dataframes["fritidshus"]["Pris"] < upper_lgbm_fritidshus)].copy()

# Features & target
target_lgbm_fritidshus = "Pris"
drop_cols_lgbm_fritidshus = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_filtered_lgbm_fritidshus.columns]

X_lgbm_fritidshus = df_filtered_lgbm_fritidshus.drop(columns=[target_lgbm_fritidshus] + drop_cols_lgbm_fritidshus)
y_lgbm_fritidshus = df_filtered_lgbm_fritidshus[target_lgbm_fritidshus]

# Log-transformera target_lgbm_fritidshus
y_lgbm_fritidshus_log = np.log1p(y_lgbm_fritidshus)


# Gör kategoriska kolumner rätt
# Konvertera textkolumner till "category" innan split
for col in X_lgbm_fritidshus.select_dtypes(include="object").columns:
    X_lgbm_fritidshus[col] = X_lgbm_fritidshus[col].astype("category")

# Train/test split
X_lgbm_fritidshus_train, X_lgbm_fritidshus_test, y_lgbm_fritidshus_train, y_lgbm_fritidshus_test = train_test_split(
    X_lgbm_fritidshus, y_lgbm_fritidshus_log, test_size=0.2, random_state=11)

# Lista index för kategoriska variabler
cat_features_lgbm_fritidshus = [X_lgbm_fritidshus_train.columns.get_loc(col) for col in X_lgbm_fritidshus_train.select_dtypes(include="category").columns]

logging.info(f"Kategoriska features: {[X_lgbm_fritidshus_train.columns[i] for i in cat_features_lgbm_fritidshus]}")


# LightGBM modell
lgbm_model_fritidshus = LGBMRegressor(
    n_estimators=1000,
    max_depth=12,
    learning_rate=0.01,
    subsample=1.0,
    colsample_bytree=0.6,
    reg_lambda=10,
    random_state=11,
    n_jobs=-1)

lgbm_model_fritidshus.fit(
    X_lgbm_fritidshus_train, y_lgbm_fritidshus_train,
    eval_set=[(X_lgbm_fritidshus_test, y_lgbm_fritidshus_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_fritidshus,
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)])

# Utvärdering
# Prediktion på log-skalan
y_lgbm_fritidshus_train_pred_log = lgbm_model_fritidshus.predict(X_lgbm_fritidshus_train)
y_lgbm_fritidshus_test_pred_log = lgbm_model_fritidshus.predict(X_lgbm_fritidshus_test)

# R² på log-skala
r2_train_lgbm_fritidshus_log_lgbm_fritidshus = r2_score(y_lgbm_fritidshus_train, y_lgbm_fritidshus_train_pred_log)
r2_test_lgbm_fritidshus_log_lgbm_fritidshus = r2_score(y_lgbm_fritidshus_test, y_lgbm_fritidshus_test_pred_log)

logging.info(f"Log-skala R² -> train: {r2_train_lgbm_fritidshus_log_lgbm_fritidshus:.3f}, test: {r2_test_lgbm_fritidshus_log_lgbm_fritidshus:.3f}")

# Back-transform till SEK
y_lgbm_fritidshus_train_pred = np.expm1(y_lgbm_fritidshus_train_pred_log)
y_lgbm_fritidshus_test_pred = np.expm1(y_lgbm_fritidshus_test_pred_log)

y_lgbm_fritidshus_train_true = np.expm1(y_lgbm_fritidshus_train)
y_lgbm_fritidshus_test_true = np.expm1(y_lgbm_fritidshus_test)

# R² och RMSE i SEK
r2_train_lgbm_fritidshus = r2_score(y_lgbm_fritidshus_train_true, y_lgbm_fritidshus_train_pred)
r2_test_lgbm_fritidshus = r2_score(y_lgbm_fritidshus_test_true, y_lgbm_fritidshus_test_pred)

rmse_train_lgbm_fritidshus = np.sqrt(mean_squared_error(y_lgbm_fritidshus_train_true, y_lgbm_fritidshus_train_pred))
rmse_test_lgbm_fritidshus = np.sqrt(mean_squared_error(y_lgbm_fritidshus_test_true, y_lgbm_fritidshus_test_pred))

logging.info(f"LGBM tränad på fritidshus-data R² train: {r2_train_lgbm_fritidshus:.3f}, test: {r2_test_lgbm_fritidshus:.3f}")
logging.info(f"LGBM tränad på fritidshus-data RMSE train: {rmse_train_lgbm_fritidshus:,.0f}, test: {rmse_test_lgbm_fritidshus:,.0f} kr")

# Spara modellen
joblib.dump((lgbm_model_fritidshus, X_lgbm_fritidshus.columns), "fritidshus_lgbm.pkl")

logging.info("Modellen är sparad som 'fritidshus_lgbm.pkl'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2049
[LightGBM] [Info] Number of data points in the train set: 8348, number of used features: 7
[LightGBM] [Info] Start training from score 14.226108
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.535205	valid_0's l2: 0.286444
[200]	valid_0's rmse: 0.49155	valid_0's l2: 0.241621
[300]	valid_0's rmse: 0.475459	valid_0's l2: 0.226061
[400]	valid_0's rmse: 0.469301	valid_0's l2: 0.220243
[500]	valid_0's rmse: 0.466483	valid_0's l2: 0.217606
[600]	valid_0's rmse: 0.465124	valid_0's l2: 0.216341
[700]	valid_0's rmse: 0.464548	valid_0's l2: 0.215805
[800]	valid_0's rmse: 0.464448	valid_0's l2: 0.215712
[900]	valid_0's rmse: 0.464398	valid_0's l2: 0.215666
Early stopping, best iteration is:
[876]	valid_0's rmse: 0.464342	valid_0's l2: 0.215613


In [47]:
# Randomized search LGBM fritidshus
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import joblib

# Definiera parameterutrymme
param_dist = {
    "n_estimators": [500, 1000, 2000, 3000],
    "max_depth": [6, 8, 10, 12, -1],   # -1 = ingen begränsning
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [0, 1, 5, 10]}

# Basmodell
lgbm_base_fritidshus = LGBMRegressor(random_state=11, n_jobs=-1)

# Randomized Search
random_search_lgbm_fritidshus = RandomizedSearchCV(
    estimator=lgbm_base_fritidshus,
    param_distributions=param_dist,
    n_iter=30,   # antal kombinationer att testa
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=11)

logging.info("Startar RandomizedSearchCV för LGBM (fritidshus)...")

# Kör sökning
random_search_lgbm_fritidshus.fit(
    X_lgbm_fritidshus_train, y_lgbm_fritidshus_train,
    categorical_feature=cat_features_lgbm_fritidshus)

# Bästa parametrar
best_params_fritidshus = random_search_lgbm_fritidshus.best_params_
best_score_fritidshus = random_search_lgbm_fritidshus.best_score_

logging.info(f"Bästa LGBM-parametrar (fritidshus): {best_params_fritidshus}")
logging.info(f"Bästa CV-score (RMSE, log-skala): {best_score_fritidshus:.4f}")

# Träna om bästa modellen på hela träningsdatan
best_lgbm_fritidshus = random_search_lgbm_fritidshus.best_estimator_
best_lgbm_fritidshus.fit(
    X_lgbm_fritidshus_train, y_lgbm_fritidshus_train,
    eval_set=[(X_lgbm_fritidshus_test, y_lgbm_fritidshus_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_fritidshus,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)])

# Utvärdera på test
y_test_pred_log = best_lgbm_fritidshus.predict(X_lgbm_fritidshus_test)
y_test_pred = np.expm1(y_test_pred_log)
y_test_true = np.expm1(y_lgbm_fritidshus_test)

r2_test = r2_score(y_test_true, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test_true, y_test_pred))

logging.info(f"Bästa LGBM R² test (fritidshus): {r2_test:.3f}")
logging.info(f"Bästa LGBM RMSE test (fritidshus): {rmse_test:,.0f} kr")

# Spara modellen
joblib.dump((best_lgbm_fritidshus, X_lgbm_fritidshus.columns), "fritidshus_lgbm_best.pkl")
logging.info("Bästa modellen är sparad som 'fritidshus_lgbm_best.pkl'")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2049
[LightGBM] [Info] Number of data points in the train set: 8348, number of used features: 7
[LightGBM] [Info] Start training from score 14.226108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2049
[LightGBM] [Info] Number of data points in the train set: 8348, number of used features: 7
[LightGBM] [Info] Start training from score 14.226108
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.535205	valid_0's l2: 0.286444
[200]	valid_0's rmse: 0.49155	valid_0's l2: 0.241621
[300]	valid_0's rmse: 0.475459	valid_0's l2: 0.226061
[400]	valid_0's rmse: 0.469301	val

In [31]:

# Catboost med randomized-searchade parametrar, fritidshus
from sklearn.metrics import r2_score, mean_squared_error # Deklarerar om

lower_cb_fritidshus = dataframes["fritidshus"]["Pris"].quantile(0.05)
upper_cb_fritidshus = dataframes["fritidshus"]["Pris"].quantile(0.95)

# Filtrera bort rader utanför percentilgränserna
df_filtered_cb_fritidshus = dataframes["fritidshus"][(dataframes["fritidshus"]["Pris"] > lower_cb_fritidshus) & (dataframes["fritidshus"]["Pris"] < upper_cb_fritidshus)]

# Välj dataframe
df_cb_fritidshus = df_filtered_cb_fritidshus.copy()

# Målvariabeln heter 'Pris'
target_cb_fritidshus = "Pris"

# Ta bort "Datum" och "Bostadstyp" om de finns
drop_cols_cb_fritidshus = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_cb_fritidshus.columns]
X_cb_fritidshus = df_cb_fritidshus.drop(columns=[target_cb_fritidshus] + drop_cols_cb_fritidshus)
y_cb_fritidshus = df_cb_fritidshus[target_cb_fritidshus]

# Identifiera kategoriska kolumner
cat_features_cb_fritidshus = X_cb_fritidshus.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_cb_fritidshus}")

# Train/test split
X_cb_fritidshus_train, X_cb_fritidshus_test, y_cb_fritidshus_train, y_cb_fritidshus_test = train_test_split(
    X_cb_fritidshus, y_cb_fritidshus, test_size=0.2, random_state=11)

# CatBoost-modell
model_cb_fritidshus = CatBoostRegressor(
    iterations=1000, # 3000
    depth=6, # 10
    learning_rate=0.05, # 0.1
    loss_function="RMSE",
    cat_features=cat_features_cb_fritidshus,
    verbose=100,
    bagging_temperature=3, # 0.5
    l2_leaf_reg=5) # 10

# Träna modellen
model_cb_fritidshus.fit(X_cb_fritidshus_train, y_cb_fritidshus_train, eval_set=(X_cb_fritidshus_test, y_cb_fritidshus_test), use_best_model=True)

# Utvärdera
r2_train_cb_fritidshus = model_cb_fritidshus.score(X_cb_fritidshus_train, y_cb_fritidshus_train)
r2_test_cb_fritidshus = model_cb_fritidshus.score(X_cb_fritidshus_test, y_cb_fritidshus_test)

# Prediktion
y_pred_train_cb_fritidshus = model_cb_fritidshus.predict(X_cb_fritidshus_train)
y_pred_test_cb_fritidshus = model_cb_fritidshus.predict(X_cb_fritidshus_test)

# RMSE
rmse_train_cb_fritidshus = np.sqrt(mean_squared_error(y_cb_fritidshus_train, y_pred_train_cb_fritidshus))
rmse_test_cb_fritidshus = np.sqrt(mean_squared_error(y_cb_fritidshus_test, y_pred_test_cb_fritidshus))

logging.info(f"CatBoost tränad på fritidshus-data. "
    f"R² train: {r2_train_cb_fritidshus:.3f}, test: {r2_test_cb_fritidshus:.3f}, "
    f"RMSE train: {rmse_train_cb_fritidshus:.0f}, test: {rmse_test_cb_fritidshus:.0f}")

# Feature importance
feature_importances_cb_fritidshus = model_cb_fritidshus.get_feature_importance()
feature_names_cb_fritidshus = np.array(X_cb_fritidshus.columns)

# Sortera efter betydelse
sorted_idx_cb_fritidshus = feature_importances_cb_fritidshus.argsort()

# Logga topp 5 features
top_n_cb_fritidshus = 5
top_idx_cb_fritidshus = feature_importances_cb_fritidshus.argsort()[::-1][:top_n_cb_fritidshus]  # index på topp N
top_features_cb_fritidshus = [(feature_names_cb_fritidshus[i], feature_importances_cb_fritidshus[i]) for i in top_idx_cb_fritidshus]

logging.info("Topp 5 viktigaste features för pris:")
for name, importance in top_features_cb_fritidshus:
    logging.info(f"- {name}: {importance:.2f}")

# Spara modellen
joblib.dump(model_cb_fritidshus, "fritidshus_cb.pkl")

logging.info("Modellen är sparad som 'fritidshus_cb.pkl'")

0:	learn: 1127191.8185402	test: 1117557.0613531	best: 1117557.0613531 (0)	total: 39.5ms	remaining: 39.4s
100:	learn: 838217.7313457	test: 822687.7241297	best: 822687.7241297 (100)	total: 4.16s	remaining: 37s
200:	learn: 818256.3878541	test: 813561.3682400	best: 813561.3682400 (200)	total: 7.94s	remaining: 31.6s
300:	learn: 800904.0907475	test: 808126.7412364	best: 808022.6264001 (296)	total: 11.9s	remaining: 27.5s
400:	learn: 785658.1717243	test: 804042.9208790	best: 804042.9208790 (400)	total: 15.7s	remaining: 23.5s
500:	learn: 772088.9365895	test: 802552.1658008	best: 802441.9279216 (498)	total: 19.6s	remaining: 19.5s
600:	learn: 759871.8053485	test: 800255.1668322	best: 800247.2666139 (599)	total: 23.5s	remaining: 15.6s
700:	learn: 749171.8761364	test: 799955.8911928	best: 799619.0982075 (645)	total: 27.4s	remaining: 11.7s
800:	learn: 739926.2632814	test: 800467.5549465	best: 799619.0982075 (645)	total: 31.2s	remaining: 7.75s
900:	learn: 731730.5352962	test: 799678.3689549	best: 799

In [30]:
# Randomized search för catboost fritidshus
from sklearn.metrics import r2_score, mean_squared_error

# Identifiera kategoriska kolumner för fritidshus
cat_features_cb_fritidshus = X_cb_fritidshus.select_dtypes(include=["object", "category"]).columns.tolist()
logging.info(f"Kategoriska variabler för fritidshus: {cat_features_cb_fritidshus}")

# Definiera parameterutrymme för CatBoost
param_dist_cb_fritidshus = {
    "depth": [6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "iterations": [1000, 2000, 3000],
    "l2_leaf_reg": [1, 3, 5, 10],
    "bagging_temperature": [0.5, 1, 3, 5]}

# Basmodell
cb_base_fritidshus = CatBoostRegressor(
    loss_function="RMSE",
    cat_features=cat_features_cb_fritidshus,
    verbose=0,
    random_state=11)

# Randomized Search
random_search_cb_fritidshus = RandomizedSearchCV(
    estimator=cb_base_fritidshus,
    param_distributions=param_dist_cb_fritidshus,
    n_iter=20,   # slumpade kombinationer
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=11)

logging.info("Startar RandomizedSearchCV för CatBoost (fritidshus)...")

# Kör sökning
random_search_cb_fritidshus.fit(X_cb_fritidshus_train, y_cb_fritidshus_train)

# Bästa parametrar
best_params_cb_fritidshus = random_search_cb_fritidshus.best_params_
best_score_cb_fritidshus = random_search_cb_fritidshus.best_score_

logging.info(f"Bästa CatBoost-parametrar (fritidshus): {best_params_cb_fritidshus}")
logging.info(f"Bästa CV-score (RMSE): {best_score_cb_fritidshus:.4f}")

# Träna om bästa modellen på hela träningsdatan
best_cb_model_fritidshus = random_search_cb_fritidshus.best_estimator_
best_cb_model_fritidshus.fit(
    X_cb_fritidshus_train, y_cb_fritidshus_train,
    eval_set=(X_cb_fritidshus_test, y_cb_fritidshus_test),
    use_best_model=True)

# Utvärdera på test
y_test_pred_cb_fritidshus = best_cb_model_fritidshus.predict(X_cb_fritidshus_test)
r2_test_cb_fritidshus = r2_score(y_cb_fritidshus_test, y_test_pred_cb_fritidshus)
rmse_test_cb_fritidshus = np.sqrt(mean_squared_error(y_cb_fritidshus_test, y_test_pred_cb_fritidshus))

logging.info(f"Bästa CatBoost (fritidshus) R² test: {r2_test_cb_fritidshus:.3f}")
logging.info(f"Bästa CatBoost (fritidshus) RMSE test: {rmse_test_cb_fritidshus:,.0f} kr")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [29]:
# Catboost med randomized-searchade parametrar, radhus
from sklearn.metrics import r2_score, mean_squared_error # Deklarerar om

lower_cb_radhus = dataframes["radhus"]["Pris"].quantile(0.05)
upper_cb_radhus = dataframes["radhus"]["Pris"].quantile(0.95)

# Filtrera bort rader utanför percentilgränserna
df_filtered_cb_radhus = dataframes["radhus"][(dataframes["radhus"]["Pris"] > lower_cb_radhus) & (dataframes["radhus"]["Pris"] < upper_cb_radhus)]

# Välj dataframe
df_cb_radhus = df_filtered_cb_radhus.copy()

# Målvariabeln heter 'Pris'
target_cb_radhus = "Pris"

# Ta bort "Datum" och "Bostadstyp" om de finns
drop_cols_cb_radhus = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_cb_radhus.columns]
X_cb_radhus = df_cb_radhus.drop(columns=[target_cb_radhus] + drop_cols_cb_radhus)
y_cb_radhus = df_cb_radhus[target_cb_radhus]

# Identifiera kategoriska kolumner
cat_features_cb_radhus = X_cb_radhus.select_dtypes(include=["object", "category"]).columns.tolist()

logging.info(f"Kategoriska variabler som används: {cat_features_cb_radhus}")

# Train/test split
X_cb_radhus_train, X_cb_radhus_test, y_cb_radhus_train, y_cb_radhus_test = train_test_split(
    X_cb_radhus, y_cb_radhus, test_size=0.2, random_state=11)

# CatBoost-modell
model_cb_radhus = CatBoostRegressor(
    iterations=3000, # 1000
    depth=8, # 6
    learning_rate=0.03, # 0.3
    loss_function="RMSE",
    cat_features=cat_features_cb_radhus,
    verbose=100,
    bagging_temperature=5, # 3
    l2_leaf_reg=5) # 10

# Träna modellen
model_cb_radhus.fit(X_cb_radhus_train, y_cb_radhus_train, eval_set=(X_cb_radhus_test, y_cb_radhus_test), use_best_model=True)

# Utvärdera
r2_train_cb_radhus = model_cb_radhus.score(X_cb_radhus_train, y_cb_radhus_train)
r2_test_cb_radhus = model_cb_radhus.score(X_cb_radhus_test, y_cb_radhus_test)

# Prediktion
y_pred_train_cb_radhus = model_cb_radhus.predict(X_cb_radhus_train)
y_pred_test_cb_radhus = model_cb_radhus.predict(X_cb_radhus_test)

# RMSE
rmse_train_cb_radhus = np.sqrt(mean_squared_error(y_cb_radhus_train, y_pred_train_cb_radhus))
rmse_test_cb_radhus = np.sqrt(mean_squared_error(y_cb_radhus_test, y_pred_test_cb_radhus))

logging.info(f"CatBoost tränad på radhus-data. "
    f"R² train: {r2_train_cb_radhus:.3f}, test: {r2_test_cb_radhus:.3f}, "
    f"RMSE train: {rmse_train_cb_radhus:.0f}, test: {rmse_test_cb_radhus:.0f}")

# Feature importance
feature_importances_cb_radhus = model_cb_radhus.get_feature_importance()
feature_names_cb_radhus = np.array(X_cb_radhus.columns)

# Sortera efter betydelse
sorted_idx_cb_radhus = feature_importances_cb_radhus.argsort()

# Logga topp 5 features
top_n_cb_radhus = 5
top_idx_cb_radhus = feature_importances_cb_radhus.argsort()[::-1][:top_n_cb_radhus]  # index på topp N
top_features_cb_radhus = [(feature_names_cb_radhus[i], feature_importances_cb_radhus[i]) for i in top_idx_cb_radhus]

logging.info("Topp 5 viktigaste features för pris:")
for name, importance in top_features_cb_radhus:
    logging.info(f"- {name}: {importance:.2f}")

# Spara modellen
joblib.dump(model_cb_radhus, "radhus_cb.pkl")

logging.info("Modellen är sparad som 'radhus_cb.pkl'")

0:	learn: 1484920.2218664	test: 1497017.5020515	best: 1497017.5020515 (0)	total: 48.3ms	remaining: 2m 24s
100:	learn: 876211.1272137	test: 834671.8316998	best: 834671.8316998 (100)	total: 5.3s	remaining: 2m 32s
200:	learn: 832789.4593476	test: 793420.7414320	best: 793420.7414320 (200)	total: 10.5s	remaining: 2m 25s
300:	learn: 811512.7475461	test: 777433.6129536	best: 777433.6129536 (300)	total: 17.8s	remaining: 2m 39s
400:	learn: 795449.6611924	test: 768163.8024804	best: 768163.8024804 (400)	total: 24.6s	remaining: 2m 39s
500:	learn: 780118.3026005	test: 761983.4026823	best: 761983.4026823 (500)	total: 31.7s	remaining: 2m 38s
600:	learn: 765636.2676380	test: 757078.7407562	best: 757078.7407562 (600)	total: 39.1s	remaining: 2m 36s
700:	learn: 752434.5159236	test: 753123.1800517	best: 753123.1800517 (700)	total: 46.5s	remaining: 2m 32s
800:	learn: 739384.7608354	test: 749885.4107893	best: 749885.4107893 (800)	total: 54.5s	remaining: 2m 29s
900:	learn: 726356.6626614	test: 746201.6595875

In [None]:
# Randomized search för catboost radhus
from sklearn.metrics import r2_score, mean_squared_error

# Identifiera kategoriska kolumner för radhus
cat_features_cb_radhus = X_cb_radhus.select_dtypes(include=["object", "category"]).columns.tolist()
logging.info(f"Kategoriska variabler för radhus: {cat_features_cb_radhus}")

# Definiera parameterutrymme för CatBoost
param_dist_cb_radhus = {
    "depth": [6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "iterations": [1000, 2000, 3000],
    "l2_leaf_reg": [1, 3, 5, 10],
    "bagging_temperature": [0.5, 1, 3, 5]}

# Basmodell
cb_base_radhus = CatBoostRegressor(
    loss_function="RMSE",
    cat_features=cat_features_cb_radhus,
    verbose=0,
    random_state=11)

# Randomized Search
random_search_cb_radhus = RandomizedSearchCV(
    estimator=cb_base_radhus,
    param_distributions=param_dist_cb_radhus,
    n_iter=20,   # slumpade kombinationer
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=11)

logging.info("Startar RandomizedSearchCV för CatBoost (radhus)...")

# Kör sökning
random_search_cb_radhus.fit(X_cb_radhus_train, y_cb_radhus_train)

# Bästa parametrar
best_params_cb_radhus = random_search_cb_radhus.best_params_
best_score_cb_radhus = random_search_cb_radhus.best_score_

logging.info(f"Bästa CatBoost-parametrar (radhus): {best_params_cb_radhus}")
logging.info(f"Bästa CV-score (RMSE): {best_score_cb_radhus:.4f}")

# Träna om bästa modellen på hela träningsdatan
best_cb_model_radhus = random_search_cb_radhus.best_estimator_
best_cb_model_radhus.fit(
    X_cb_radhus_train, y_cb_radhus_train,
    eval_set=(X_cb_radhus_test, y_cb_radhus_test),
    use_best_model=True)

# Utvärdera på test
y_test_pred_cb_radhus = best_cb_model_radhus.predict(X_cb_radhus_test)
r2_test_cb_radhus = r2_score(y_cb_radhus_test, y_test_pred_cb_radhus)
rmse_test_cb_radhus = np.sqrt(mean_squared_error(y_cb_radhus_test, y_test_pred_cb_radhus))

logging.info(f"Bästa CatBoost (radhus) R² test: {r2_test_cb_radhus:.3f}")
logging.info(f"Bästa CatBoost (radhus) RMSE test: {rmse_test_cb_radhus:,.0f} kr")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [34]:
# LGBM model för radhus med randomized searchade parametrar
from sklearn.metrics import r2_score, mean_squared_error

# Filtrera ut extremvärden
lower_lgbm_radhus = dataframes["radhus"]["Pris"].quantile(0.05)
upper_lgbm_radhus = dataframes["radhus"]["Pris"].quantile(0.95)

df_filtered_lgbm_radhus = dataframes["radhus"][
    (dataframes["radhus"]["Pris"] > lower_lgbm_radhus) & (dataframes["radhus"]["Pris"] < upper_lgbm_radhus)].copy()

# Features & target
target_lgbm_radhus = "Pris"
drop_cols_lgbm_radhus = [col for col in ["Datum", "Bostadstyp", "Nyckel", "Totalarea"] if col in df_filtered_lgbm_radhus.columns]

X_lgbm_radhus = df_filtered_lgbm_radhus.drop(columns=[target_lgbm_radhus] + drop_cols_lgbm_radhus)
y_lgbm_radhus = df_filtered_lgbm_radhus[target_lgbm_radhus]

# Log-transformera target_lgbm_radhus
y_lgbm_radhus_log = np.log1p(y_lgbm_radhus)


# Gör kategoriska kolumner rätt
# Konvertera textkolumner till "category" innan split
for col in X_lgbm_radhus.select_dtypes(include="object").columns:
    X_lgbm_radhus[col] = X_lgbm_radhus[col].astype("category")

# Train/test split
X_lgbm_radhus_train, X_lgbm_radhus_test, y_lgbm_radhus_train, y_lgbm_radhus_test = train_test_split(
    X_lgbm_radhus, y_lgbm_radhus_log, test_size=0.2, random_state=11)

# Lista index för kategoriska variabler
cat_features_lgbm_radhus = [X_lgbm_radhus_train.columns.get_loc(col) for col in X_lgbm_radhus_train.select_dtypes(include="category").columns]

logging.info(f"Kategoriska features: {[X_lgbm_radhus_train.columns[i] for i in cat_features_lgbm_radhus]}")


# LightGBM modell
lgbm_model_radhus = LGBMRegressor(
    n_estimators=500, # 1000
    max_depth=10, # 12
    learning_rate=0.03, # 0,01
    subsample=1.0,
    colsample_bytree=1, # 0,6
    reg_lambda=0, # 10
    random_state=11,
    n_jobs=-1)

lgbm_model_radhus.fit(
    X_lgbm_radhus_train, y_lgbm_radhus_train,
    eval_set=[(X_lgbm_radhus_test, y_lgbm_radhus_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_radhus,
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)])

# Utvärdering
# Prediktion på log-skalan
y_lgbm_radhus_train_pred_log = lgbm_model_radhus.predict(X_lgbm_radhus_train)
y_lgbm_radhus_test_pred_log = lgbm_model_radhus.predict(X_lgbm_radhus_test)

# R² på log-skala
r2_train_lgbm_radhus_log_lgbm_radhus = r2_score(y_lgbm_radhus_train, y_lgbm_radhus_train_pred_log)
r2_test_lgbm_radhus_log_lgbm_radhus = r2_score(y_lgbm_radhus_test, y_lgbm_radhus_test_pred_log)

logging.info(f"Log-skala R² -> train: {r2_train_lgbm_radhus_log_lgbm_radhus:.3f}, test: {r2_test_lgbm_radhus_log_lgbm_radhus:.3f}")

# Back-transform till SEK
y_lgbm_radhus_train_pred = np.expm1(y_lgbm_radhus_train_pred_log)
y_lgbm_radhus_test_pred = np.expm1(y_lgbm_radhus_test_pred_log)

y_lgbm_radhus_train_true = np.expm1(y_lgbm_radhus_train)
y_lgbm_radhus_test_true = np.expm1(y_lgbm_radhus_test)

# R² och RMSE i SEK
r2_train_lgbm_radhus = r2_score(y_lgbm_radhus_train_true, y_lgbm_radhus_train_pred)
r2_test_lgbm_radhus = r2_score(y_lgbm_radhus_test_true, y_lgbm_radhus_test_pred)

rmse_train_lgbm_radhus = np.sqrt(mean_squared_error(y_lgbm_radhus_train_true, y_lgbm_radhus_train_pred))
rmse_test_lgbm_radhus = np.sqrt(mean_squared_error(y_lgbm_radhus_test_true, y_lgbm_radhus_test_pred))

logging.info(f"LGBM tränad på radhus-data R² train: {r2_train_lgbm_radhus:.3f}, test: {r2_test_lgbm_radhus:.3f}")
logging.info(f"LGBM tränad på radhus-data RMSE train: {rmse_train_lgbm_radhus:,.0f}, test: {rmse_test_lgbm_radhus:,.0f} kr")

# Spara modellen
joblib.dump((lgbm_model_radhus, X_lgbm_radhus.columns), "radhus_lgbm.pkl")

logging.info("Modellen är sparad som 'radhus_lgbm.pkl'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3204
[LightGBM] [Info] Number of data points in the train set: 11822, number of used features: 7
[LightGBM] [Info] Start training from score 15.100396
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.230208	valid_0's l2: 0.0529957
[200]	valid_0's rmse: 0.224041	valid_0's l2: 0.0501945
[300]	valid_0's rmse: 0.222703	valid_0's l2: 0.0495965
[400]	valid_0's rmse: 0.222447	valid_0's l2: 0.0494828
[500]	valid_0's rmse: 0.222013	valid_0's l2: 0.0492899
Did not meet early stopping. Best iteration is:
[498]	valid_0's rmse: 0.222002	valid_0's l2: 0.0492848


In [None]:
# Randomized search LGBM radhus
from sklearn.metrics import r2_score, mean_squared_error

# Definiera parameterutrymme
param_dist = {
    "n_estimators": [500, 1000, 2000, 3000],
    "max_depth": [6, 8, 10, 12, -1],   # -1 = ingen begränsning
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [0, 1, 5, 10]}

# Basmodell
lgbm_base_radhus = LGBMRegressor(random_state=11, n_jobs=-1)

# Randomized Search
random_search_lgbm_radhus = RandomizedSearchCV(
    estimator=lgbm_base_radhus,
    param_distributions=param_dist,
    n_iter=30,   # antal kombinationer att testa
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=11)

logging.info("Startar RandomizedSearchCV för LGBM (radhus)...")

# Kör sökning
random_search_lgbm_radhus.fit(
    X_lgbm_radhus_train, y_lgbm_radhus_train,
    categorical_feature=cat_features_lgbm_radhus)

# Bästa parametrar
best_params_radhus = random_search_lgbm_radhus.best_params_
best_score_radhus = random_search_lgbm_radhus.best_score_

logging.info(f"Bästa LGBM-parametrar (radhus): {best_params_radhus}")
logging.info(f"Bästa CV-score (RMSE, log-skala): {best_score_radhus:.4f}")

# Träna om bästa modellen på hela träningsdatan
best_lgbm_radhus = random_search_lgbm_radhus.best_estimator_
best_lgbm_radhus.fit(
    X_lgbm_radhus_train, y_lgbm_radhus_train,
    eval_set=[(X_lgbm_radhus_test, y_lgbm_radhus_test)],
    eval_metric="rmse",
    categorical_feature=cat_features_lgbm_radhus,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)])

# Utvärdera på test
y_test_pred_log = best_lgbm_radhus.predict(X_lgbm_radhus_test)
y_test_pred = np.expm1(y_test_pred_log)
y_test_true = np.expm1(y_lgbm_radhus_test)

r2_test = r2_score(y_test_true, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test_true, y_test_pred))

logging.info(f"Bästa LGBM R² test (radhus): {r2_test:.3f}")
logging.info(f"Bästa LGBM RMSE test (radhus): {rmse_test:,.0f} kr")

# Spara modellen
joblib.dump((best_lgbm_radhus, X_lgbm_radhus.columns), "radhus_lgbm_best.pkl")
logging.info("Bästa modellen är sparad som 'radhus_lgbm_best.pkl'")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3204
[LightGBM] [Info] Number of data points in the train set: 11822, number of used features: 7
[LightGBM] [Info] Start training from score 15.100396
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3204
[LightGBM] [Info] Number of data points in the train set: 11822, number of used features: 7
[LightGBM] [Info] Start training from score 15.100396
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.230208	valid_0's l2: 0.0529957
[200]	valid_0's rmse: 0.224041	valid_0's l2: 0.0501945
[300]	valid_0's rms