In [52]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = pd.read_csv("preprocessed_apartment_rentals_clusters.csv")
# Beispiel-Daten (ersetze X, y durch deine Daten)
features = data.drop(columns=["price", "cityname", "state"]).columns
X, y = data.drop(columns=["price", "cityname", "state"]).values, data["price"].values
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Regression

In [42]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


# Nested CV Parameter
outer_cv = KFold(n_splits=5, shuffle=True, random_state=0)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Hyperparameter-Suche im inneren CV
param_grid = {"alpha": np.logspace(-3, 3, 20)}
model = Ridge()

grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=inner_cv,
    scoring="neg_mean_squared_error",
    n_jobs=1
)

outer_mse = []
outer_r2 = []
best_params_per_fold = []

for train_ix, test_ix in outer_cv.split(X):
    X_train, X_test = X[train_ix], X[test_ix]
    y_train, y_test = y[train_ix], y[test_ix]
    
    # Inner CV + Fit
    grid_search.fit(X_train, y_train)
    
    # Bestes Modell bewerten
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    outer_mse.append(mse)
    outer_r2.append(r2)
    best_params_per_fold.append(grid_search.best_params_)
print("Outer MSE Scores:", outer_mse)
print("Mean MSE:", np.mean(outer_mse))
print("Std MSE:", np.std(outer_mse))
print("Outer R² Scores:", outer_r2)
print("Mean R²:", np.mean(outer_r2))
print("Best params", best_params_per_fold)



Outer MSE Scores: [218490.50167903118, 214575.06273971553, 215577.7280298093, 220096.13097939227, 215098.6776672528]
Mean MSE: 216767.6202190402
Std MSE: 2147.3623527603345
Outer R² Scores: [0.538220142353645, 0.5391803458701592, 0.534067536249452, 0.5350063530970879, 0.5280598269507408]
Mean R²: 0.534906840904217
Best params [{'alpha': np.float64(26.366508987303554)}, {'alpha': np.float64(26.366508987303554)}, {'alpha': np.float64(26.366508987303554)}, {'alpha': np.float64(12.742749857031322)}, {'alpha': np.float64(26.366508987303554)}]


In [None]:
ridge = Ridge(alpha=26.0)  # Standard Alpha, kann angepasst werden

# 5️⃣ Modell auf allen Daten fitten
ridge.fit(X, y)
y_pred = ridge.predict(X)
print(r2_score(y, y_pred))


importance = pd.Series(ridge.coef_, index= features)
importance = importance.sort_values(key=abs, ascending=False)
print("\nFeature Importances (absolut sortiert):")
print(importance)

0.5358008953007783

Feature Importances (absolut sortiert):
square_feet             215.444125
latitude                171.893677
state_CA                164.990874
cluster_12              138.563577
cluster_2              -136.251797
                           ...    
state_MS                 -3.776731
source_Andere source      2.530247
state_TN                  0.945672
cluster_14                0.847723
sauna                     0.000000
Length: 90, dtype: float64
