In [28]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [30]:
df.describe(include='object')

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


In [31]:
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [32]:
df['MSZoning'].value_counts()

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64

In [33]:
missing = df.isna().sum().sort_values(ascending=False)
missing[missing > 0]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [34]:
# Missing-Value Treatment

none_fill = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageQual', 'GarageFinish', 'GarageType', 'GarageCond',
    'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1'
]

median_fill = [
    'LotFrontage', 'MasVnrArea'
]

modus_fill = [
    'MasVnrType', 'Electrical'
]

zero_fill = [
    'GarageYrBlt'
]

for col in none_fill:
    df[col] = df[col].fillna("None")

for col in median_fill:
    df[col] = df[col].fillna(df[col].median())

for col in modus_fill:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in zero_fill:
    df[col] = df[col].fillna(0)

df.isna().sum().sum()
df.isna().sum().sort_values(ascending=False).head(10)

Id             0
MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
Alley          0
LotShape       0
LandContour    0
Utilities      0
dtype: int64

In [None]:
# Feature Engineering

df['HouseAge'] = df['YrSold'] - df['YearBuilt']
df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['TotalBath'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath'])
df['TotalPorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch'] + df['WoodDeckSF']


In [36]:
# Ordinal Encoding

quality_mapping_ExGdTAFaPoNone = {
    # ExterQual, ExterCond, BsmtQual, BsmtCond, HeatingQC, KitchenQual, FireplaceQu, GarageQual, GarageCond, PoolQC
    'Ex': 5,        # Excellent
    'Gd': 4,        # Good                             
    'TA': 3,        # Typical/Average
    'Fa': 2,        # Fair
    'Po': 1,        # Poor
    'None': 0       # None
}

quality_mapping_GdAvMnNoNone = {
    # BsmtExposure
    'Gd': 4,        # Good Exposure
    'Av': 3,        # Average Exposure
    'Mn': 2,        # Minimum Exposure
    'No': 1,        # No Exposure
    'None': 0       # None
}

quality_mapping_GLQALQBLQRecLwQUnfNone = {
    # BsmtFinType1, BsmtFinType2
    'GLQ': 6,      # Good Living Quarters
    'ALQ': 5,      # Average Living Quarters
    'BLQ': 4,      # Below Average Living Quarters
    'Rec': 3,      # Average Rec Room
    'LwQ': 2,      # Low Quality
    'Unf': 1,      # Unfinshed
    'None': 0      # None
}
quality_mapping_TypMin1Min2ModMaj1Maj2SevSal = {
    # Functional
    'Typ': 7,       # Typical Functionality
    'Min1': 6,      # Minor Deductions 1
    'Min2': 5,      # Minor Deductions 2
    'Mod': 4,       # Moderate Deductions
    'Maj1': 3,      # Major Deductions 1
    'Maj2': 2,      # Major Deductions 2
    'Sev': 1,       # Severely Damaged
    'Sal': 0        # Salvage Only
}

quality_mapping_FinRFnUnfNone = {
    # GarageFinish
    'Fin': 3,       # Finished
    'RFn': 2,       # Rough Finished
    'Unf': 1,       # Unfinished
    'None': 0       # None
}

quality_mapping_YPN = {
    # PavedDrive
    'Y': 2,        # Paved
    'P': 1,        # Partially Paved
    'N': 0         # Dirt/Gravel
}


# Columns to encode

cols_ExGdTAFaPoNone = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual',
    'FireplaceQu',
    'GarageQual', 'GarageCond',
    'PoolQC'
]

cols_GdAvMnNoNone = ['BsmtExposure']

cols_GLQALQBLQRecLwQUnfNone = ['BsmtFinType1', 'BsmtFinType2']

cols_TypMin1Min2ModMaj1Maj2SevSal = ['Functional']

cols_FinRFnUnfNone = ['GarageFinish']

cols_YPN = ['PavedDrive']


# Mapping

for col in cols_ExGdTAFaPoNone:
    df[col] = df[col].map(quality_mapping_ExGdTAFaPoNone)

for col in cols_GdAvMnNoNone:
    df[col] = df[col].map(quality_mapping_GdAvMnNoNone)

for col in cols_GLQALQBLQRecLwQUnfNone:
    df[col] = df[col].map(quality_mapping_GLQALQBLQRecLwQUnfNone)

for col in cols_TypMin1Min2ModMaj1Maj2SevSal:
    df[col] = df[col].map(quality_mapping_TypMin1Min2ModMaj1Maj2SevSal)

for col in cols_FinRFnUnfNone:
    df[col] = df[col].map(quality_mapping_FinRFnUnfNone)

for col in cols_YPN:
    df[col] = df[col].map(quality_mapping_YPN)


df[cols_ExGdTAFaPoNone].head()
df[cols_ExGdTAFaPoNone].dtypes


ExterQual      int64
ExterCond      int64
BsmtQual       int64
BsmtCond       int64
HeatingQC      int64
KitchenQual    int64
FireplaceQu    int64
GarageQual     int64
GarageCond     int64
PoolQC         int64
dtype: object

In [37]:
df.dtypes.value_counts()

df.select_dtypes(include='object').columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
       'Electrical', 'GarageType', 'Fence', 'MiscFeature', 'SaleType',
       'SaleCondition'],
      dtype='object')

In [38]:
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

numeric_features, categorical_features

(Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
        'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
        'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
        'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
        'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
        'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscVal', 'MoSold', 'YrSold',
        'SalePrice', 'HouseAge', 'RemodAge', 'TotalSF', 'TotalBath',
        'TotalPorchSF'],
       dtype='object'),
 Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',  # Name des Blocks
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_features
        )
    ],
    remainder='passthrough'  # alle numerischen Spalten einfach so weiterreichen
)

In [40]:
# Splitting

target = "SalePrice"

X = df.drop(columns=[target])
Y = df[target]

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

X.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotFrontage     float64
LotArea           int64
                 ...   
HouseAge          int64
RemodAge          int64
TotalSF           int64
TotalBath       float64
TotalPorchSF      int64
Length: 85, dtype: object

In [41]:
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

problem_cols = []

for col in categorical_features:
    types_in_col = {type(v) for v in df[col].dropna().unique()}
    if len(types_in_col) > 1:
        print(col, "→", types_in_col)
        problem_cols.append(col)

problem_cols

[]

In [42]:
# Preprocessing

# Preprocessor an den Trainingsdaten fitten
preprocessor.fit(X_train)

# Trainings- und Testdaten transformieren
X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

X_train_prep.shape, X_test_prep.shape

X_train_prep[:5]

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 2.628e+03, 2.000e+00,
        2.500e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.370e+03, 2.500e+00,
        4.000e+01],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.592e+03, 1.000e+00,
        4.920e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.499e+03, 2.500e+00,
        2.640e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.717e+03, 2.000e+00,
        2.420e+02]], shape=(5, 238))

In [43]:
# 1. OneHotEncoder aus dem ColumnTransformer holen
ohe = preprocessor.named_transformers_["cat"]

# 2. Namen der OHE-Spalten bestimmen
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# 3. Numerische Features sind alle, die NICHT in categorical_features sind
numeric_features = [col for col in X_train.columns if col not in categorical_features]

# 4. Gesamt-Reihenfolge im Output:
#    erst alle OHE-Spalten, dann alle numerischen
all_feature_names = list(cat_feature_names) + numeric_features

# 5. DataFrame aus dem vorbereiteten Array bauen
X_train_prep_df = pd.DataFrame(
    X_train_prep,
    columns=all_feature_names,
    index=X_train.index,
)

X_train_prep_df.head()


Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,...,PoolArea,PoolQC,MiscVal,MoSold,YrSold,HouseAge,RemodAge,TotalSF,TotalBath,TotalPorchSF
254,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,6.0,2010.0,53.0,53.0,2628.0,2.0,250.0
1066,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,5.0,2009.0,16.0,15.0,2370.0,2.5,40.0
638,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,5.0,2008.0,98.0,58.0,1592.0,1.0,492.0
799,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,6.0,2007.0,70.0,57.0,2499.0,2.5,264.0
380,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,5.0,2010.0,86.0,60.0,2717.0,2.0,242.0


In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

linreg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", LinearRegression())
    ]
)

linreg_model.fit(X_train, Y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [45]:
from sklearn.metrics import r2_score, mean_squared_error

# Vorhersagen auf dem Test-Set
Y_pred = linreg_model.predict(X_test)

# Gütemaße berechnen
r2 = r2_score(Y_test, Y_pred)
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

print("R²:", r2)
print("RMSE:", rmse)


R²: 0.8770157273112319
RMSE: 30713.669430344944


In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

rf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", RandomForestRegressor(
            n_estimators=300,
            random_state=42,
            n_jobs=-1
        ))
    ]
)

rf_model.fit(X_train, Y_train)


0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
from sklearn.metrics import r2_score, mean_squared_error

Y_pred_rf = rf_model.predict(X_test)

r2_rf = r2_score(Y_test, Y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(Y_test, Y_pred_rf))

print("R² (RandomForest):", r2_rf)
print("RMSE (RandomForest):", rmse_rf)


R² (RandomForest): 0.8890403243062452
RMSE (RandomForest): 29173.565233183133


In [48]:
from sklearn.metrics import r2_score, mean_squared_error

Y_pred_rf_train = rf_model.predict(X_train)

r2_rf_train = r2_score(Y_train, Y_pred_rf_train)
rmse_rf_train = np.sqrt(mean_squared_error(Y_train, Y_pred_rf_train))

print("R² (RF – Train):", r2_rf_train)
print("RMSE (RF – Train):", rmse_rf_train)


R² (RF – Train): 0.9802772257809867
RMSE (RF – Train): 10846.081341117795


In [49]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

rf_small = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", RandomForestRegressor(
            n_estimators=300,
            max_depth=10,        # Bäume gedeckelt
            min_samples_leaf=5,  # Blattknoten brauchen mind. 5 Samples
            random_state=42,
            n_jobs=-1
        ))
    ]
)

rf_small.fit(X_train, Y_train)


0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [50]:
from sklearn.metrics import r2_score, mean_squared_error

Y_pred_rf_small = rf_small.predict(X_test)

r2_rf_small = r2_score(Y_test, Y_pred_rf_small)
rmse_rf_small = np.sqrt(mean_squared_error(Y_test, Y_pred_rf_small))

print("R² (RF klein – Test):", r2_rf_small)
print("RMSE (RF klein – Test):", rmse_rf_small)


R² (RF klein – Test): 0.8664851116838035
RMSE (RF klein – Test): 32001.60538006467


In [51]:
from sklearn.metrics import r2_score, mean_squared_error

Y_pred_rf_small_train = rf_small.predict(X_train)

r2_rf_small_train = r2_score(Y_train, Y_pred_rf_small_train)
rmse_rf_small_train = np.sqrt(mean_squared_error(Y_train, Y_pred_rf_small_train))

print("R² (RF klein – Train):", r2_rf_small_train)
print("RMSE (RF klein – Train):", rmse_rf_small_train)


R² (RF klein – Train): 0.9360463142428435
RMSE (RF klein – Train): 19530.872501518486


In [52]:
# 1. Regressor und OHE holen
rf = rf_model.named_steps["regressor"]
ohe = preprocessor.named_transformers_["cat"]

# 2. Feature-Namen rekonstruieren
cat_feature_names = ohe.get_feature_names_out(categorical_features)
numeric_features = [col for col in X_train.columns if col not in categorical_features]
all_feature_names = list(cat_feature_names) + numeric_features

# 3. Wichtigkeiten holen und sortieren
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# 4. Top 15 Features ausgeben
top_n = 15
for i in range(top_n):
    idx = indices[i]
    print(f"{i+1:2d}. {all_feature_names[idx]}: {importances[idx]:.4f}")


 1. TotalSF: 0.3988
 2. OverallQual: 0.3502
 3. 2ndFlrSF: 0.0249
 4. KitchenQual: 0.0125
 5. YearBuilt: 0.0125
 6. GrLivArea: 0.0108
 7. BsmtQual: 0.0108
 8. LotArea: 0.0107
 9. BsmtFinSF1: 0.0100
10. HouseAge: 0.0099
11. GarageCars: 0.0097
12. TotalBath: 0.0097
13. LotFrontage: 0.0094
14. GarageArea: 0.0076
15. BsmtUnfSF: 0.0063


In [53]:
from sklearn.model_selection import cross_val_score

# 5-fache Cross-Validation mit negativem RMSE (sklearn gibt negative Werte aus)
cv_scores = cross_val_score(
    rf_model,
    X_train,
    Y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

rmse_scores = -cv_scores  # Vorzeichen umdrehen
print("CV-RMSE pro Fold:", rmse_scores)
print("Durchschnitt:", rmse_scores.mean())
print("Std-Abweichung:", rmse_scores.std())


CV-RMSE pro Fold: [28308.87195881 34787.19072238 34556.55319811 24612.92482603
 25201.42492675]
Durchschnitt: 29493.393126414878
Std-Abweichung: 4411.415535926202


In [54]:
from sklearn.model_selection import cross_val_score

cv_scores_lin = cross_val_score(
    linreg_model,
    X_train,
    Y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

rmse_scores_lin = -cv_scores_lin
print("CV-RMSE (LinearRegression) pro Fold:", rmse_scores_lin)
print("Durchschnitt:", rmse_scores_lin.mean())
print("Std-Abweichung:", rmse_scores_lin.std())


CV-RMSE (LinearRegression) pro Fold: [32219.24417906 38112.06407757 52168.42978811 36825.80303639
 30863.91064396]
Durchschnitt: 38037.89034501786
Std-Abweichung: 7569.2724920572555


In [55]:
Y_log = np.log1p(df["SalePrice"])

Y_train_log = Y_log.loc[X_train.index]
Y_test_log = Y_log.loc[X_test.index]

Y_train_log.head(), Y_test_log.head()

(254     11.884496
 1066    12.089544
 638     11.350418
 799     12.072547
 380     11.751950
 Name: SalePrice, dtype: float64,
 892     11.947956
 1105    12.691584
 413     11.652696
 522     11.976666
 1036    12.661917
 Name: SalePrice, dtype: float64)

In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

linreg_log_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", LinearRegression())
    ]
)

linreg_log_model.fit(X_train, Y_train_log)


0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [57]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Vorhersage in Log-Skala
Z_pred_lin = linreg_log_model.predict(X_test)

# Zurücktransformieren in Original-Skala
Y_pred_lin = np.expm1(Z_pred_lin)

r2_lin_log = r2_score(Y_test, Y_pred_lin)
rmse_lin_log = np.sqrt(mean_squared_error(Y_test, Y_pred_lin))

print("LinearRegression mit log-Target:")
print("R²:", r2_lin_log)
print("RMSE:", rmse_lin_log)


LinearRegression mit log-Target:
R²: 0.9322138662982564
RMSE: 22802.2314218669


In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

rf_log_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", RandomForestRegressor(
            n_estimators=300,
            random_state=42,
            n_jobs=-1
        ))
    ]
)

rf_log_model.fit(X_train, Y_train_log)


0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [59]:
# Vorhersage in Log-Skala
Z_pred_rf = rf_log_model.predict(X_test)

# Zurück auf Original-Skala
Y_pred_rf = np.expm1(Z_pred_rf)

r2_rf_log = r2_score(Y_test, Y_pred_rf)
rmse_rf_log = np.sqrt(mean_squared_error(Y_test, Y_pred_rf))

print("RandomForest mit log-Target:")
print("R²:", r2_rf_log)
print("RMSE:", rmse_rf_log)


RandomForest mit log-Target:
R²: 0.8793116365721112
RMSE: 30425.63256373578


In [62]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score

linreg_log_ttr = TransformedTargetRegressor(
    regressor=linreg_model,  # deine bestehende Pipeline: preprocess + LinearRegression
    func=np.log1p,
    inverse_func=np.expm1,
)

cv_scores_lin_log_ttr = cross_val_score(
    linreg_log_ttr,
    X_train,
    Y_train,  # Achtung: Original-Ziel in Euro
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

rmse_scores_lin_log_ttr = -cv_scores_lin_log_ttr
print("CV-RMSE (Linear + log-Target via TTR, in Euro) pro Fold:", rmse_scores_lin_log_ttr)
print("Durchschnitt:", rmse_scores_lin_log_ttr.mean())
print("Std-Abweichung:", rmse_scores_lin_log_ttr.std())

CV-RMSE (Linear + log-Target via TTR, in Euro) pro Fold: [ 22133.89546155  55178.98043369 117288.47685819  29208.52906083
  21407.25676677]
Durchschnitt: 49043.42771620773
Std-Abweichung: 36266.612235278975


In [63]:
from sklearn.metrics import mean_squared_error

linreg_log_ttr.fit(X_train, Y_train)
Y_pred_ttr = linreg_log_ttr.predict(X_test)

rmse_lin_log_ttr_test = np.sqrt(mean_squared_error(Y_test, Y_pred_ttr))
rmse_lin_log_ttr_test


np.float64(22802.2314218669)

In [64]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score

gbr_base = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", HistGradientBoostingRegressor(
            max_depth=6,
            learning_rate=0.1,
            max_iter=300,
            random_state=42
        ))
    ]
)

gbr_log_ttr = TransformedTargetRegressor(
    regressor=gbr_base,
    func=np.log1p,
    inverse_func=np.expm1,
)

cv_scores_gbr_log = cross_val_score(
    gbr_log_ttr,
    X_train,
    Y_train,  # Original-Skala
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

rmse_scores_gbr_log = -cv_scores_gbr_log
print("CV-RMSE (HistGBR + log-Target, in Euro) pro Fold:", rmse_scores_gbr_log)
print("Durchschnitt:", rmse_scores_gbr_log.mean())
print("Std-Abweichung:", rmse_scores_gbr_log.std())


CV-RMSE (HistGBR + log-Target, in Euro) pro Fold: [30457.99095221 26012.37996372 32074.99532025 25626.50786545
 22448.35869042]
Durchschnitt: 27324.046558410184
Std-Abweichung: 3486.1887687521353
