In [4]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Lasso, LassoCV

from utils.data import load_raw_data, load_and_clean_data, get_train_test_split
from utils.evaluation import evaluate_model, compare_models, add_result, reset_results
from utils.plotting import plot_predicted_vs_actual, plot_correlation_heatmap

# Notebook-wide plotting default: render matplotlib figures at 100 DPI.
plt.rcParams['figure.dpi'] = 100

# KI1-Projekt 308: California Housing — Übersicht

**Thema:** Neuronales Netz für den kalifornischen Hauspreis-Datensatz  
**Gruppennummer:** 308  
**Deadline:** 15.04.2026

## Projektstruktur

| Notebook | Inhalt | Verantwortlich |
|----------|--------|---------------|
| `01_EDA.ipynb` | Explorative Datenanalyse | Alle |
| `02_Baseline_Lineare_Regression.ipynb` | Referenzmodell | Alle |
| `03_LASSO_Ridge.ipynb` | Regularisierte Regression, Feature-Selektion | P1 |
| `04_Decision_Tree.ipynb` | Decision Tree + Pruning | P2 |
| `05_Ensemble.ipynb` | Random Forest, Gradient Boosting | P3 |
| `06_kNN_Regression.ipynb` | k-Nearest Neighbors Regression | P4 |
| `07_Neural_Network.ipynb` | NN (Kernaufgabe) + Vergleich mit LR | P5 |

## Gemeinsame Module (`utils/`)
- `data.py` — Datenladen, Cleaning, Train/Test-Split (fester random_state=42)
- `plotting.py` — Einheitliche Visualisierungen
- `evaluation.py` — R², MAE, RMSE für alle Modelle

In [5]:
# Smoke test: load the cleaned dataset, report its dimensions, list the feature
# columns (every column except the last) and the target column (the last one),
# then display summary statistics as the cell's rich output.
df = load_and_clean_data()
print(
    f"Bereinigter Datensatz: {df.shape[0]} Zeilen, {df.shape[1]} Spalten",
    f"\nFeatures: {list(df.columns[:-1])}",
    f"Zielvariable: {df.columns[-1]}",
    sep="\n",
)
df.describe()

Bereinigter Datensatz: 17386 Zeilen, 9 Spalten

Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Zielvariable: MedHouseVal


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,17386.0,17386.0,17386.0,17386.0,17386.0,17386.0,17386.0,17386.0,17386.0
mean,3.689281,27.460198,5.194951,1.057399,1371.727654,2.911058,35.601238,-119.5227,1.906282
std,1.547033,11.30111,1.102728,0.094509,801.373546,0.6602,2.142371,1.977985,0.959037
min,0.4999,1.0,0.846154,0.333333,3.0,0.75,32.54,-124.3,0.14999
25%,2.5388,18.0,4.426064,1.003733,817.0,2.463768,33.92,-121.62,1.156
50%,3.475,28.0,5.180993,1.045941,1194.0,2.845381,34.24,-118.47,1.734
75%,4.61855,36.0,5.933653,1.094453,1731.0,3.286999,37.67,-117.99,2.451
max,13.1477,51.0,8.335052,1.603217,4684.0,4.875,41.95,-114.55,5.0


In [6]:
# Baseline: linear regression
# Reset the shared results store before adding this model's entry.
# NOTE(review): presumably this empties the collection that add_result() fills,
# so re-running the cell does not duplicate rows — confirm in utils/evaluation.py.
reset_results()
# Split the cleaned frame into train/test partitions; the project overview states
# that utils/data.py uses a fixed random_state=42 for this split.
X_train, X_test, y_train, y_test, feature_names = get_train_test_split(df)

# Fit scikit-learn's LinearRegression with default settings as the reference model.
lr = LinearRegression()
lr.fit(X_train, y_train)
# Evaluate on both partitions under the given display label; the printed report
# shows R², MAE and RMSE for train and test.
result_lr = evaluate_model(lr, X_train, X_test, y_train, y_test, "Lineare Regression (Baseline)")
add_result(result_lr)

# Comparison table (will be extended with further models)
compare_models()


  Lineare Regression (Baseline)
  R² Score:  Train = 0.6465  |  Test = 0.6326
  MAE:       Train = 0.4252  |  Test = 0.4341
  RMSE:      Train = 0.5710  |  Test = 0.5779


Unnamed: 0_level_0,R² Train,R² Test,MAE Train,MAE Test,RMSE Train,RMSE Test
Modell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lineare Regression (Baseline),0.646518,0.632569,0.425182,0.434125,0.571015,0.57786
