In [21]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
data = pd.read_csv('C:/Users/anton/Downloads/kunskapskontroll_ai2_del1/kunskapskontroll_ai2_del1/housing.csv')
data = data.dropna()

In [3]:
numerical_features = data.select_dtypes(include=['float64']).columns.tolist()
numerical_features.remove('median_house_value')  # Exkludera målvariabeln
cat_feat = ['ocean_proximity']

In [4]:
numeric_transformer_lr = StandardScaler()
cat_transformer_lr = OneHotEncoder(drop='first')

numeric_transformer_other = StandardScaler()
ordinal_categories = [['INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND', '<1H OCEAN']]
cat_transformer_other = OrdinalEncoder(categories=ordinal_categories)

In [5]:
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_lr, numerical_features),
        ('cat', cat_transformer_lr, cat_feat)
    ])

preprocessor_other = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_other, numerical_features),
        ('cat', cat_transformer_other, cat_feat)
    ])

In [6]:
models = {
    'Linear Regression': Pipeline(steps=[('preprocessor', preprocessor_lr),
                                         ('regressor', LinearRegression())]),
    'Random Forest': Pipeline(steps=[('preprocessor', preprocessor_other),
                                     ('regressor', RandomForestRegressor(random_state=42))]),
    'Lasso': Pipeline(steps=[('preprocessor', preprocessor_other),
                             ('regressor', Lasso())])
}

In [7]:
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

Training Linear Regression...
Training Random Forest...
Training Lasso...


In [16]:
cv_results = {}
for model_name, model in models.items():
    print(f"Cross-validating {model_name}...")
    scores = cross_validate(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')["test_score"]
    cv_results[model_name] = scores

Cross-validating Linear Regression...
Cross-validating Random Forest...
Cross-validating Lasso...


In [17]:
for model_name, scores in cv_results.items():
    rmse_scores = np.sqrt(-scores)
    print(f'RMSE for {model_name} for each iteration:', rmse_scores)
    print(f'Average RMSE for {model_name}:', np.sqrt(np.mean(-scores)))
    print("-" * 30)

RMSE for Linear Regression for each iteration: [67201.92123224 68526.7478616  70291.64606743]
Average RMSE for Linear Regression: 68685.10002579454
------------------------------
RMSE for Random Forest for each iteration: [49526.24794902 49564.91082278 50899.61587987]
Average RMSE for Random Forest: 50001.00171495633
------------------------------
RMSE for Lasso for each iteration: [67571.77104213 69147.66837149 70814.12825487]
Average RMSE for Lasso: 69190.52209534706
------------------------------


In [18]:
forest_model = models['Random Forest']
forest_model.fit(X_train, y_train)

In [19]:
y_test_pred_rf = forest_model.predict(X_test)

In [22]:
RMSE_test_data = mean_squared_error(y_test, y_test_pred_rf, squared=False)
print("RMSE for test data:", RMSE_test_data)

RMSE for test data: 48756.829813239936


In [23]:
RMSE_ratio = RMSE_test_data / np.mean(y_test)
print("RMSE ratio:", RMSE_ratio)

RMSE ratio: 0.23469628897959807
