In [None]:
# STEP 1 — Install scikit-learn (skipped: Colab already ships with it; elsewhere run `pip install scikit-learn`)


# STEP 2 — Import libraries
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# STEP 3 — Load Boston dataset
# Fetch the "boston" dataset (version 1) through fetch_openml as pandas objects.
boston = fetch_openml(name="boston", version=1, as_frame=True)

# Separate features and target
X = boston.data
y = boston.target

# Coerce the target and every feature column to a numeric dtype.
# errors='coerce' replaces any value that cannot be parsed with NaN
# instead of raising, so bad cells surface as missing values below.
y = pd.to_numeric(y, errors='coerce')
X = X.apply(pd.to_numeric, errors='coerce')

# Join features and target into one frame so that a NaN in either one
# removes the whole row and X / y stay aligned.
df = pd.concat([X, y.rename("MEDV")], axis=1)
rows_before = len(df)
df = df.dropna()

# Coercion followed by dropna can discard data without any trace;
# report it whenever rows were actually removed.
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(
        f"\nDropped {rows_dropped} of {rows_before} rows "
        "containing missing or non-numeric values"
    )

# Separate back X and y
X = df.drop(columns=["MEDV"])
y = df["MEDV"]

# Confirm all numeric dtypes
print("\nFeature dtypes:\n", X.dtypes)
print("\nTarget dtype:", y.dtype)

# STEP 4 — Persist the cleaned table to disk (OPTIONAL)
# index=False leaves the row labels out of the file; only the feature
# columns and MEDV are written.
df.to_csv(path_or_buf="boston_housing.csv", index=False)
print("\nDataset saved as boston_housing.csv")

# STEP 5 — Train/test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# STEP 6 — Train model
# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)

# STEP 7 — Predict
# Predictions are made on the held-out rows only.
y_pred = model.predict(X_test)

# STEP 8 — Evaluate
# RMSE is the square root of MSE, which puts the error back in the
# same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f"\nMean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# STEP 9 — Show predictions
# Pair each held-out target value with the model's prediction for it.
# The target is converted to a plain array so both columns line up by
# position rather than by the original row labels.
comparison_columns = {
    "Actual": y_test.to_numpy(),
    "Predicted": y_pred,
}
results_df = pd.DataFrame(comparison_columns)
results_df = results_df.reset_index(drop=True)

print("\nSample predictions vs. actual values:")
print(results_df.head(10))



Feature dtypes:
 CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
dtype: object

Target dtype: float64

Dataset saved as boston_housing.csv

Mean Squared Error (MSE): 24.2911
Root Mean Squared Error (RMSE): 4.9286

Sample predictions vs. actual values:
   Actual  Predicted
0    23.6  28.996724
1    32.4  36.025565
2    13.6  14.816944
3    22.8  25.031979
4    16.1  18.769880
5    20.0  23.254429
6    17.8  17.662538
7    14.0  14.341190
8    19.6  23.013207
9    16.8  20.632456
