In [33]:

# Cell 1 - Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import os

print("Libraries imported. Python version:", pd.__version__)


Libraries imported. Python version: 2.2.3


In [34]:
# Cell 2 - File path (you said the file is here)
csv_path = '/kaggle/input/boston-house-prices/housing.csv'
print("Looking for:", csv_path)
print("Files in /kaggle/input:")
print(os.listdir('/kaggle/input'))


Looking for: /kaggle/input/boston-house-prices/housing.csv
Files in /kaggle/input:
['agents-intensive-capstone-project', 'boston-house-prices']


In [35]:
# Cell 3 - Load data (robust: tries comma, then whitespace)
def load_boston_csv(path):
    # Try normal CSV read first
    try:
        df = pd.read_csv(path)
        if df.shape[1] > 1:
            print("Read with pd.read_csv() : shape", df.shape)
            return df
    except Exception as e:
        print("pd.read_csv failed:", e)
    # Try delim_whitespace (some versions have space-separated values)
    try:
        df = pd.read_csv(path, delim_whitespace=True, header=None)
        if df.shape[1] > 1:
            print("Read with delim_whitespace and header=None : shape", df.shape)
            return df
    except Exception as e:
        print("delim_whitespace read failed:", e)
    # Try engine='python' auto-detect
    try:
        df = pd.read_csv(path, sep=None, engine='python')
        print("Read with sep=None engine=python : shape", df.shape)
        return df
    except Exception as e:
        print("All read attempts failed:", e)
        raise FileNotFoundError(f"Couldn't read dataset at {path}. Make sure file exists and is CSV or whitespace-separated.")

df = load_boston_csv(csv_path)
print("Initial df shape:", df.shape)
display(df.head())
print("\nColumns in the dataset:")
print(df.columns)


Read with delim_whitespace and header=None : shape (506, 14)
Initial df shape: (506, 14)


  df = pd.read_csv(path, delim_whitespace=True, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2



Columns in the dataset:
Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')


In [36]:
# Cell 4 - If dataframe has headerless columns, fix for classic Boston dataset
boston_colnames = [
    "CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX",
    "PTRATIO","B","LSTAT","MEDV"
]

# If df has one column but that column contains whitespace-separated data, try splitting:
if df.shape[1] == 1:
    # split strings into many columns
    df = df[ df.columns[0] ].str.split(expand=True)
    print("Split the single column into", df.shape[1], "columns.")

# If number of columns matches Boston features, set names:
if df.shape[1] == len(boston_colnames):
    df.columns = boston_colnames
    print("Assigned standard Boston column names.")
else:
    print("Number of columns:", df.shape[1])
    print("Column names (first 10):", list(df.columns[:10]))

# Convert possible object columns to numeric where possible
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='coerce')

print("\nAfter conversion, dtypes:")
print(df.dtypes)

# Select target column safely:
if 'MEDV' in df.columns:
    target_col = 'MEDV'
else:
    numeric_cols = df.select_dtypes(include=['int64','float64']).columns
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns found. Your CSV may not be parsed correctly. Check the file format.")
    target_col = numeric_cols[-1]   # last numeric column as fallback

print("Using target column:", target_col)
display(df.head())


Assigned standard Boston column names.

After conversion, dtypes:
CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object
Using target column: MEDV


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [37]:
# Cell 5 - Train a simple model
# Drop rows with missing values in target or features (simple handling)
df_clean = df.dropna(axis=0, subset=[target_col])
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

# If X has non-numeric or object columns, keep numeric only for now
X = X.select_dtypes(include=['int64','float64'])

print("Final training shapes: X:", X.shape, "y:", y.shape)
if X.shape[0] == 0 or X.shape[1] == 0:
    raise ValueError("No training data after cleaning. Check dataset content.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model trained. MSE: {mse:.4f}, R2: {r2:.4f}")


Final training shapes: X: (506, 13) y: (506,)
Model trained. MSE: 24.2911, R2: 0.6688


In [38]:
# Cell 6 - Save predictions file to notebook working directory
out = X_test.copy()
out['actual'] = y_test.values
out['predicted'] = y_pred
out.to_csv('predictions.csv', index=False)
print("Saved predictions.csv to notebook working directory.")


Saved predictions.csv to notebook working directory.
