In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# 1. Load dataset
df = pd.read_csv("Housing.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing values:\n", df.isnull().sum())


Dataset shape: (545, 13)
Columns: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
Missing values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [None]:
# 2. Preprocessing
# Convert yes/no columns to 1/0
binary_cols = ['mainroad', 'guestroom', 'basement',
               'hotwaterheating', 'airconditioning', 'prefarea']

for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# One-hot encode furnishingstatus
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

print("\nAfter preprocessing:\n", df.head())



After preprocessing:
       price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3       NaN        NaN   
1  12250000  8960         4          4        4       NaN        NaN   
2  12250000  9960         3          2        2       NaN        NaN   
3  12215000  7500         4          2        2       NaN        NaN   
4  11410000  7420         4          1        2       NaN        NaN   

   basement  hotwaterheating  airconditioning  parking  prefarea  
0       NaN              NaN              NaN        2       NaN  
1       NaN              NaN              NaN        3       NaN  
2       NaN              NaN              NaN        2       NaN  
3       NaN              NaN              NaN        3       NaN  
4       NaN              NaN              NaN        2       NaN  


In [34]:
# 3. Features and target
X_simple = df[['area']]              # simple model: only area
X_complex = df.drop('price', axis=1) # complex model: all features
y = df['price']


In [35]:
# 4. Train/test split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_simple, y, test_size=0.2, random_state=42
)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_complex, y, test_size=0.2, random_state=42
)


In [36]:
# 5. Train models
simple_model = LinearRegression().fit(X_train_s, y_train_s)
complex_model = LinearRegression().fit(X_train_c, y_train_c)

print("\nSimple model coefficient:", simple_model.coef_[0])
print("Simple model intercept:", simple_model.intercept_)
print("Complex model has", len(complex_model.coef_), "coefficients")


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values