In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
df = pd.read_csv("dataset.csv")
df.head()


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,YearBuilt,Fireplaces,TotRmsAbvGrd,KitchenQual,ExterQual,BsmtQual,GarageFinish,OpenPorchSF,SalePrice
0,7,1710,2,548,856,856,2,2003,0,8,Gd,Gd,Gd,RFn,61,208500
1,6,1262,2,460,1262,1262,2,1976,1,6,TA,TA,Gd,RFn,0,181500
2,7,1786,2,608,920,920,2,2001,1,6,Gd,Gd,Gd,RFn,42,223500
3,7,1717,3,642,756,961,1,1915,1,7,Gd,TA,TA,Unf,35,140000
4,8,2198,3,836,1145,1145,2,2000,1,9,Gd,Gd,Gd,RFn,84,250000


In [8]:
df.isnull().sum()

OverallQual     0
GrLivArea       0
GarageCars      0
GarageArea      0
TotalBsmtSF     0
1stFlrSF        0
FullBath        0
YearBuilt       0
Fireplaces      0
TotRmsAbvGrd    0
KitchenQual     0
ExterQual       0
BsmtQual        0
GarageFinish    0
OpenPorchSF     0
SalePrice       0
dtype: int64

In [7]:
df["BsmtQual"].fillna(df["BsmtQual"].mode()[0], inplace=True)
df["GarageFinish"].fillna(df["GarageFinish"].mode()[0], inplace=True)

In [9]:
# Select the 8 important features and the target
features = [
    "OverallQual", "GrLivArea", "GarageCars", "TotalBsmtSF",
    "FullBath", "YearBuilt", "KitchenQual", "Fireplaces"
]
target = "SalePrice"

X = df[features]
y = df[target]

In [10]:
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_cols = ["KitchenQual"]
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_cols] = encoder.fit_transform(X[categorical_cols])


In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred


array([129458.2770875 , 296942.03293061, 124768.19915353, 182347.61427509,
       311035.5047385 ,  57897.89682358, 222857.76837326, 169907.2105909 ,
        57246.44344966, 121256.41469207, 154861.04873132, 102960.30274744,
        93401.01319585, 216598.42126534, 189089.59028302, 122889.89142919,
       221161.71302111, 115127.33374475, 122010.91255728, 212423.51427655,
       169168.39160783, 220378.70119832, 182942.34191891, 103336.60295651,
       208494.86442401, 175933.7351136 , 205112.253151  ,  96804.15470937,
       184256.9738982 , 215635.78102569, 108336.93739833, 254018.96799362,
       247188.17712391,  89049.93643411, 264571.86988517, 144828.66480349,
       150815.65162029, 230403.41917829, 294546.80165307,  81702.49208977,
       145157.45571747, 247847.84294101, 109143.79937913, 323792.1523325 ,
       123561.04928391, 181635.50565103, 104363.05167802, 107844.20027229,
       359419.54821977, 129357.83220743, 102508.25824227, 215058.17900846,
       122808.17766362, 2

In [13]:
# Evaluate model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [15]:
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2*100:.2f}%")

Mean Squared Error: 1443233316.056314
R² Score: 81.18%
