# Part A - Regression

In [7]:
import pandas as pd
import numpy as np


In [8]:
dataset = pd.read_csv('Project housing data .csv')

In [9]:
dataset.info() # no missing value

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   price          2999 non-null   int64
 1   bedrooms       2999 non-null   int64
 2   bathrooms      2999 non-null   int64
 3   sqft_living    2999 non-null   int64
 4   sqft_lot       2999 non-null   int64
 5   floors         2999 non-null   int64
 6   waterfront     2999 non-null   int64
 7   view           2999 non-null   int64
 8   condition      2999 non-null   int64
 9   sqft_above     2999 non-null   int64
 10  sqft_basement  2999 non-null   int64
 11  yr_built       2999 non-null   int64
dtypes: int64(12)
memory usage: 281.3 KB


# Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Define the full set of features
features_to_scale = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "view",          # ordinal
    "condition",     # ordinal
    "sqft_above",
    "sqft_basement",
    "yr_built"
]

# Apply Standard Scaling
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(dataset[features_to_scale])

# # Create a new DataFrame with the scaled features
df_scaled = pd.DataFrame(scaled_features, columns=features_to_scale)

# # Optional: If you want to merge it back with the rest of the dataset
other_features = dataset.drop(columns=features_to_scale)

df_final = pd.concat([df_scaled, other_features], axis=1)

In [42]:
df_final

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,sqft_above,sqft_basement,yr_built,price,waterfront
0,0.285714,0.0,0.088117,0.003030,0.0,0.0,0.5,0.110754,0.000000,0.478261,221900,0
1,0.285714,0.2,0.273698,0.003994,0.5,0.0,0.5,0.269663,0.153846,0.443478,538000,0
2,0.142857,0.0,0.033378,0.005665,0.0,0.0,0.5,0.044944,0.000000,0.286957,180000,0
3,0.428571,0.4,0.192256,0.002636,0.0,0.0,1.0,0.089888,0.350000,0.565217,604000,0
4,0.285714,0.2,0.154873,0.004502,0.0,0.0,0.5,0.191011,0.000000,0.756522,510000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2994,0.428571,0.4,0.260347,0.002490,0.0,0.0,1.0,0.224719,0.223077,0.052174,1330000,0
2995,0.285714,0.0,0.094793,0.006998,0.0,0.0,0.5,0.118780,0.000000,0.600000,210000,0
2996,0.428571,0.2,0.177570,0.002660,0.0,0.0,0.5,0.118780,0.238462,0.260870,449000,0
2997,0.142857,0.2,0.269693,0.003901,0.5,0.0,0.5,0.329053,0.000000,0.904348,945000,0


## Encoding categorical data

✅ Needs Encoding:
waterfront – a binary categorical variable (0/1).

Already in numeric form — no encoding needed if it’s 0 and 1.

✅ You do NOT need to one-hot encode it, just leave as is.

⚠️ Ordinal Categorical Features:
view (0 to 4)

condition (1 to 5)

These are ordinal categorical features (with natural order), so:

✅ If you already use them as numeric (integers), you can leave them as-is or scale them.

❌ Do not one-hot encode unless you're using a model that can’t infer order and you believe the distance between levels is not meaningful.

## Check and Remove Outliers

In [48]:
X = df_final.iloc[:,:-2]
y = df_final.iloc[:,-2]



In [49]:
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,sqft_above,sqft_basement,yr_built
0,0.285714,0.0,0.088117,0.003030,0.0,0.0,0.5,0.110754,0.000000,0.478261
1,0.285714,0.2,0.273698,0.003994,0.5,0.0,0.5,0.269663,0.153846,0.443478
2,0.142857,0.0,0.033378,0.005665,0.0,0.0,0.5,0.044944,0.000000,0.286957
3,0.428571,0.4,0.192256,0.002636,0.0,0.0,1.0,0.089888,0.350000,0.565217
4,0.285714,0.2,0.154873,0.004502,0.0,0.0,0.5,0.191011,0.000000,0.756522
...,...,...,...,...,...,...,...,...,...,...
2994,0.428571,0.4,0.260347,0.002490,0.0,0.0,1.0,0.224719,0.223077,0.052174
2995,0.285714,0.0,0.094793,0.006998,0.0,0.0,0.5,0.118780,0.000000,0.600000
2996,0.428571,0.2,0.177570,0.002660,0.0,0.0,0.5,0.118780,0.238462,0.260870
2997,0.142857,0.2,0.269693,0.003901,0.5,0.0,0.5,0.329053,0.000000,0.904348


In [50]:
y

0        221900
1        538000
2        180000
3        604000
4        510000
         ...   
2994    1330000
2995     210000
2996     449000
2997     945000
2998     200450
Name: price, Length: 2999, dtype: int64

In [51]:
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1

# Define a mask to filter out outliers using 'and' and ensuring non-negativity
mask = ((X >= (Q1 - 1.5 * IQR)) & (X <= (Q3 + 1.5 * IQR))).all(axis=1)

# Apply the mask to remove outliers
X = X[mask]
y = y[mask]



### Creating the Training Set and the Test Set

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Building and training the model

In [53]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# train the model

model.fit(X_train, y_train)

# Making the predictions of the data points in the test set

y_pred = model.predict(X_test)


In [57]:
# Evaluation 

from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)

# Adjusted R-Squared

n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

print(f"R^2 is: {r2} and Adjusted R^2 is: {adjusted_r2}")

R^2 is: 0.4529416151961737 and Adjusted R^2 is: 0.4406757321288234


In [58]:
# more evaluations

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")


Mean Squared Error (MSE): 27209505684.527267
Mean Absolute Error (MAE): 126070.24772218369


In [59]:
# make a prediction

# Define a sample input with invented feature values
sample_input = {
    'bedrooms': 0.3,
    'bathrooms': 0.2,
    'sqft_living': 0.1,
    'sqft_lot': 0.004,
    'floors': 0.5,
    'view': 0.0,
    'condition': 0.5,
    'sqft_above': 0.12,
    'sqft_basement': 0.15,
    'yr_built': 0.5
}

# Convert the sample input to a DataFrame
sample_df = pd.DataFrame([sample_input])

# Make a prediction using the trained model
predicted_price = model.predict(sample_df)

print(f"Predicted price for the sample input is: {predicted_price[0]}")



Predicted price for the sample input is: 420717.0976310569
