In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read both data splits from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Print each split's column names, one per line, so their schemas can be compared.
print("The train columns are:")
for column_name in train.columns:
    print(column_name)

print("\n")

print("The test columns are:")
for column_name in test.columns:
    print(column_name)


The train columns are:
Unnamed: 0
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


The test columns are:
Unnamed: 0
id
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


In [None]:


# Remove the columns that are not used as model inputs and divide the target
# by 1000 (presumably dollars -> thousands of dollars; confirm units).
# The test split carries two extra columns (id, date) that are dropped as well.
train_df = (
    train
    .drop(columns=["Unnamed: 0", "zipcode"])
    .assign(price=lambda frame: frame["price"] / 1000)
)
test_df = (
    test
    .drop(columns=["Unnamed: 0", "zipcode", "id", "date"])
    .assign(price=lambda frame: frame["price"] / 1000)
)

# Separate the target column from the predictors for each split.
y_train = train_df["price"]
X_train = train_df.drop(columns=["price"])

y_test = test_df["price"]
X_test = test_df.drop(columns=["price"])

# Standardize the predictors: the scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)



In [None]:

# Part 1: fit a linear regression on the standardized training features.

model = LinearRegression()
model.fit(scaled_X_train, y_train)
feat = X_train.columns.tolist()

# Report one coefficient per feature. scaled_X_train was produced from X_train,
# so the entries of coef_ follow the order of X_train's columns.
for feature_name, coefficient in zip(feat, model.coef_):
    print(f"{feature_name}: {coefficient}")

print("\n")
print(f"Intercept: {model.intercept_}")


# Predict on the training features themselves (not the test set) and score
# those predictions against y_train to get the in-sample MSE and R^2.
pred1 = model.predict(scaled_X_train)
train_mse = mean_squared_error(y_train, pred1)
train_r2 = r2_score(y_train, pred1)
print(f"Training MSE: {train_mse}")
print(f"Training R²: {train_r2}")

bedrooms: -12.521961868606958
bathrooms: 18.52763251304853
sqft_living: 56.74883680116194
sqft_lot: 10.88186844611118
floors: 8.043720836862683
waterfront: 63.74289955986957
view: 48.20010852419167
condition: 12.964269364045812
grade: 92.23147482243316
sqft_above: 48.2900888617882
sqft_basement: 27.137032467668377
yr_built: -67.64311741342645
yr_renovated: 17.2713795303447
lat: 78.37573693207814
long: -1.035203083734975
sqft_living15: 45.57765781263845
sqft_lot15: -12.930090977794924


Intercept: 520.414834000001
Training MSE: 31486.167775794882
Training R²: 0.7265334318706018


In [None]:

# Part 2: score the already-fitted model on the standardized test features.

pred2 = model.predict(scaled_X_test)
test_mse, test_r2 = mean_squared_error(y_test, pred2), r2_score(y_test, pred2)

# Emit the two metrics, one per line.
for report_line in (f"Testing MSE:   {test_mse}", f"Testing R²:    {test_r2}"):
    print(report_line)

Testing MSE:   57628.154705670415
Testing R²:    0.6543560876120953


In [None]:

# Part 3


Because the features were standardized before fitting, the coefficient magnitudes are directly comparable. The features I believe matter the most are the ones with the largest absolute coefficients: grade, lat, yr_built, waterfront, and sqft_living.

Some coefficients are negative (bedrooms, yr_built, long, and sqft_lot15), so an increase in those features leads to a decrease in the predicted price when the other features are held fixed.

The R² dropped from 0.73 on the training data to 0.65 on the testing data, which indicates that the model does not generalize as well to new data. There might be a little overfitting.

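I believe the error is large. Prices are expressed in thousands of dollars, so the RMSE is √31486 ≈ 177 (about $177k) for training and √57628 ≈ 240 (about $240k) for testing, compared with a mean training price of roughly $520k (the intercept).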

From the training and testing MSE, I believe the model is overfitting, since the testing MSE (57628) is almost twice the training MSE (31486).