In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
# Load the housing CSV and discard rows containing missing values in one
# chained step, then print the per-column summary of the resulting frame.
data = pd.read_csv("data/Housing.csv").dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 490 entries, 0 to 505
Data columns (total 14 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   per_capita_crime_rate                                                490 non-null    float64
 1   proportion_of_residential_land_over_25000_sq.ft.                     490 non-null    float64
 2   proportion_of_non-retail_business_acres_per_town                     490 non-null    float64
 3   Charles_River_dummy_variable_(1_if_tract_bounds_river;_0_otherwise)  490 non-null    float64
 4   nitric_oxides_concentration_(parts_per_10_million)                   490 non-null    float64
 5   average_number_of_rooms_per_dwelling                                 490 non-null    float64
 6   proportion_of_owner-occupied_units_built_prior_to_1940               490 non-null    float64
 7   weighted

In [7]:
# The last column of the frame is used as the target; every column before it
# is used as a predictor.
column_names = data.columns.values.tolist()
feature_cols = column_names[:-1]
X = data[feature_cols]
y = data[column_names[-1]]

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=1,
)

# Baseline: ordinary least squares fitted on the untransformed features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Root-mean-squared error of the predictions on the held-out split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE (ERROR IN PREDICTION: Preferred value: <10): ", rmse)

RMSE (ERROR IN PREDICTION: Preferred value: <10):  4.772467263566135


# Applying PCA

In [18]:
# Learn the standardization parameters from the training split only, then
# apply that same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Number of principal components kept after the reduction.
n_components = 5
pca = PCA(n_components=n_components)

# Fit the projection on the standardized training data; the test split is
# only projected with the already-fitted PCA and is never used for fitting.
X_train_pc = pca.fit_transform(X_train_std)
X_test_pc = pca.transform(X_test_std)

# Share of variance captured by each retained component, and their total.
# The printed total shows how much variance the chosen n_components keeps.
var = pca.explained_variance_ratio_
total_variance = sum(var)
print(
    "Total variance explained by {0} features is: {1}".format(
        n_components, round(total_variance * 100, 2)
    )
)

# Regress the target on the principal components instead of the raw features.
model = LinearRegression()
model.fit(X_train_pc, y_train)
y_pred = model.predict(X_test_pc)

# Root-mean-squared error of the predictions on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE (ERROR IN PREDICTION: Preferred value: <10): ", rmse)


Total variance explained by 5 features is: 80.67
RMSE (ERROR IN PREDICTION: Preferred value: <10):  4.810705153989695
