In [1]:
#import the libraries that are needed
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
#read the data
df=pd.read_csv("HousingData.csv")
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [3]:
type(df)

pandas.core.frame.DataFrame

In [4]:
#is there a null value?
print(df.isnull().sum())  # Count missing values in each column
print(df.isnull().sum().sum())  # Total missing values

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64
120


In [5]:
#dividing into dependent & independent variable:
X=df.iloc[:,:-1] #independent variables
y=df.iloc[:,-1] #dependent variables

#iloc: Helps to select rows, columns on the integer indices

In [6]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,


In [7]:
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

In [15]:
#imported linear regression model
#split the data into train & test
#cross validation: generalizes well to unseen data by testing it on multiple subsets of the data
#mse: average of the squared differences between the predicted values and the actual values.

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [16]:
#split into train & test:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [23]:
#removed nan from X using mean value!

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

imputer = SimpleImputer(strategy='mean')  # Replace NaNs with column means
X = imputer.fit_transform(X)  # Apply imputation to X

In [25]:
#split into train & test:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
#trained linear algorithm

model = LinearRegression()
model.fit(X_train, y_train)

In [29]:
#cross-validation on training data set
#simultaneously we get our mse & rmse scores!

cv_mse = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')  # 5-fold cross-validation
cv_rmse = np.sqrt(-cv_mse)
print("Cross-Validation MSE:", -cv_mse)
print(f"Cross-Validation RMSE: {cv_rmse}")
print(f"Mean CV MSE: {-cv_mse.mean():.4f}")
print(f"Mean CV RMSE: {cv_rmse.mean():.4f}")

Cross-Validation MSE: [17.27406472 35.63598898 24.78228775 23.37543187 22.71082525]
Cross-Validation RMSE: [4.15620797 5.96958868 4.97818117 4.83481456 4.76558761]
Mean CV MSE: 24.7557
Mean CV RMSE: 4.9409


In [31]:
# Make predictions on the test set

y_pred = model.predict(X_test)

In [33]:
# Calculating MSE on the test set:

mse = mean_squared_error(y_test, y_pred)
print(f"MSE on Test Set: {mse:.4f}")

MSE on Test Set: 25.0177


We observe that the mean MSE on the training set (24.7557) is very close to the mean MSE on the test set (25.0177), 
this suggests that your model is generalizing well. In other words, the model's performance on the training set is similar to its performance on the unseen test set, 
which indicates that it is neither overfitting nor underfitting the data. This is generally a good sign.

In [36]:
#r2 score & adjusted r2 score:

from sklearn.metrics import r2_score
r2_scores=r2_score(y_pred, y_test)
r2_scores

0.6269846474074956

In [38]:
X_test.shape[0]

102

In [40]:
X_test.shape[1]

13

In [42]:
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1] # Number of features
adjusted_r2 = 1 - ((1 - r2_scores) * (n - 1) / (n - p - 1))
adjusted_r2

#you might ask, why not ypred or ytest instead of X_test?
#y_test: It contains the target values, not the input features & does not provide information about the number of predictors.
#y_pred: These are just the predictions made by your model, and it is unrelated to the number of features.

0.5718801066836029

In [44]:
#Just for practise will perform Regularization (Ridge & Lasso)

#Ridge Regression:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV #(help us out doing hyperperameter tunning)

In [46]:
ridge=Ridge()
params={'alpha':[1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge_regressor=GridSearchCV(ridge, params, scoring='neg_mean_squared_error', cv=5)
ridge_regressor.fit(X_train, y_train) 
#when there's overfitting and ridge & lasso is applied, need to train X_train, y_train

In [48]:
print(ridge_regressor.best_params_) 
print(ridge_regressor.best_score_)

{'alpha': 1e-15}
-24.75571971378061


The earlier Mean CV MSE was: 24.7557 & now the ridge best score: 24.7557 are same, no overfitting!
If the ridge score is smaller than the mean cv mse, it's better!

In [51]:
#Lasso Regression:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV #(help us out doing hyperperameter tunning)

In [53]:
lasso=Lasso()
params={'alpha':[1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regressor=GridSearchCV(lasso, params, scoring='neg_mean_squared_error', cv=5)
lasso_regressor.fit(X_train, y_train) 
#when there's overfitting and ridge & lasso is applied, need to train X_train, y_train

In [55]:
print(lasso_regressor.best_params_) 
print(lasso_regressor.best_score_)

{'alpha': 1e-15}
-24.755719713780636


The earlier Mean CV MSE: 24.7557 & lasso best score: 24.7557 are same, no overfitting! If the lasso score is smaller than the mean cv mse, it's better!

Whenever you don't get the expected value of the ridge & lasso, you can try to increase the parameters, also the cv! Example:

In [59]:
lasso=Lasso()
params={'alpha':[1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 50, 60,100]}
lasso_regressor=GridSearchCV(lasso, params, scoring='neg_mean_squared_error', cv=10)
lasso_regressor.fit(X_train, y_train) 

In [61]:
print(lasso_regressor.best_params_) 
print(lasso_regressor.best_score_)

{'alpha': 1e-15}
-25.13281278085781


here tho we are instead getting more value, aim: lesser value!

In [64]:
ridge_regressor.predict(X_test)

array([ 2.91432496e+01,  3.65356675e+01,  1.44925129e+01,  2.50811104e+01,
        1.84560923e+01,  2.30104937e+01,  1.81386861e+01,  1.46265167e+01,
        2.21158938e+01,  2.08706364e+01,  2.50758640e+01,  1.87470982e+01,
       -5.67498472e+00,  2.17286655e+01,  1.90464014e+01,  2.54293420e+01,
        1.96620047e+01,  6.12420178e+00,  4.09972355e+01,  1.72352716e+01,
        2.48905259e+01,  3.02961849e+01,  1.18008779e+01,  2.29655895e+01,
        1.73451595e+01,  1.51051621e+01,  2.10901065e+01,  1.44883316e+01,
        2.30923960e+01,  1.94365043e+01,  2.25439953e+01,  2.52375663e+01,
        2.59309510e+01,  1.66289054e+01,  1.64513986e+01,  1.66143478e+01,
        3.11105960e+01,  2.02837991e+01,  2.43569451e+01,  2.26326769e+01,
        1.45257750e+01,  3.23698301e+01,  4.30428488e+01,  1.76390763e+01,
        2.76326197e+01,  1.64305143e+01,  1.42484248e+01,  2.61635938e+01,
        1.97678160e+01,  3.01463483e+01,  2.09688785e+01,  3.38338873e+01,
        1.64161397e+01,  

In [66]:
y_pred1 = lasso_regressor.predict(X_test)

In [68]:
from sklearn.metrics import r2_score
r2_scores=r2_score(y_pred1, y_test)
r2_scores

0.6269846474074874

In [70]:
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1] # Number of features
adjusted_r2 = 1 - ((1 - r2_scores) * (n - 1) / (n - p - 1))
adjusted_r2

0.5718801066835935