In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

# 1. Multiple Linear Regression

## Importing the Libraries


## Importing the Dataset

In [None]:
# Read train.csv (path relative to the working directory) into a DataFrame.
dataset = pd.read_csv('train.csv')

In [None]:
# Preview the first three rows of the raw data.
dataset.head(n=3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [None]:
# List every column name available in the loaded dataset.
dataset.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## Separating the Required Features

In [None]:
# Keep only the columns used in this analysis. `.copy()` makes `df` an
# independent DataFrame rather than a slice of `dataset`, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
required_columns = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath', 'SalePrice']
df = dataset[required_columns].copy()

In [None]:
# Preview the selected columns (first five rows).
df.head(n=5)

Unnamed: 0,GrLivArea,BedroomAbvGr,FullBath,HalfBath,SalePrice
0,1710,3,2,1,208500
1,1262,3,2,0,181500
2,1786,3,2,1,223500
3,1717,3,1,0,140000
4,2198,4,2,1,250000


In [None]:
# Count the missing values in each selected column.
df.isna().sum()

Unnamed: 0,0
GrLivArea,0
BedroomAbvGr,0
FullBath,0
HalfBath,0
SalePrice,0


In [None]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   GrLivArea     1460 non-null   int64
 1   BedroomAbvGr  1460 non-null   int64
 2   FullBath      1460 non-null   int64
 3   HalfBath      1460 non-null   int64
 4   SalePrice     1460 non-null   int64
dtypes: int64(5)
memory usage: 57.2 KB


## Combining `FullBath` and `HalfBath`

In [None]:
# Combine the two bathroom counts into one feature; a half bath counts as 0.5.
df['TotalBathrooms'] = df['FullBath'].add(df['HalfBath'].mul(0.5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TotalBathrooms'] = df['FullBath'] + 0.5*df['HalfBath']


In [None]:
# Check that the TotalBathrooms column is present (first five rows).
df.head(n=5)

Unnamed: 0,GrLivArea,BedroomAbvGr,FullBath,HalfBath,SalePrice,TotalBathrooms
0,1710,3,2,1,208500,2.5
1,1262,3,2,0,181500,2.0
2,1786,3,2,1,223500,2.5
3,1717,3,1,0,140000,1.0
4,2198,4,2,1,250000,2.5


## Separating Features and Target Variable

In [None]:
# Predictors: living area, bedroom count and the combined bathroom count.
X = df.loc[:, ['GrLivArea', 'BedroomAbvGr', 'TotalBathrooms']]
# Target: the sale price.
y = df.loc[:, 'SalePrice']

## Train-test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Inspect the training features. A bare last expression uses the notebook's
# rich display instead of print()'s plain-text dump.
X_train

      GrLivArea  BedroomAbvGr  TotalBathrooms
618        1828             3             2.0
870         894             2             1.0
92          964             2             1.0
817        1689             3             2.0
302        1541             3             2.0
...         ...           ...             ...
763        2365             3             2.5
835        1067             2             2.0
1216       1902             4             2.0
559        1557             2             2.0
684        1839             4             2.5

[1168 rows x 3 columns]


In [None]:
# Inspect the test features. A bare last expression uses the notebook's
# rich display instead of print()'s plain-text dump.
X_test

      GrLivArea  BedroomAbvGr  TotalBathrooms
529        2515             4             3.0
491        1578             3             1.0
459        1203             3             1.0
279        2022             4             2.5
655        1092             3             1.5
...         ...           ...             ...
326        1719             1             1.5
440        2402             2             2.0
1387       2526             5             2.0
1323        708             2             1.0
61         1111             3             1.0

[292 rows x 3 columns]


In [None]:
# Inspect the training targets. A bare last expression lets the notebook
# display the value itself instead of going through print().
y_train

618     314813
870     109500
92      163500
817     271000
302     205000
         ...  
763     337000
835     128000
1216    112000
559     234000
684     221000
Name: SalePrice, Length: 1168, dtype: int64


In [None]:
# Inspect the test targets. A bare last expression lets the notebook
# display the value itself instead of going through print().
y_test

529     200624
491     133000
459     110000
279     192000
655      88000
         ...  
326     324000
440     555000
1387    136000
1323     82500
61      101000
Name: SalePrice, Length: 292, dtype: int64


## Model Building

In [None]:
# Linear regression estimator created with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# Fit the regression on the training split.
lr.fit(X=X_train, y=y_train)

## Making Predictions

In [None]:
# Predict sale prices for the held-out test features.
y_pred = lr.predict(X=X_test)

- RMSE (root mean squared error) measures the typical size of the error between the predicted and the actual house prices, in the same units as the target variable (here, the sale price). An RMSE of around 58,440 means that the model’s predictions are typically off by roughly this amount; because errors are squared before averaging, large misses weigh more heavily than they would in a plain average error. (The value is computed in the evaluation cell further below.)
- R-squared measures how much of the variance in the target the model explains. An R-squared value of 0.505 means that the model explains about 50.5% of the variance in the house prices on the test set, which is a moderate but not particularly strong result.

# 2. Polynomial Regression

In [None]:
# Expand the features with all degree-2 terms (squares and pairwise products).
# include_bias=False omits the constant column: the LinearRegression fitted on
# these features estimates its own intercept, so a column of ones is redundant.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training split only, then apply it to the test split.
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

In [None]:
# Train a second linear regression, this time on the polynomial-expanded features.
poly_model = LinearRegression().fit(X_train_poly, y_train)

# Predict sale prices for the polynomial-expanded test features.
y_pred_poly = poly_model.predict(X_test_poly)

In [None]:
# Evaluate the multiple linear regression predictions on the held-out test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

RMSE: 58440.866039084984
R-squared: 0.5054432341854288


In [None]:
# Evaluate the polynomial regression predictions on the held-out test set.
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_rmse = np.sqrt(poly_mse)  # square root puts the error back in the target's units
poly_r2 = r2_score(y_test, y_pred_poly)

print(f"Polynomial Regression RMSE: {poly_rmse}")
print(f"Polynomial Regression R-squared: {poly_r2}")

Polynomial Regression RMSE: 50459.253350045736
Polynomial Regression R-squared: 0.6313073319119697
