# Regression Project

In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 -  Data Ingestion

In [2]:
import pandas as pd

df = pd.read_csv("training_set (1).csv", na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# step 2 - Perform Basic data Quality chechs

In [3]:
m = df.isna().sum()
m

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [4]:
df.duplicated().sum()

np.int64(0)

In [5]:
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

(1460, 81)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Step 3 - Seperate X and Y (SalePrice)

In [7]:
X = df.drop(columns=["Id", "SalePrice"])
Y = df[["SalePrice"]]

In [8]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


# Step 4 - Preprocess data for Feature Selection

In [10]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [11]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [12]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [14]:
num_pipe1 = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [15]:
cat_pipe1 = make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder())

In [16]:
pre1 = ColumnTransformer([("num", num_pipe1, con), ("cat", cat_pipe1, cat)]).set_output(
    transform="pandas"
)

In [17]:
X_pre = pre1.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [18]:
pre1

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


# Step 5 - Apply Feature selection

In [19]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

base_model = LinearRegression()
sel = SequentialFeatureSelector(
    estimator=base_model, n_features_to_select="auto", direction="backward", n_jobs=-1
)
sel.fit(X_pre, Y)
imp_cols = sel.get_feature_names_out()
print(imp_cols)

['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__MasVnrArea' 'num__BsmtFinSF1' 'num__BsmtFinSF2'
 'num__BsmtUnfSF' 'num__TotalBsmtSF' 'num__1stFlrSF' 'num__2ndFlrSF'
 'num__LowQualFinSF' 'num__GrLivArea' 'num__BsmtFullBath'
 'num__KitchenAbvGr' 'num__TotRmsAbvGrd' 'num__Fireplaces'
 'num__GarageCars' 'num__WoodDeckSF' 'num__OpenPorchSF' 'num__ScreenPorch'
 'num__PoolArea' 'num__YrSold' 'cat__LandContour' 'cat__Utilities'
 'cat__Neighborhood' 'cat__BldgType' 'cat__HouseStyle' 'cat__RoofMatl'
 'cat__Exterior1st' 'cat__MasVnrType' 'cat__ExterQual' 'cat__BsmtQual'
 'cat__BsmtCond' 'cat__BsmtExposure' 'cat__HeatingQC' 'cat__KitchenQual'
 'cat__Functional' 'cat__GarageCond']


In [20]:
len(imp_cols)

40

In [21]:
imp_cols[0]

'num__MSSubClass'

In [22]:
imp_cols[0].split("__")[1]

'MSSubClass'

In [23]:
sel_cols = [col.split("__")[1] for col in imp_cols]
print(sel_cols)

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'PoolArea', 'YrSold', 'LandContour', 'Utilities', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'GarageCond']


In [24]:
X_sel = X[sel_cols]
X_sel.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,Exterior1st,MasVnrType,ExterQual,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,GarageCond
0,60,8450,7,5,2003,196.0,706,0,150,856,...,VinylSd,BrkFace,Gd,Gd,TA,No,Ex,Gd,Typ,TA
1,20,9600,6,8,1976,0.0,978,0,284,1262,...,MetalSd,,TA,Gd,TA,Gd,Ex,TA,Typ,TA
2,60,11250,7,5,2001,162.0,486,0,434,920,...,VinylSd,BrkFace,Gd,Gd,TA,Mn,Ex,Gd,Typ,TA
3,70,9550,7,5,1915,0.0,216,0,540,756,...,Wd Sdng,,TA,TA,Gd,No,Gd,Gd,Typ,TA
4,60,14260,8,5,2000,350.0,655,0,490,1145,...,VinylSd,BrkFace,Gd,Gd,TA,Av,Ex,Gd,Typ,TA


# Step 6 - Apply final preprocessing on selected feature

In [25]:
cat_sel = list(X_sel.columns[X_sel.dtypes == "object"])
con_sel = list(X_sel.columns[X_sel.dtypes != "object"])

In [26]:
cat_sel

['LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'GarageCond']

In [27]:
con_sel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
num_pipe2 = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [30]:
cat_pipe2 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
)

In [31]:
pre2 = ColumnTransformer(
    [("num", num_pipe2, con_sel), ("cat", cat_pipe2, cat_sel)]
).set_output(transform="pandas")

In [32]:
X_sel_pre = pre2.fit_transform(X_sel)
X_sel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Functional_Maj2,cat__Functional_Min1,cat__Functional_Min2,cat__Functional_Mod,cat__Functional_Sev,cat__Functional_Typ,cat__GarageCond_Fa,cat__GarageCond_Gd,cat__GarageCond_Po,cat__GarageCond_TA
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.511418,0.575425,-0.288653,-0.944591,-0.459303,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.57441,1.171992,-0.288653,-0.641228,0.466465,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.32306,0.092907,-0.288653,-0.301643,-0.313369,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.57441,-0.499274,-0.288653,-0.06167,-0.687324,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,1.36457,0.463568,-0.288653,-0.174865,0.19968,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


# Step 7 - Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X_sel_pre, Y, test_size=0.2, random_state=21
)

In [34]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Functional_Maj2,cat__Functional_Min1,cat__Functional_Min2,cat__Functional_Mod,cat__Functional_Sev,cat__Functional_Typ,cat__GarageCond_Fa,cat__GarageCond_Gd,cat__GarageCond_Po,cat__GarageCond_TA
710,-0.636078,-0.640101,-2.241782,0.381743,-1.201217,-0.57441,-0.973018,-0.288653,-0.672923,-1.795509,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1098,-0.163109,-0.452686,-1.518467,0.381743,-1.168096,-0.57441,0.500854,-0.288653,-1.284176,-0.878862,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1286,-0.872563,-0.072844,-0.071836,-0.5172,-0.273836,1.924104,0.274948,0.213629,0.250749,0.616959,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
992,0.073375,-0.075851,-0.071836,2.179628,-0.240715,1.30917,0.20257,0.436865,-0.901577,-0.53683,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
631,1.492282,-0.593999,1.374795,-0.5172,1.150356,0.023903,-0.92038,-0.288653,2.179592,1.132288,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [35]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [36]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Functional_Maj2,cat__Functional_Min1,cat__Functional_Min2,cat__Functional_Mod,cat__Functional_Sev,cat__Functional_Typ,cat__GarageCond_Fa,cat__GarageCond_Gd,cat__GarageCond_Po,cat__GarageCond_TA
880,-0.872563,-0.350058,-0.795151,-0.5172,1.117235,-0.57441,1.176379,-0.288653,-1.035147,0.074268,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
605,0.073375,0.309002,0.651479,0.381743,-0.207594,0.40062,0.022723,-0.288653,-0.573311,-0.659961,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1166,-0.872563,-0.004192,1.374795,-0.5172,1.216598,-0.175535,-0.973018,-0.288653,2.550871,1.451518,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
216,-0.872563,-0.207142,0.651479,-0.5172,1.084115,0.899214,1.101808,-0.288653,-0.174865,0.863222,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
970,-0.163109,0.02838,-1.518467,-1.416142,-0.737526,-0.57441,-0.973018,-0.288653,0.345832,-0.769412,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [37]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [38]:
xtrain.shape

(1168, 116)

In [39]:
xtest.shape

(292, 116)

# Step 8 - On Selected features apply ridge or lasso and slecte best model

In [40]:
from sklearn.linear_model import LinearRegression

model1 = LinearRegression()
model1.fit(xtrain, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [41]:
model1.score(xtrain, ytrain)

0.9241460232427011

In [42]:
model1.score(xtest, ytest)

0.817165013005153

# Ridge model

In [43]:
from sklearn.linear_model import Ridge

model2 = Ridge(alpha=0.1)
model2.fit(xtrain, ytrain)

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [44]:
model2.score(xtrain, ytrain)

0.9132835801717545

In [45]:
model2.score(xtest, ytest)

0.839329976246584

In [46]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model2, xtrain, ytrain, cv=5, scoring="r2")
scores

array([0.61706312, 0.83499391, 0.90380043, 0.86478793, 0.73320524])

In [47]:
scores.mean()

np.float64(0.7907701264815149)

# Hyperparameter tuning for Ridge

In [48]:
import numpy as np

In [49]:
params = {"alpha": np.arange(start=1, stop=70, step=0.5)}

In [50]:
params

{'alpha': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
         6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. , 10.5, 11. , 11.5,
        12. , 12.5, 13. , 13.5, 14. , 14.5, 15. , 15.5, 16. , 16.5, 17. ,
        17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5, 21. , 21.5, 22. , 22.5,
        23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. , 27.5, 28. ,
        28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 32. , 32.5, 33. , 33.5,
        34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. ,
        39.5, 40. , 40.5, 41. , 41.5, 42. , 42.5, 43. , 43.5, 44. , 44.5,
        45. , 45.5, 46. , 46.5, 47. , 47.5, 48. , 48.5, 49. , 49.5, 50. ,
        50.5, 51. , 51.5, 52. , 52.5, 53. , 53.5, 54. , 54.5, 55. , 55.5,
        56. , 56.5, 57. , 57.5, 58. , 58.5, 59. , 59.5, 60. , 60.5, 61. ,
        61.5, 62. , 62.5, 63. , 63.5, 64. , 64.5, 65. , 65.5, 66. , 66.5,
        67. , 67.5, 68. , 68.5, 69. , 69.5])}

In [51]:
from sklearn.model_selection import GridSearchCV

ridge1 = Ridge()
gscv_ridge = GridSearchCV(ridge1, param_grid=params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain, ytrain)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': array([ 1. , ..., 69. , 69.5])}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(8.5)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [52]:
gscv_ridge.best_params_

{'alpha': np.float64(8.5)}

In [53]:
gscv_ridge.best_score_

np.float64(0.8361504309446086)

In [54]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,np.float64(8.5)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [55]:
best_ridge.score(xtrain, ytrain)

0.8856157387042425

In [56]:
best_ridge.score(xtest, ytest)

0.8366023213613373

# Lasso Model

In [57]:
from sklearn.linear_model import Lasso

model3 = Lasso(alpha=0.1)
model3.fit(xtrain, ytrain)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [58]:
model3.score(xtrain, ytrain)

0.9241457476329562

In [59]:
model3.score(xtest, ytest)

0.817335493900285

# Hyperparameter tunning on Lasso

In [60]:
params

{'alpha': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
         6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. , 10.5, 11. , 11.5,
        12. , 12.5, 13. , 13.5, 14. , 14.5, 15. , 15.5, 16. , 16.5, 17. ,
        17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5, 21. , 21.5, 22. , 22.5,
        23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. , 27.5, 28. ,
        28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 32. , 32.5, 33. , 33.5,
        34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. ,
        39.5, 40. , 40.5, 41. , 41.5, 42. , 42.5, 43. , 43.5, 44. , 44.5,
        45. , 45.5, 46. , 46.5, 47. , 47.5, 48. , 48.5, 49. , 49.5, 50. ,
        50.5, 51. , 51.5, 52. , 52.5, 53. , 53.5, 54. , 54.5, 55. , 55.5,
        56. , 56.5, 57. , 57.5, 58. , 58.5, 59. , 59.5, 60. , 60.5, 61. ,
        61.5, 62. , 62.5, 63. , 63.5, 64. , 64.5, 65. , 65.5, 66. , 66.5,
        67. , 67.5, 68. , 68.5, 69. , 69.5])}

In [61]:
lasso1 = Lasso()
gscv_lasso = GridSearchCV(lasso1, param_grid=params, cv=5, scoring="r2")
gscv_lasso.fit(xtrain, ytrain)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': array([ 1. , ..., 69. , 69.5])}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(69.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [62]:
gscv_lasso.best_params_

{'alpha': np.float64(69.5)}

In [63]:
gscv_lasso.best_score_

np.float64(0.8313418661853706)

In [64]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,np.float64(69.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [65]:
best_lasso.score(xtrain, ytrain)

0.8872223945857197

In [66]:
best_lasso.score(xtest, ytest)

0.8453181295343664

# from above results i can say that Lasso modell is best here because it has highest R2 score in test and cross validation

# Step 9 - Evauate best model in detail

In [67]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

In [68]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)

    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [69]:
print("Train Results Lasso :")
evaluate_model(best_lasso, xtrain, ytrain)

Train Results Lasso :
RMSE : 26419.16
MAE : 16381.60
MAPE : 9.72%
R2 : 88.72%


In [70]:
print("Test Results  Ridge :")
evaluate_model(best_lasso, xtest, ytest)

Test Results  Ridge :
RMSE : 32377.88
MAE : 17442.70
MAPE : 10.02%
R2 : 84.53%


# Step 10 - Out of  sample prediction

In [71]:
xnew = pd.read_csv("testing_set.csv", na_values=["", "NA"], keep_default_na=False)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [72]:
xnew.to_csv("LassoResults.csv", index=False)

# Step 11 - Save the model object with preprocessor

In [73]:
pre1

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [74]:
best_lasso

0,1,2
,alpha,np.float64(69.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [75]:
import joblib

joblib.dump(pre1, "pre.joblib")

['pre.joblib']

In [76]:
joblib.dump(best_lasso, "lasso_traning.joblib")

['lasso_traning.joblib']

# Loading the model object

In [77]:
p = joblib.load("pre.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [78]:
m = joblib.load("lasso_traning.joblib")
m

0,1,2
,alpha,np.float64(69.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [79]:
m.score(xtrain, ytrain)

0.8872223945857197

In [80]:
m.score(xtest, ytest)

0.8453181295343664