In [1]:
import pandas as pd
import numpy as np

# Lasso and Ridge Regression with HousePrice.csv

In [2]:
price = pd.read_csv('HousePrice.csv')
price.drop('Id', axis = 1, inplace = True)
price.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
price.shape

(1460, 80)

In [4]:
drop_cols = [col for col in price.columns if price[col].isna().sum() > 0]  # dropping columns that have null values 
price.drop(drop_cols, axis = 1, inplace = True)

In [5]:
price.shape

(1460, 61)

Removed columns with null values.   
Since this dataset has a lot of columns we can affort to lose some columns

In [6]:
price.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics

In [8]:
price.dtypes

MSSubClass        int64
MSZoning         object
LotArea           int64
Street           object
LotShape         object
                  ...  
MoSold            int64
YrSold            int64
SaleType         object
SaleCondition    object
SalePrice         int64
Length: 61, dtype: object

Select datatypes that have numberical data

In [9]:
cols = price.select_dtypes(include = ['int64']).columns  # taking all numerical data for regression
X = price[cols]
y = X.pop('SalePrice')

In [10]:
X.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,8450,7,5,2003,2003,706,0,150,856,...,548,0,61,0,0,0,0,0,2,2008
1,20,9600,6,8,1976,1976,978,0,284,1262,...,460,298,0,0,0,0,0,0,5,2007
2,60,11250,7,5,2001,2002,486,0,434,920,...,608,0,42,0,0,0,0,0,9,2008
3,70,9550,7,5,1915,1970,216,0,540,756,...,642,0,35,272,0,0,0,0,2,2006
4,60,14260,8,5,2000,2000,655,0,490,1145,...,836,192,84,0,0,0,0,0,12,2008


In [11]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [13]:
scaler = StandardScaler()  # Scale the data with Standard Scaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Lasso Regression

In [14]:
lasso = Lasso(max_iter = 2000)
lasso.fit(X_train_scaled, y_train)
y_pred = lasso.predict(X_test_scaled)
y_pred

array([154050.93125605, 309019.85725523, 116084.33354034, 180717.34647793,
       303622.61655701,  44171.03255432, 226537.02854791, 150369.36516987,
        41521.21601869, 151797.40135617, 157264.88368486, 115436.02871859,
        81338.81026785, 210560.178927  , 192353.38350748, 143468.13280926,
       214957.16412957, 137142.7606616 , 119753.78269971, 232390.83138792,
       187181.80335298, 216457.67631856, 194255.44882388, 134434.54497577,
       213461.91339301, 150872.19608364, 200383.18941472,  91543.013772  ,
       186245.5056802 , 181008.95992682, 114531.23954862, 272648.15603162,
       232128.39322826,  87060.1871785 , 271026.58647776, 162974.0048765 ,
       150861.79975873, 219546.11058451, 306014.8615101 ,  92796.31853887,
       134124.90821538, 253106.19398521, 104284.02013523, 272261.52561315,
       131215.15635779, 130858.34755376, 105444.71896444, 128634.33099592,
       357140.92471805, 128460.71478248, 105909.1488557 , 219778.77638931,
        90747.5990831 , 3

In [15]:
metrics.r2_score(y_test, y_pred)

0.8163510621717289

In [16]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # rmse

35798.3171009838

## Ridge Regression

In [17]:
ridge = Ridge(max_iter = 2000)
ridge.fit(X_train_scaled, y_train)
y_pred = ridge.predict(X_test_scaled)
y_pred

array([154019.50770115, 308994.24969353, 116058.94339476, 180705.21105951,
       303568.48207392,  44200.49378919, 226469.23929722, 150463.36729277,
        41552.27319983, 151767.45861935, 157196.92277879, 115414.63028116,
        81558.1066539 , 210541.5489815 , 192382.78615157, 143450.13726382,
       214931.90922466, 137124.55422785, 119753.02710984, 232369.85905787,
       187217.54225572, 216441.12245504, 194263.03713956, 134412.67404894,
       213470.76759081, 150928.54665642, 200341.72471854,  91521.39197489,
       186280.77923129, 181047.21273483, 114576.37281616, 272652.70410586,
       232116.31653501,  87051.35543572, 271019.73282187, 162975.68108246,
       150843.40915974, 219530.13149001, 305986.78646683,  92710.13540522,
       134129.7045385 , 253070.72271812, 104286.72633794, 272454.10110861,
       131225.71747814, 130886.78945771, 105419.81337191, 128632.70006898,
       357074.43936556, 128460.20265871, 105889.49547978, 219855.82500058,
        90820.11453868, 3

In [18]:
metrics.r2_score(y_test, y_pred)

0.8163826266403943

In [19]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # rmse

35795.240569672496

In this dataset, Ridge Regression performed very slightly better than Lasso for the same number of iterations and same train test split.  
Both these models are equally significant.