# A comparison of MSE, MAE, R2-score for all the five Linear Regression algorithms based on cuML - Linear Regression library for doing house price prediction.



In [1]:
import cudf as cd
import cupy as cp
import matplotlib.pyplot as plt
from cuml.model_selection import train_test_split
from cuml.preprocessing import LabelEncoder
from cuml.preprocessing import MinMaxScaler
import cuml

In [2]:
train_data=cd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
train_data.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1460 non-null   int64
 1   MSSubClass     1460 non-null   int64
 2   MSZoning       1460 non-null   object
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64
 5   Street         1460 non-null   object
 6   Alley          91 non-null     object
 7   LotShape       1460 non-null   object
 8   LandContour    1460 non-null   object
 9   Utilities      1460 non-null   object
 10  LotConfig      1460 non-null   object
 11  LandSlope      1460 non-null   object
 12  Neighborhood   1460 non-null   object
 13  Condition1     1460 non-null   object
 14  Condition2     1460 non-null   object
 15  BldgType       1460 non-null   object
 16  HouseStyle     1460 non-null   object
 17  OverallQual    1460 non-null   int64
 18  OverallCond    1460 non-null   

In [4]:
train_data.reset_index(drop=True,inplace=True)

# Handling Missing data

In [5]:
features_with_na = [features for features in train_data.columns if train_data[features].isnull().sum() > 1]

for feature in features_with_na:
    if(cp.round(train_data[feature].isnull().mean(), 3)*100>45):
        print(feature,cp.round(train_data[feature].isnull().mean(), 3)*100 , ' % missing values')
print("\n")
for feature in features_with_na:
    if(cp.round(train_data[feature].isnull().mean(), 3)*100<45):
        print(feature,cp.round(train_data[feature].isnull().mean(), 3)*100 , ' % missing values')

Alley 93.8  % missing values
FireplaceQu 47.3  % missing values
PoolQC 99.5  % missing values
Fence 80.80000000000001  % missing values
MiscFeature 96.3  % missing values


LotFrontage 17.7  % missing values
MasVnrType 0.5  % missing values
MasVnrArea 0.5  % missing values
BsmtQual 2.5  % missing values
BsmtCond 2.5  % missing values
BsmtExposure 2.6  % missing values
BsmtFinType1 2.5  % missing values
BsmtFinType2 2.6  % missing values
GarageType 5.5  % missing values
GarageYrBlt 5.5  % missing values
GarageFinish 5.5  % missing values
GarageQual 5.5  % missing values
GarageCond 5.5  % missing values


## Handling Temporal variables (Datetime, year etc)

In [6]:
# list of variables that  contain year information 
year_features = [feature for feature in train_data.columns if 'Yr' in feature or 'Year' in feature]
train_data[year_features].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold
0,2003,2003,2003.0,2008
1,1976,1976,1976.0,2007
2,2001,2002,2001.0,2008
3,1915,1970,1998.0,2006
4,2000,2000,2000.0,2008


In [7]:
for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
    train_data[feature] = train_data['YrSold'] - train_data[feature]
train_data[year_features]

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold
0,5,5,5.0,2008
1,31,31,31.0,2007
2,7,6,7.0,2008
3,91,36,8.0,2006
4,8,8,8.0,2008
...,...,...,...,...
1455,8,7,8.0,2007
1456,32,22,32.0,2010
1457,69,4,69.0,2010
1458,60,14,60.0,2010


## Handling Numerical Features

In [8]:
numerical_data=train_data.select_dtypes(exclude=['object'])
numerical_features=list(numerical_data.columns)
numerical_features

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [9]:
numerical_features_nan = [feature for feature in train_data.columns if train_data[feature].isnull().sum() > 0 and train_data[feature].dtype != 'O']
print(numerical_features_nan)

for feature in numerical_features_nan:
    train_data[feature] = train_data[feature].fillna(train_data[feature].median())

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


In [10]:
num_continuous_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
for feature in num_continuous_features:
    train_data[feature] = cp.log(train_data[feature])

In [11]:
train_data[numerical_features]=MinMaxScaler().fit_transform(train_data[numerical_features])
train_data[numerical_features].head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,0.0,0.235294,0.418208,0.366344,0.666667,0.5,0.036765,0.098361,0.1225,0.125089,...,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,0.581431
1,0.000685,0.0,0.495064,0.391317,0.555556,0.875,0.227941,0.52459,0.0,0.173281,...,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,0.536319
2,0.001371,0.235294,0.434909,0.422359,0.666667,0.5,0.051471,0.114754,0.10125,0.086109,...,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,0.604029
3,0.002056,0.294118,0.388581,0.390295,0.666667,0.5,0.669118,0.606557,0.0,0.038271,...,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0,0.451871
4,0.002742,0.235294,0.513123,0.468761,0.777778,0.5,0.058824,0.147541,0.21875,0.116052,...,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.640477


In [12]:
train_data.drop(['Id'],axis=1,inplace=True)

# Handling Categorical Features

In [13]:
categorical_data=train_data.select_dtypes('object')
categorical_features=list(categorical_data.columns)
train_data[categorical_features].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [14]:
categorical_features_nan = [feature for feature in train_data.columns if train_data[feature].isnull().sum() > 0 and train_data[feature].dtype == 'O']

for feature in categorical_features_nan:
    print(f"{feature}: {cp.round(train_data[feature].isnull().mean(),3)*100}% missing values")

Alley: 93.8% missing values
MasVnrType: 0.5% missing values
BsmtQual: 2.5% missing values
BsmtCond: 2.5% missing values
BsmtExposure: 2.6% missing values
BsmtFinType1: 2.5% missing values
BsmtFinType2: 2.6% missing values
Electrical: 0.1% missing values
FireplaceQu: 47.3% missing values
GarageType: 5.5% missing values
GarageFinish: 5.5% missing values
GarageQual: 5.5% missing values
GarageCond: 5.5% missing values
PoolQC: 99.5% missing values
Fence: 80.80000000000001% missing values
MiscFeature: 96.3% missing values


In [15]:
train_data[categorical_features] = train_data[categorical_features].fillna('Missing')
train_data[categorical_features].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Abnorml
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal


In [16]:
le = LabelEncoder()
for feature in categorical_data:
    train_data[feature] = le.fit_transform(train_data[feature])
train_data[categorical_features].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,3,1,1,3,3,0,4,0,5,2,...,1,2,5,5,2,3,2,1,8,4
1,3,1,1,3,3,0,2,0,24,1,...,1,2,5,5,2,3,2,1,8,4
2,3,1,1,0,3,0,4,0,5,2,...,1,2,5,5,2,3,2,1,8,4
3,3,1,1,0,3,0,0,0,6,2,...,5,3,5,5,2,3,2,1,8,0
4,3,1,1,0,3,0,2,0,15,2,...,1,2,5,5,2,3,2,1,8,4


In [17]:
train_data = train_data[["OverallQual","YearBuilt","YearRemodAdd","ExterQual","TotalBsmtSF","1stFlrSF","GrLivArea","FullBath","TotRmsAbvGrd","GarageCars","GarageArea",
                   "MSZoning", "Utilities","BldgType","Heating","KitchenQual","SaleCondition","LandSlope","SalePrice"]]

In [18]:
train_data.head()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,ExterQual,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,MSZoning,Utilities,BldgType,Heating,KitchenQual,SaleCondition,LandSlope,SalePrice
0,0.666667,0.036765,0.098361,2,0.140098,0.356155,0.577712,0.666667,0.5,0.5,0.38646,3,0,0,1,2,4,0,0.581431
1,0.555556,0.227941,0.52459,3,0.206547,0.503056,0.470245,0.666667,0.333333,0.5,0.324401,3,0,0,1,3,4,0,0.536319
2,0.666667,0.051471,0.114754,2,0.150573,0.383441,0.593095,0.666667,0.333333,0.5,0.428773,3,0,0,1,2,4,0,0.604029
3,0.666667,0.669118,0.606557,3,0.123732,0.399941,0.579157,0.333333,0.416667,0.75,0.45275,3,0,0,1,2,0,0,0.451871
4,0.777778,0.058824,0.147541,2,0.187398,0.466237,0.666523,0.666667,0.583333,0.75,0.589563,3,0,0,1,2,4,0,0.640477


In [19]:
train_data.shape

(1460, 19)

# Splitting the train data into Input and Output variables

In [20]:
X = train_data.drop('SalePrice',axis=1)
Y = train_data['SalePrice']

In [21]:
print(X.shape)
Y.shape

(1460, 18)


(1460,)

In [22]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.21,random_state=42)

In [23]:
x_train.head()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,ExterQual,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,MSZoning,Utilities,BldgType,Heating,KitchenQual,SaleCondition,LandSlope
81,0.555556,0.058824,0.147541,3,0.202455,0.524903,0.490667,0.666667,0.25,0.5,0.285614,4,0,4,1,3,4,0
886,0.444444,0.345588,0.032787,3,0.266121,0.618465,0.578126,0.666667,0.5,0.5,0.414669,3,0,2,1,3,3,0
685,0.666667,0.169118,0.393443,2,0.194763,0.480825,0.6487,0.666667,0.333333,0.5,0.406911,3,0,4,1,2,4,0
420,0.666667,0.080882,0.180328,3,0.219967,0.526879,0.492514,0.666667,0.5,1.0,0.552891,4,0,2,1,3,2,0
883,0.333333,0.691176,0.934426,2,0.130115,0.397174,0.671636,0.333333,0.666667,0.25,0.310296,3,0,0,1,3,4,0


Comparison of Linear Regression methods

In [24]:
%%time
import matplotlib.pyplot as plt
list=[]

for algo in ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']:
    lr=cuml.LinearRegression(fit_intercept=True, normalize = False, algorithm = algo)
    regressor=lr.fit(x_train,y_train)
    predicted=lr.predict(x_test)
#   print(regressor.coef_)
#   print(regressor.intercept)
    
    mse=cuml.metrics.regression.mean_squared_error(y_test,predicted)
    mae=cuml.metrics.regression.mean_absolute_error(y_test,predicted)
    r2=cuml.metrics.regression.r2_score(y_test,predicted)
    print("Algo--->",algo)
    print("MSE:",mse)
    print("MAE:",mae)
    print("R2:",r2)
    print("\n")

Algo---> svd
MSE: 0.002536535291449078
MAE: 0.035930598861395796
R2: 0.8495867345053698


Algo---> eig
MSE: 0.0025365352914490835
MAE: 0.03593059886139578
R2: 0.8495867345053696


Algo---> qr
MSE: 0.002536535291449078
MAE: 0.03593059886139581
R2: 0.8495867345053698


Algo---> svd-qr
MSE: 0.01747542398702463
MAE: 0.10084921486379764
R2: -0.036270063599204816


Algo---> svd-jacobi
MSE: 0.017475423987024626
MAE: 0.10084921486379762
R2: -0.036270063599204816


CPU times: user 551 ms, sys: 191 ms, total: 742 ms
Wall time: 736 ms
