In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import confusion_matrix, accuracy_score

In [55]:
# Read the training table from disk; the path is relative to the notebook's
# working directory.
dataset = pd.read_csv(filepath_or_buffer='Dataset/X_train.csv')

# Preview the first five rows (5 is also the default for head()).
dataset.head(n=5)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,12.247694,0.235294,0.0,0.0,0.366344,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,0.0,0.0
1,2,12.109011,0.0,0.0,0.0,0.391317,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,0.0,0.0
2,3,12.317167,0.235294,0.0,0.0,0.422359,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,0.0,0.0
3,4,11.849398,0.294118,0.0,0.0,0.390295,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0
4,5,12.429216,0.235294,0.0,0.0,0.468761,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0


## Dividing the dataset

In [28]:
# Separate the predictors from the target. 'Id' is dropped together with
# 'SalePrice' so that neither ends up among the predictor columns.
features = dataset.drop(columns=['Id', 'SalePrice'])
labels = dataset['SalePrice']

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in each split on every run; without it the Lasso-based
# selection and every model fitted afterwards change between executions.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)
print(X_train.shape)
X_train.head()

(1168, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
139,0.235294,0.0,0.0,0.484144,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.636364,0.75,0.0,0.0
878,0.382353,0.0,0.0,0.431402,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.025806,0.454545,1.0,0.0,0.0
949,0.0,0.0,0.0,0.386362,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,1.0,0.0,0.0
392,0.0,0.0,0.0,0.363756,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.077419,0.545455,0.25,0.0,0.0
972,0.588235,0.0,0.0,0.352973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.272727,1.0,0.0,0.0


# Feature Selection

In [31]:
# Wrap an L1-penalised linear model in SelectFromModel so columns are kept or
# discarded according to the fitted Lasso coefficients.
model = SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

# Fit the selector on the training split only.
model.fit(X=X_train, y=y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [33]:
# Boolean mask over the training columns: True marks a column the selector kept.
# indices=False (the default) asks for the mask rather than integer positions.
model.get_support(indices=False)

array([ True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True,  True, False, False,  True, False, False, False, False,
       False,  True, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [42]:
# Collect the labels of the columns kept by the selector.
selected_features = X_train.columns[model.get_support()]

# Print the total number of features, the number selected, and how many Lasso
# coefficients are exactly zero. np.sum reduces the boolean mask in a single
# vectorised call instead of iterating over the array with the builtin sum().
print('Total features: ', len(X_train.columns))
print('Selected features: ', len(selected_features))
print('No. of features whose coeff shrank to 0: ', np.sum(model.estimator_.coef_ == 0))

Total features:  79
Selected features:  12
No. of features whose coeff shrank to 0:  67


In [45]:
# Display the labels of the columns kept by the selector.
selected_features

Index(['MSSubClass', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
       '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'HalfBath', 'Fireplaces',
       'GarageCars', 'WoodDeckSF'],
      dtype='object')

In [46]:
# Restrict both splits to the retained columns. This rebinds X_train / X_test,
# so the cells that follow only see the reduced frames.
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

print(X_train.shape)
X_train.head()

(1168, 12)


Unnamed: 0,MSSubClass,LotArea,OverallQual,YearBuilt,YearRemodAdd,1stFlrSF,GrLivArea,BsmtFullBath,HalfBath,Fireplaces,GarageCars,WoodDeckSF
139,0.235294,0.484144,0.555556,0.905797,0.783333,0.386718,0.588711,0.333333,0.5,0.0,0.5,0.322054
878,0.382353,0.431402,0.444444,0.644928,0.75,0.469528,0.438903,0.333333,0.0,0.0,0.5,0.224037
949,0.0,0.386362,0.555556,0.724638,0.933333,0.537157,0.502121,0.333333,0.5,0.333333,0.5,0.0
392,0.0,0.363756,0.444444,0.630435,0.15,0.367478,0.34351,0.0,0.0,0.0,0.25,0.0
972,0.588235,0.352973,0.555556,0.775362,0.483333,0.382617,0.357661,0.0,0.0,0.333333,0.25,0.032672


# Linear Regression

In [63]:
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
linear_pred = linear_model.predict(X_test)

# Decision Tree Regressor

In [64]:
# Fix random_state so the fitted tree, and therefore dt_pred, is the same on
# every run: the tree builder permutes features at each split, so an unseeded
# tree can differ between executions when candidate splits tie.
dt_model = DecisionTreeRegressor(random_state=0)
dt_model.fit(X_train, y_train)

# Predictions for the held-out rows.
dt_pred = dt_model.predict(X_test)