# Project: Parameters with the Highest Impact on House Prices

<hr>

- A real estate dealer wants to figure out what matters most when selling a house
- Figure out which 10 parameters (features) matter the most and present the findings

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Load the house-sales dataset from parquet and preview its first five rows
data = pd.read_parquet(path='./data/house_sales.parquet')
data.head(5)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LandSlope,OverallQual,OverallCond,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,1,3,3,3,2,7,5,...,61,0,0,0,0,,0,2,2008,208500
2,20,80.0,9600,1,3,3,3,2,6,8,...,0,0,0,0,0,,0,5,2007,181500
3,60,68.0,11250,1,2,3,3,2,7,5,...,42,0,0,0,0,,0,9,2008,223500
4,70,60.0,9550,1,2,3,3,2,7,5,...,35,272,0,0,0,,0,2,2006,140000
5,60,84.0,14260,1,2,3,3,2,8,5,...,84,0,0,0,0,,0,12,2008,250000


In [3]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(1460, 56)

In [4]:
# Per-column dtypes. The columns listed are int64/float64, which suggests the
# categorical fields were already numerically encoded upstream — TODO confirm.
data.dtypes

MSSubClass         int64
LotFrontage      float64
LotArea            int64
Street             int64
LotShape           int64
LandContour        int64
Utilities          int64
LandSlope          int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
ExterQual          int64
ExterCond          int64
BsmtQual         float64
BsmtCond         float64
BsmtExposure     float64
BsmtFinType1     float64
BsmtFinSF1         int64
BsmtFinType2     float64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
HeatingQC          int64
CentralAir         int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual        int64
TotRmsAbvGrd       int64
Fireplaces         int64
FireplaceQu      float64


In [5]:
# Non-null count per column, to spot which features contain missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 56 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   int64  
 4   LotShape       1460 non-null   int64  
 5   LandContour    1460 non-null   int64  
 6   Utilities      1460 non-null   int64  
 7   LandSlope      1460 non-null   int64  
 8   OverallQual    1460 non-null   int64  
 9   OverallCond    1460 non-null   int64  
 10  YearBuilt      1460 non-null   int64  
 11  YearRemodAdd   1460 non-null   int64  
 12  MasVnrArea     1452 non-null   float64
 13  ExterQual      1460 non-null   int64  
 14  ExterCond      1460 non-null   int64  
 15  BsmtQual       1423 non-null   float64
 16  BsmtCond       1423 non-null   float64
 17  BsmtExposure   1422 non-null   float64
 18  BsmtFinType1 

In [6]:
# Remove the PoolQC column, then mark every remaining missing value with the
# sentinel -1 so that downstream estimators receive a frame without NaNs.
data = data.drop(columns=['PoolQC']).fillna(value=-1)

**Quasi Constant Features**

In [7]:
# Fit a variance filter with threshold 0.01 to find quasi-constant columns.
# NOTE(review): the selector is fitted on the whole frame, target column
# included, and variance depends on each column's scale — confirm that 0.01
# is a sensible cut-off for these unscaled features.
sel = VarianceThreshold(threshold=0.01)
sel.fit(data)

VarianceThreshold(threshold=0.01)

In [8]:
# Number of columns that pass the variance filter
len(sel.get_feature_names_out())

53

In [9]:
# Columns rejected by the variance filter, in their original column order.
# The kept names are materialised once as a set: the selector is queried a
# single time instead of once per column, and membership is an exact hash
# lookup rather than an element-wise comparison against a numpy array.
kept_features = set(sel.get_feature_names_out())
quasi_features = [col for col in data.columns if col not in kept_features]
quasi_features

['Street', 'Utilities']

**Correlated Features**

```Python
(corr_matrix[feature].iloc[:corr_matrix.columns.get_loc(feature)] > 0.8).any()
```

In [10]:
# Pairwise correlation between all columns of the frame (features and SalePrice alike)
corr_matrix = data.corr()

In [11]:
# Flag a feature as redundant when it is strongly correlated with any feature
# that appears *before* it in column order, so the first member of each
# correlated group is kept and the later ones are dropped.
#
# - The target is removed from the scan: a feature that tracks SalePrice is
#   informative, not redundant, and SalePrice itself must never be flagged.
# - Absolute values are compared so that strong negative correlations
#   (r < -0.8) are caught as well as strong positive ones.
# - enumerate() supplies the column position directly instead of looking it
#   up with get_loc for every feature.
feature_corr = corr_matrix.drop(index='SalePrice', columns='SalePrice').abs()
corr_features = [
    feature
    for position, feature in enumerate(feature_corr.columns)
    if (feature_corr[feature].iloc[:position] > 0.8).any()
]
corr_features

['1stFlrSF',
 'TotRmsAbvGrd',
 'FireplaceQu',
 'GarageArea',
 'GarageQual',
 'GarageCond']

**Prepare training and test set**

In [12]:
# Target vector, and the feature matrix with the target, the quasi-constant
# columns and the redundant (highly correlated) columns removed.
y = data['SalePrice']
X = data.drop(columns=['SalePrice', *quasi_features, *corr_features])

In [13]:
# Hold out a test set; random_state is pinned so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

**Best Features for LinearRegression model**

In [14]:
# Sequential feature selection wrapped around a plain linear regression,
# configured to end with exactly 10 features.
# The log below shows the feature count growing 1/10, 2/10, ... while the
# candidate pool shrinks (46, 45, 44, ...), and the fitted repr reports
# scoring='r2'.
# verbose=2 emits the per-step progress lines seen in the output.
sfs = SFS(LinearRegression(),k_features=10,verbose=2)
sfs.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  46 out of  46 | elapsed:    0.6s finished

[2023-08-23 08:28:24] Features: 1/10 -- score: 0.6126893627288714[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.5s finished

[2023-08-23 08:28:24] Features: 2/10 -- score: 0.7190976519156257[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  44 out of  44 | elapsed:    0.5s finished

[2023-08-23 08:28:25] Features: 3/10 -- score: 0.7711947823299818[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

SequentialFeatureSelector(estimator=LinearRegression(), k_features=(10, 10),
                          scoring='r2', verbose=2)

**Indexes of the features that matter the most**

In [15]:
# Positional indexes (into X_train's columns) of the selected features
sfs.k_feature_idx_

(0, 2, 6, 10, 11, 15, 17, 26, 33, 36)

In [19]:
# Column names of the selected features
sfs.k_feature_names_

('MSSubClass',
 'LotArea',
 'OverallQual',
 'MasVnrArea',
 'ExterQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'GrLivArea',
 'KitchenQual',
 'GarageCars')

### Test the result

In [20]:
# Baseline: linear regression trained on every remaining feature, then used
# to predict the held-out test rows.
lin = LinearRegression().fit(X_train, y_train)
y_pred = lin.predict(X_test)

In [21]:
# R² of the all-features baseline on the test set
r2_score(y_true=y_test, y_pred=y_pred)

0.7110172392141745

In [23]:
# Translate the selector's positional indexes back into column labels
selected_positions = list(sfs.k_feature_idx_)
columns = X_train.columns[selected_positions]
columns

Index(['MSSubClass', 'LotArea', 'OverallQual', 'MasVnrArea', 'ExterQual',
       'BsmtExposure', 'BsmtFinSF1', 'GrLivArea', 'KitchenQual', 'GarageCars'],
      dtype='object')

In [24]:
# Refit the same estimator on the 10 selected features only and score it on
# the matching columns of the test set.
lin.fit(X_train.loc[:, columns], y_train)
y_pred = lin.predict(X_test.loc[:, columns])
r2_score(y_true=y_test, y_pred=y_pred)

0.7239540198372474

### Test with 10 Highest Correlated Features

In [34]:
# The 10 features most strongly correlated with SalePrice.
# - The target's own entry is dropped by label rather than by assuming it is
#   the first element after sorting (a tie at r = 1.0 would break that).
# - Features are ranked by absolute correlation, so a strongly negative
#   predictor is not pushed to the bottom of the list.
target_corr = corr_matrix['SalePrice'].drop('SalePrice')
columns = target_corr.abs().sort_values(ascending=False).head(10).index
columns

Index(['OverallQual', 'GrLivArea', 'ExterQual', 'KitchenQual', 'GarageCars',
       'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'BsmtQual', 'FullBath'],
      dtype='object')

In [35]:
# Re-split using every column except the target: the correlation-ranked list
# can contain features that were removed from X earlier. The same
# random_state is reused for the split.
all_features = data.drop(columns='SalePrice')
X_train, X_test, y_train, y_test = train_test_split(all_features, y, random_state=0)

# Fit on the correlation-ranked columns and report R² on the test set
lin = LinearRegression().fit(X_train[columns], y_train)
y_pred = lin.predict(X_test[columns])
r2_score(y_true=y_test, y_pred=y_pred)

0.7028207002269178

#### Features that matter vs. highest-correlated features

* The r2_score of `features that matters` is greater than `Features co