In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr

In [2]:
# Read the training data from the house-prices CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="house-prices/house-prices.csv")

**Types of Parameters**

In [3]:
# Column names grouped by measurement type.
# The "Order" and "PID" columns are deliberately left out of the analysis.

# Nominal columns
nominal = [
    "MS SubClass", "MS Zoning", "Street", "Alley",
    "Land Contour", "Lot Config", "Neighborhood",
    "Condition 1", "Condition 2",
    "Bldg Type", "House Style",
    "Roof Style", "Roof Matl",
    "Exterior 1st", "Exterior 2nd",
    "Mas Vnr Type", "Foundation", "Heating", "Central Air",
    "Garage Type", "Misc Feature",
    "Sale Type", "Sale Condition",
]

# Continuous columns
continuous = [
    "Lot Frontage", "Lot Area", "Mas Vnr Area",
    "BsmtFin SF 1", "BsmtFin SF 2", "Bsmt Unf SF", "Total Bsmt SF",
    "1st Flr SF", "2nd Flr SF", "Low Qual Fin SF", "Gr Liv Area",
    "Garage Area",
    "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch", "Screen Porch",
    "Pool Area", "Misc Val",
]

# Ordinal columns
ordinal = [
    "Lot Shape", "Utilities", "Land Slope",
    "Overall Qual", "Overall Cond",
    "Exter Qual", "Exter Cond",
    "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
    "Heating QC", "Electrical", "Kitchen Qual", "Functional", "Fireplace Qu",
    "Garage Finish", "Garage Qual", "Garage Cond",
    "Paved Drive", "Pool QC", "Fence",
]

# Discrete columns
discrete = [
    "Year Built", "Year Remod/Add",
    "Bsmt Full Bath", "Bsmt Half Bath", "Full Bath", "Half Bath",
    "Bedroom AbvGr", "Kitchen AbvGr", "TotRms AbvGrd",
    "Fireplaces",
    "Garage Yr Blt", "Garage Cars",
    "Mo Sold", "Yr Sold",
]

# Target column (kept as a one-element list; other cells read target[0])
target = ["SalePrice"]

**Continuous Parameters**

In [4]:
# Pearson correlation between every continuous column and the sale price
output = np.asarray(dataset[target[0]])

pearson_coeff = {}
for feature in continuous:
    feature_values = np.asarray(dataset[feature])

    # Rows where this feature is present (not NaN); only those rows are correlated
    is_present = ~np.isnan(feature_values)

    correlation = pearsonr(feature_values[is_present], output[is_present])
    pearson_coeff[feature] = correlation[0]

print("The Pearson coefficient for each continuous parameters are:", pearson_coeff, sep="\n")

# Continuous columns retained for the regression task
new_continuous = [
    "Gr Liv Area",
    "Garage Area",
    "Total Bsmt SF",
    "1st Flr SF",
    "Mas Vnr Area",
]

The Pearson coefficient for each continuous parameters are:
{'Lot Frontage': 0.35813292440566213, 'Lot Area': 0.25244436059514225, 'Mas Vnr Area': 0.5093858691575662, 'BsmtFin SF 1': 0.43959065920896323, 'BsmtFin SF 2': 0.013065786327905312, 'Bsmt Unf SF': 0.18010456330689728, 'Total Bsmt SF': 0.6353291657470064, '1st Flr SF': 0.6258820813508563, '2nd Flr SF': 0.2450139525509315, 'Low Qual Fin SF': -0.029989265927516145, 'Gr Liv Area': 0.700147252597573, 'Garage Area': 0.630807230801987, 'Wood Deck SF': 0.3155211374525229, 'Open Porch SF': 0.301212393535366, 'Enclosed Porch': -0.13828227620583708, '3Ssn Porch': 0.039016705617244574, 'Screen Porch': 0.1200085513384342, 'Pool Area': 0.07213842328655988, 'Misc Val': -0.016803512647847997}


By analyzing the Pearson coefficients, we see that the continuous parameters most strongly correlated with the sale price are: 'Gr Liv Area', 'Garage Area', 'Total Bsmt SF', '1st Flr SF' and 'Mas Vnr Area'.

**Discrete Parameters**

In [5]:
# Pearson correlation between every discrete column and the sale price
output = np.asarray(dataset[target[0]])  # sale price

pearson_coeff = {}
for column_name in discrete:
    column_values = np.asarray(dataset[column_name])

    # Drop the rows where this column is NaN before correlating
    observed = np.logical_not(np.isnan(column_values))
    result = pearsonr(column_values[observed], output[observed])

    pearson_coeff[column_name] = result[0]

print("The Pearson coefficient for each discrete parameters are:", pearson_coeff, sep="\n")

# Discrete columns retained for the regression task
new_discrete = [
    "Year Built",
    "Year Remod/Add",
    "Full Bath",
    "TotRms AbvGrd",
    "Fireplaces",
]

The Pearson coefficient for each discrete parameters are:
{'Year Built': 0.5598232924974971, 'Year Remod/Add': 0.5282431291931846, 'Bsmt Full Bath': 0.27854521144590294, 'Bsmt Half Bath': -0.026989905035923266, 'Full Bath': 0.5366156607926562, 'Half Bath': 0.2798279836308183, 'Bedroom AbvGr': 0.13300659692329586, 'Kitchen AbvGr': -0.11995437901738167, 'TotRms AbvGrd': 0.4838818753641341, 'Fireplaces': 0.49205645886436455, 'Garage Yr Blt': 0.530062179386795, 'Garage Cars': 0.643467333923074, 'Mo Sold': 0.03573410776717603, 'Yr Sold': -0.026385922422978502}


By analyzing the Pearson coefficients, we see that the discrete parameters most strongly correlated with the sale price are: 'Year Built', 'Year Remod/Add', 'Full Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt' and 'Garage Cars'. However, 'Garage Yr Blt' and 'Year Built' are highly correlated with each other, so we drop 'Garage Yr Blt'. Likewise, the Pearson coefficient between 'Garage Cars' (discrete) and 'Garage Area' (continuous) shows that they are highly correlated, so we also drop 'Garage Cars'.

In [6]:
# Check how strongly two pairs of columns are correlated with each other.

# Pair 1: 'Year Built' vs 'Garage Yr Blt'
year_built = np.asarray(dataset['Year Built'])
year_garage = np.asarray(dataset['Garage Yr Blt'])

# Exclude a row when EITHER value is NaN. Masking on only one of the two
# arrays would let a NaN from the other one reach pearsonr.
nan_coeff = np.isnan(year_built) | np.isnan(year_garage)

coeff_built_garage_vs_house = pearsonr(year_built[~nan_coeff], year_garage[~nan_coeff])[0]
print("Pearson coeff between Year Built and Garage Yr Blt:", coeff_built_garage_vs_house)

# Pair 2: 'Garage Cars' vs 'Garage Area'
garage_cars = np.asarray(dataset['Garage Cars'])
garage_area = np.asarray(dataset['Garage Area'])

# Same rule: a row is used only if both columns have a value
nan_coeff = np.isnan(garage_cars) | np.isnan(garage_area)

coeff_garage_cars_area = pearsonr(garage_cars[~nan_coeff], garage_area[~nan_coeff])[0]
print("Pearson coeff between Garage Cars and Garage Area:", coeff_garage_cars_area)

Pearson coeff between Year Built and Garage Yr Blt: 0.8355102326795171
Pearson coeff between Garage Cars and Garage Area: 0.8898551615513899


**Nominal Parameters**

In [7]:
# For every nominal column, print each category with the number of rows having it.
for var in nominal:
    print("=====================================")
    print(var)

    # Label missing entries with the category 'None'.
    # The result is assigned back to the frame instead of calling
    # fillna(..., inplace=True) on dataset[var]: that chained in-place call
    # acts on a temporary object under pandas Copy-on-Write (and triggers a
    # warning on pandas 2.x), so `dataset` would silently keep its NaNs.
    dataset[var] = dataset[var].fillna('None')
    nominal_parameters = np.asarray(dataset[var])

    # Distinct categories present in this column (sorted by np.unique)
    unique_list = np.unique(nominal_parameters)

    # For each category, collect the matching sale prices and print the
    # category together with its number of occurrences.
    # NOTE(review): target_values and i are never used in this cell; they are
    # kept only in case another cell reads them.
    clusters = []
    target_values = []
    i = 0
    for value in unique_list:
        # Row positions (not index labels) of the samples in this category
        index_value = np.where(nominal_parameters == value)[0]
        # .iloc because np.where yields positions; plain [] would look them up
        # as index labels, which is only equivalent for a default RangeIndex.
        corresponding_sale_price = list(dataset[target[0]].iloc[index_value])
        clusters.extend(corresponding_sale_price)

        print(value, index_value.shape[0])


MS SubClass
20 912
30 119
40 4
45 15
50 231
60 470
70 107
75 19
80 94
85 41
90 86
120 159
150 1
160 103
180 14
190 55
MS Zoning
A (agr) 2
C (all) 17
FV 113
I (all) 2
RH 19
RL 1894
RM 383
Street
Grvl 11
Pave 2419
Alley
Grvl 104
None 2267
Pave 59
Land Contour
Bnk 102
HLS 98
Low 54
Lvl 2176
Lot Config
Corner 416
CulDSac 157
FR2 72
FR3 11
Inside 1774
Neighborhood
Blmngtn 21
Blueste 6
BrDale 26
BrkSide 89
ClearCr 39
CollgCr 226
Crawfor 79
Edwards 159
Gilbert 135
Greens 8
GrnHill 2
IDOTRR 75
Landmrk 1
MeadowV 28
Mitchel 99
NAmes 361
NPkVill 21
NWAmes 121
NoRidge 53
NridgHt 134
OldTown 209
SWISU 37
Sawyer 128
SawyerW 102
Somerst 151
StoneBr 40
Timber 60
Veenker 20
Condition 1
Artery 78
Feedr 134
Norm 2086
PosA 17
PosN 35
RRAe 24
RRAn 43
RRNe 5
RRNn 8
Condition 2
Artery 5
Feedr 12
Norm 2403
PosA 4
PosN 2
RRAe 1
RRAn 1
RRNn 2
Bldg Type
1Fam 2016
2fmCon 56
Duplex 86
Twnhs 88
TwnhsE 184
House Style
1.5Fin 257
1.5Unf 16
1Story 1244
2.5Fin 7
2.5Unf 21
2Story 712
SFoyer 70
SLvl 103
Roof Style
Flat 1

**Ordinal Parameters**

In [8]:
# For every ordinal column, print each level with the number of rows having it.
for var in ordinal:
    print("=====================================")
    print(var)

    # Label missing entries with the level 'None'.
    # The result is assigned back to the frame instead of calling
    # fillna(..., inplace=True) on dataset[var]: that chained in-place call
    # acts on a temporary object under pandas Copy-on-Write (and triggers a
    # warning on pandas 2.x), so `dataset` would silently keep its NaNs.
    dataset[var] = dataset[var].fillna('None')
    ordinal_parameters = np.asarray(dataset[var])

    # Distinct levels present in this column (sorted by np.unique)
    unique_list = np.unique(ordinal_parameters)

    # For each level, collect the matching sale prices and print the level
    # together with its number of occurrences.
    # NOTE(review): target_values and i are never used in this cell; they are
    # kept only in case another cell reads them.
    clusters = []
    target_values = []
    i = 0
    for value in unique_list:
        # Row positions (not index labels) of the samples at this level
        index_value = np.where(ordinal_parameters == value)[0]
        # .iloc because np.where yields positions; plain [] would look them up
        # as index labels, which is only equivalent for a default RangeIndex.
        corresponding_sale_price = list(dataset[target[0]].iloc[index_value])
        clusters.extend(corresponding_sale_price)

        print(value, index_value.shape[0])

Lot Shape
IR1 836
IR2 66
IR3 14
Reg 1514
Utilities
AllPub 2427
NoSeWa 1
NoSewr 2
Land Slope
Gtl 2312
Mod 102
Sev 16
Overall Qual
1 4
2 12
3 35
4 181
5 690
6 608
7 503
8 277
9 96
10 24
Overall Cond
1 6
2 10
3 41
4 78
5 1367
6 448
7 322
8 125
9 33
Exter Qual
Ex 87
Fa 31
Gd 816
TA 1496
Exter Cond
Ex 12
Fa 54
Gd 251
Po 3
TA 2110
Bsmt Qual
Ex 213
Fa 75
Gd 1008
None 71
Po 1
TA 1062
Bsmt Cond
Ex 2
Fa 82
Gd 101
None 71
Po 4
TA 2170
Bsmt Exposure
Av 350
Gd 235
Mn 208
No 1563
None 74
BsmtFin Type 1
ALQ 352
BLQ 227
GLQ 711
LwQ 124
None 71
Rec 242
Unf 703
BsmtFin Type 2
ALQ 45
BLQ 61
GLQ 28
LwQ 75
None 72
Rec 84
Unf 2065
Heating QC
Ex 1229
Fa 81
Gd 401
Po 2
TA 717
Electrical
FuseA 160
FuseF 38
FuseP 7
Mix 1
None 1
SBrkr 2223
Kitchen Qual
Ex 177
Fa 58
Gd 957
Po 1
TA 1237
Functional
Maj1 18
Maj2 7
Min1 56
Min2 53
Mod 30
Sal 1
Sev 2
Typ 2263
Fireplace Qu
Ex 38
Fa 67
Gd 620
None 1186
Po 42
TA 477
Garage Finish
Fin 596
None 138
RFn 665
Unf 1031
Garage Qual
Ex 2
Fa 109
Gd 20
None 138
Po 5
TA 2156
Garage