In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
dfTrain = pd.read_csv('data/train.csv')

In [3]:
tNullRows = dfTrain.isnull().sum().sort_values(ascending = False)
percent = (dfTrain.isnull().sum() / len(dfTrain)).sort_values(ascending = False)
missing_data = pd.concat([tNullRows, percent], axis = 1, keys = ['Total','Percent'])
missing_data[missing_data['Total']>0]

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageYrBlt,81,0.055479
GarageCond,81,0.055479
GarageType,81,0.055479
GarageFinish,81,0.055479


In [4]:
dfTrain.dropna(axis=1, how='any', inplace=True)

In [5]:
dfTrain.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,0.0,0.0,1474.0
BsmtUnfSF,1460.0,567.240411,441.866955,0.0,223.0,477.5,808.0,2336.0


<h3>Implementing LARS algorithm</h3>

We will implement de LARS algoritm for predictin SalePrice

<u>Replacing namedfields by numeric values</u>

In [6]:
ncols = len(dfTrain.columns)
nrows = len(dfTrain)

In [7]:
dfTrain[dfTrain['MSZoning'] == 'C']

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [8]:
dfTrain.groupby(['MSZoning']).count()

Unnamed: 0_level_0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
MSZoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C (all),10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
FV,65,65,65,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,65,65,65
RH,16,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,16
RL,1151,1151,1151,1151,1151,1151,1151,1151,1151,1151,...,1151,1151,1151,1151,1151,1151,1151,1151,1151,1151
RM,218,218,218,218,218,218,218,218,218,218,...,218,218,218,218,218,218,218,218,218,218


In [9]:
dfT_zoning_FV = dfTrain[dfTrain['MSZoning'] == 'FV']
dfT_zoning_FV

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
47,48,20,FV,11096,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2007,WD,Normal,249700
56,57,160,FV,2645,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2009,WD,Abnorml,172500
87,88,160,FV,3951,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,6,2009,New,Partial,164500
105,106,60,FV,9375,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2008,WD,Normal,250000
115,116,160,FV,3230,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,6,2007,WD,Normal,176000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,1365,160,FV,3180,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,4,2006,WD,Abnorml,144152
1365,1366,60,FV,7500,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,1,2010,WD,Normal,216000
1374,1375,60,FV,10625,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2008,WD,Normal,250000
1442,1443,60,FV,11003,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,4,2009,WD,Normal,310000


In [10]:
dfTrain.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotArea           int64
Street           object
                  ...  
MoSold            int64
YrSold            int64
SaleType         object
SaleCondition    object
SalePrice         int64
Length: 62, dtype: object

In [11]:
dfTrain.query('MSZoning == "FV"')

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
47,48,20,FV,11096,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2007,WD,Normal,249700
56,57,160,FV,2645,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2009,WD,Abnorml,172500
87,88,160,FV,3951,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,6,2009,New,Partial,164500
105,106,60,FV,9375,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2008,WD,Normal,250000
115,116,160,FV,3230,Pave,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,6,2007,WD,Normal,176000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,1365,160,FV,3180,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,4,2006,WD,Abnorml,144152
1365,1366,60,FV,7500,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,1,2010,WD,Normal,216000
1374,1375,60,FV,10625,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2008,WD,Normal,250000
1442,1443,60,FV,11003,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,4,2009,WD,Normal,310000


In [52]:

cols = dfTrain.select_dtypes(include='object').columns
#np.arange(len(cols))
for i in np.arange(len(cols)):
    #cols_list = dfTrain[cols[i]].groupby(by=cols[i]).count()
    cols_list = list(dfTrain[[cols[i]]].groupby(by=cols[i]).count().index)
    print(cols[i],':',cols_list)

MSZoning : ['C (all)', 'FV', 'RH', 'RL', 'RM']
Street : ['Grvl', 'Pave']
LotShape : ['IR1', 'IR2', 'IR3', 'Reg']
LandContour : ['Bnk', 'HLS', 'Low', 'Lvl']
Utilities : ['AllPub', 'NoSeWa']
LotConfig : ['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside']
LandSlope : ['Gtl', 'Mod', 'Sev']
Neighborhood : ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker']
Condition1 : ['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe', 'RRNn']
Condition2 : ['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn']
BldgType : ['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE']
HouseStyle : ['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story', 'SFoyer', 'SLvl']
RoofStyle : ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed']
RoofMatl : ['ClyTile', 'CompShg', 'Membran', 'Met

In [68]:
dfTrain.query("MSZoning == 'FV'").describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,...,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,707.030769,90.461538,6638.861538,7.2,5.030769,2004.507692,2004.830769,327.892308,0.0,663.123077,...,57.276923,103.476923,0.0,0.0,3.046154,0.0,0.0,6.276923,2007.846154,214014.061538
std,402.406199,55.689731,3155.973354,0.794512,0.174036,3.405524,3.352095,374.056944,0.0,422.816764,...,81.755449,73.715693,0.0,0.0,24.558877,0.0,0.0,2.775494,1.325599,52369.662067
min,48.0,20.0,2117.0,6.0,5.0,1997.0,1998.0,0.0,0.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,144152.0
25%,382.0,60.0,3316.0,7.0,5.0,2003.0,2003.0,0.0,0.0,319.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,174000.0
50%,687.0,60.0,7500.0,7.0,5.0,2005.0,2006.0,222.0,0.0,600.0,...,0.0,102.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,205950.0
75%,978.0,160.0,9000.0,8.0,5.0,2007.0,2007.0,578.0,0.0,912.0,...,144.0,146.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,250000.0
max,1455.0,160.0,12552.0,10.0,6.0,2009.0,2009.0,1238.0,0.0,1632.0,...,216.0,364.0,0.0,0.0,198.0,0.0,0.0,12.0,2010.0,370878.0
