In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

train_path = './train.csv'
train_data = pd.read_csv(train_path)
test_path = './test.csv'
test_data = pd.read_csv(test_path)

In [3]:
# Looking at general aspects of datasets
# print(train_data.head)
for col in train_data.columns:
    print("%s: %s" %(col, train_data[col].dtypes))

Id: int64
MSSubClass: int64
MSZoning: object
LotFrontage: float64
LotArea: int64
Street: object
Alley: object
LotShape: object
LandContour: object
Utilities: object
LotConfig: object
LandSlope: object
Neighborhood: object
Condition1: object
Condition2: object
BldgType: object
HouseStyle: object
OverallQual: int64
OverallCond: int64
YearBuilt: int64
YearRemodAdd: int64
RoofStyle: object
RoofMatl: object
Exterior1st: object
Exterior2nd: object
MasVnrType: object
MasVnrArea: float64
ExterQual: object
ExterCond: object
Foundation: object
BsmtQual: object
BsmtCond: object
BsmtExposure: object
BsmtFinType1: object
BsmtFinSF1: int64
BsmtFinType2: object
BsmtFinSF2: int64
BsmtUnfSF: int64
TotalBsmtSF: int64
Heating: object
HeatingQC: object
CentralAir: object
Electrical: object
1stFlrSF: int64
2ndFlrSF: int64
LowQualFinSF: int64
GrLivArea: int64
BsmtFullBath: int64
BsmtHalfBath: int64
FullBath: int64
HalfBath: int64
BedroomAbvGr: int64
KitchenAbvGr: int64
KitchenQual: object
TotRmsAbvGrd: int6

In [4]:
# Columns with NaN's

print('-----------training data-----------')
for col in train_data.columns:
    if (train_data[col].isnull().values.any()):
        print(col, train_data[col].dtypes)
print('-----------testing data-----------')
for col in test_data.columns:
    if (test_data[col].isnull().values.any()):
        print(col, test_data[col].dtypes)

-----------training data-----------
LotFrontage float64
Alley object
MasVnrType object
MasVnrArea float64
BsmtQual object
BsmtCond object
BsmtExposure object
BsmtFinType1 object
BsmtFinType2 object
Electrical object
FireplaceQu object
GarageType object
GarageYrBlt float64
GarageFinish object
GarageQual object
GarageCond object
PoolQC object
Fence object
MiscFeature object
-----------testing data-----------
MSZoning object
LotFrontage float64
Alley object
Utilities object
Exterior1st object
Exterior2nd object
MasVnrType object
MasVnrArea float64
BsmtQual object
BsmtCond object
BsmtExposure object
BsmtFinType1 object
BsmtFinSF1 float64
BsmtFinType2 object
BsmtFinSF2 float64
BsmtUnfSF float64
TotalBsmtSF float64
BsmtFullBath float64
BsmtHalfBath float64
KitchenQual object
Functional object
FireplaceQu object
GarageType object
GarageYrBlt float64
GarageFinish object
GarageCars float64
GarageArea float64
GarageQual object
GarageCond object
PoolQC object
Fence object
MiscFeature object
SaleT

In [36]:
# Looking for trends between each variable and Sale Price
for x in train_data.columns:
    print(train_data[[x, 'SalePrice']].groupby([x], as_index=False).mean())

        Id  SalePrice
0        1     208500
1        2     181500
2        3     223500
3        4     140000
4        5     250000
5        6     143000
6        7     307000
7        8     200000
8        9     129900
9       10     118000
10      11     129500
11      12     345000
12      13     144000
13      14     279500
14      15     157000
15      16     132000
16      17     149000
17      18      90000
18      19     159000
19      20     139000
20      21     325300
21      22     139400
22      23     230000
23      24     129900
24      25     154000
25      26     256300
26      27     134800
27      28     306000
28      29     207500
29      30      68500
...    ...        ...
1430  1431     192140
1431  1432     143750
1432  1433      64500
1433  1434     186500
1434  1435     160000
1435  1436     174000
1436  1437     120500
1437  1438     394617
1438  1439     149700
1439  1440     197000
1440  1441     191000
1441  1442     149300
1442  1443     310000
1443  1444

     1stFlrSF      SalePrice
0         334   39300.000000
1         372   55000.000000
2         438   60000.000000
3         480   35311.000000
4         483   96628.571429
5         495  198500.000000
6         520  119110.400000
7         525   88000.000000
8         526   91000.000000
9         536   85000.000000
10        546   82166.666667
11        551  148800.000000
12        561  124000.000000
13        572   97950.000000
14        575  155000.000000
15        576  139000.000000
16        581  101000.000000
17        596   78000.000000
18        600   94450.000000
19        605   86000.000000
20        612  169116.500000
21        616  131186.400000
22        624  129000.000000
23        625  168300.000000
24        626  120000.000000
25        630   92944.444444
26        649   40000.000000
27        658   81000.000000
28        660  158500.000000
29        661  139000.000000
..        ...            ...
723      2073  210000.000000
724      2076  465000.000000
725      2084 

ValueError: Grouper for 'SalePrice' not 1-dimensional

In [5]:
# Imputing for NaN's
full_datasets = [train_data, test_data]

## Imputing Alley and Engineering Street+Alley Feature
street_alley_mapping = {'Grvl': 0, 'X': 0.5, 'Pave': 1}
for dataset in full_datasets:
    dataset['Alley'] = dataset['Alley'].fillna('X')
    dataset['Street'] = dataset['Street'].map(street_alley_mapping).astype(float)
    dataset['Alley'] = dataset['Alley'].map(street_alley_mapping).astype(float)
    dataset['Street+Alley'] = dataset['Street'] + dataset['Alley']

predictors = ['LotFrontage', 'Street+Alley']