In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from multiprocessing import Pool
from operator import itemgetter
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import mixture
import matplotlib.pyplot as plt
from sklearn.mixture import GMM
import math
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPRegressor
import pdb

# Load some data
full_train = pd.read_csv('train.csv', squeeze=True)
test = pd.read_csv('test.csv', squeeze=True)
sample_submission = pd.read_csv('sample_submission.csv', squeeze=True)

In [6]:
# Exploratory Data Analysis
print(full_train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [10]:
full_train.apply(lambda col: col.isnull()).sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
LotFrontage       259
GarageCond         81
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageQual         81
BsmtExposure       38
BsmtFinType2       38
BsmtFinType1       37
BsmtCond           37
BsmtQual           37
MasVnrArea          8
MasVnrType          8
Electrical          1
Utilities           0
YearRemodAdd        0
MSSubClass          0
Foundation          0
ExterCond           0
ExterQual           0
Exterior2nd         0
Exterior1st         0
RoofMatl            0
RoofStyle           0
YearBuilt           0
                 ... 
GarageArea          0
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
3SsnPorch           0
BsmtUnfSF           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
Functional          0
TotRmsAbvGrd        0
KitchenQua

In [None]:
# Categorical NA
naCounts = full_train.isna().sum()
naVals = naCounts[naCounts>0].sort_values(ascending=False)
naVals

In [11]:
# NA Columns we are choosing to remove
naCols = [
"PoolQC",
"MiscFeature",
"Alley",
"Fence",
"FireplaceQu",
"LotFrontage",
"GarageYrBlt",
"GarageType",
"GarageFinish",
"GarageQual",
"GarageCond",
]

In [15]:
# Selecting Low Variety Categorical Columns
currentCounts = {}
for col in full_train.columns:
    currentCounts[col] = []
    for val in full_train[col]:
        if val not in currentCounts[col]:
            currentCounts[col].append(val)
for k in currentCounts:
    currentCounts[k] = len(currentCounts[k])
#print(np.sort(currentCounts.values()))
gen_columns = []
for k in currentCounts:
    if currentCounts[k] <= 1000:
        gen_columns.append(k)
gen_columns

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',

In [16]:
s_fields = [ c for c in [
    'MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'OverallQual',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Heating',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'Electrical',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'SaleType',
    'MSSubClass',
    'SaleCondition'
] if not c in naCols and c in gen_columns]

n_fields = [c for c in [
    'LotFrontage',
    'LotArea',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'YrSold',
    'MoSold',
    'MiscVal',
] if not c in naCols and c in gen_columns]

In [22]:
# Remove outliers greater than 3 standard deviations
new_train = full_train.copy()
to_drop = []
# new_train[np.abs(new_train - new_train.mean()) <= (3 * new_train.std())]
for r_idx, row in full_train.iterrows():
    for c_idx, val in enumerate(row):
        c_name = full_train.columns[c_idx]
        if c_name in n_fields:
            c_mean = full_train[c_name].mean()
            val_diff = np.abs(val - c_mean)
            c_std = full_train[c_name].std()
            if val_diff >= 3 * c_std:
                to_drop.append(r_idx)

train = new_train.drop(new_train.index[to_drop])
train.shape

(1056, 81)

In [23]:
# Set target and predictors
target = 'SalePrice'
predictors = [c for c in train.columns if not c == target and not c in naCols and c in gen_columns]

# Train/test split
X = train[predictors]
y = train[[target]]
X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=1)
X_test = test[predictors]

In [136]:
def make_csv(name, pred):
    new_df = test.copy()
    new_df['SalePrice'] = pred
    new_df[['Id', 'SalePrice']].to_csv(name, index=False)

In [24]:
def ensure_data_type(X):
    return X.apply(lambda col: col.astype(str))

def make_dictionaries(X):
    return X.to_dict(orient='records')

def select_categorical_features(X):
    return X[s_fields]

def select_numeric_features(X):
    return X[n_fields]

def log_num_fields(X):
    sf_fields = []
    for idx, field in enumerate(n_fields):
        if 'sf' in field.lower():
            sf_fields.append(field)
    X[sf_fields] = np.log(X[sf_fields])
    return X

numeric_selector = FunctionTransformer(select_numeric_features, validate=False)
imp = Imputer(strategy='mean')
lnf = FunctionTransformer(log_num_fields, validate=False)

scf = FunctionTransformer(select_categorical_features, validate=False)
ed = FunctionTransformer(ensure_data_type, validate=False)
md = FunctionTransformer(make_dictionaries, validate=False)
fh = FeatureHasher(n_features=2000)


numeric_feature_pipeline = Pipeline(steps=[
    ('select', numeric_selector),
    ('log_fields', lnf),
    ('impute', imp),
])

categorical_pipeline = Pipeline(steps=[
    ('select', scf),
    ('datatype', ed),
    ('dictionaries', md),
    ('vectorize', fh)
])
fu = FeatureUnion([
    ('categorical', categorical_pipeline),
    ('numeric', numeric_feature_pipeline),
])
fr = Pipeline([
    ('union', fu),
    ('reduction', TruncatedSVD(n_components=400)),
])
def root_mean_square_error(pred, actual):
    log_actual = np.log(actual)
    log_pred = np.log(pred)
    return np.sqrt(mean_squared_error(log_actual, log_pred))

In [92]:
# Ridge Log Sales Price
rid = Ridge()
search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [400],
    'predict__alpha': [10.0],
    'predict__fit_intercept': [False],
    'predict__solver': ["cholesky"],
    'predict__random_state': [1],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', rid)
])

round_num = 2
grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print(grid_search.best_params_)
print(grid_search.score(X_dev, np.log(y_dev).round(round_num)))
print(root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num)))
pred = grid_search.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

{'predict__alpha': 10.0, 'predict__fit_intercept': False, 'predict__random_state': 1, 'predict__solver': 'cholesky', 'preprocess__reduction__n_components': 400, 'preprocess__union__categorical__vectorize__n_features': 2000, 'preprocess__union__numeric__impute__strategy': 'mean'}




0.9132593229529966
0.00994574836147824


In [137]:
make_csv('logPrice_ridge.csv', np.exp(pred))

In [None]:
### ERROR ANALYSIS

In [50]:
df = np.log(y_dev).reset_index()
df['predicted'] = grid_search.predict(X_dev)
df["diff"] = np.abs(df["SalePrice"] - df['predicted'])
df.sort_values(by="diff",ascending=False).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,index,SalePrice,predicted,diff
231,632,11.320554,12.058541,0.737987
78,970,11.81303,11.251613,0.561417
70,1432,11.074421,11.542148,0.467728
252,874,11.104957,11.471947,0.36699
215,885,12.703509,12.387587,0.315922


In [83]:
dat = X_dev.reset_index()[(X_dev.index==632)|(X_dev.index==970)|(X_dev.index==1432)|(X_dev.index==874)|(X_dev.index==885)]
dat


Unnamed: 0,index,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
78,970,50,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,12,2006,WD,Abnorml
215,885,120,FV,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Somerst,...,65,0,0,0,0,0,1,2008,CWD,Abnorml
231,632,20,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,0,0,0,0,0,0,4,2009,WD,Family
252,874,50,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,0,0,0,0,0,0,8,2009,WD,Abnorml


In [87]:
full_train.MoSold.value_counts()

6     253
7     234
5     204
4     141
8     122
3     106
10     89
11     79
9      63
12     59
1      58
2      52
Name: MoSold, dtype: int64

In [62]:
y_dev.reset_index()[(y_dev.index==632)|(y_dev.index==970)|(y_dev.index==1432)|(y_dev.index==252)]

Unnamed: 0,index,SalePrice
70,1432,64500
78,970,135000
87,252,173000
231,632,82500


In [33]:
# Gradient Boosting

gb = GradientBoostingRegressor(subsample=.7, max_depth=6, learning_rate = .05, 
                               n_estimators=500, max_features='auto')

search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [100],
    'predict__n_estimators': [1000],
    'predict__max_depth': [6],
    'predict__learning_rate': [0.01],
    'predict__subsample': [0.7],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', gb)
])

grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print grid_search.best_params_
print grid_search.score(X_dev, np.log(y_dev).round(round_num))
print root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num))
gb_pred = grid_search.predict(X_test)

  """


{'predict__max_depth': 6, 'predict__n_estimators': 1000, 'preprocess__reduction__n_components': 100, 'predict__subsample': 0.7, 'preprocess__union__categorical__vectorize__n_features': 2000, 'preprocess__union__numeric__impute__strategy': 'mean', 'predict__learning_rate': 0.01}
0.8670425183204485
0.012181228207224424


In [94]:
import xgboost

xgb = XGBRegressor(objective='reg:linear')

search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [100],
    'predict__n_estimators': [1000],
    'predict__max_depth': [6],
    'predict__learning_rate': [0.01],
    'predict__subsample': [0.7],
    'predict__objective': ['reg:linear'],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', xgb)
])

grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print(grid_search.best_params_)
print(grid_search.score(X_dev, np.log(y_dev).round(round_num)))
print(root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num)))
xgb_pred = grid_search.predict(X_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

{'predict__learning_rate': 0.01, 'predict__max_depth': 6, 'predict__n_estimators': 1000, 'predict__objective': 'reg:linear', 'predict__subsample': 0.7, 'preprocess__reduction__n_components': 100, 'preprocess__union__categorical__vectorize__n_features': 2000, 'preprocess__union__numeric__impute__strategy': 'mean'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


0.8714921524492859
0.01201265774985521


In [134]:
np.exp((np.reshape(pred,(1459,))+xgb_pred)/2)

array([117541.24914814, 164334.42035963, 184211.86147573, ...,
       147221.095183  , 112900.56416834, 223065.25076742])

In [138]:
make_csv('logPrice_ridge+XGBTest.csv',np.exp((np.reshape(pred,(1459,))+xgb_pred)/2))

In [266]:
make_csv('logPrice_XGBTest.csv', np.exp(xgb_pred))

In [30]:
# Neural Networks

mlp = MLPRegressor()
search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [1000],
    'predict__learning_rate': ['constant'],
    'predict__random_state': [11],
    'predict__hidden_layer_sizes': [3],
    'predict__learning_rate': ['constant'],
    'predict__learning_rate_init': [.001],
    'predict__max_iter': [90],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', mlp)
])

grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train))
print(grid_search.best_params_)
print(grid_search.score(X_dev, np.log(y_dev).round(round_num)))
mlp_pred = grid_search.predict(X_test)

  """


{'preprocess__union__categorical__vectorize__n_features': 2000, 'predict__random_state': 11, 'predict__hidden_layer_sizes': 3, 'preprocess__reduction__n_components': 1000, 'preprocess__union__numeric__impute__strategy': 'mean', 'predict__learning_rate_init': 0.001, 'predict__max_iter': 90, 'predict__learning_rate': 'constant'}
0.8478798174800385


In [31]:
make_csv('logPrice_MLP.csv', np.exp(mlp_pred))

  """
