In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np
import scipy as sp

from IPython.display import display

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import add_dummy_feature, Normalizer, PolynomialFeatures, StandardScaler, OneHotEncoder, LabelEncoder, Imputer

from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import AgglomerativeClustering

import matplotlib.pyplot as plt
import seaborn as sns

import gc

In [4]:
train = pd.read_csv('D:/Kaggle/zillow data/train_2016_v2.csv')
properties = pd.read_csv('D:/Kaggle/zillow data/properties_2016.csv', low_memory=False)
df_properties = pd.merge(train, properties, on='parcelid', how='left')

del train; gc.collect()
# del properties; gc.collect()

7

In [5]:
class dataObject():
    def __init__(self, df):
        self.columns = df.columns
        self.vars = {
            'cat_vars': [
                'airconditioningtypeid',
                'architecturalstyletypeid',
                'bathroomcnt',
                'bedroomcnt',
                'buildingclasstypeid',
                'buildingqualitytypeid',
                'calculatedbathnbr',
                'decktypeid',
                'fireplacecnt',
                'fullbathcnt',
                'garagecarcnt',
                'fips',
                'hashottuborspa',
                'heatingorsystemtypeid',
                'poolcnt',
                'pooltypeid10',
                'pooltypeid2',
                'pooltypeid7',
                'propertycountylandusecode',
                'propertylandusetypeid',
                'propertyzoningdesc',
                'regionidcity',
                'regionidzip',
                'regionidneighborhood',
                'regionidcounty',
                'roomcnt',
                'storytypeid',
                'threequarterbathnbr',
                'typeconstructiontypeid',
                'unitcnt',
                'yearbuilt',
                'numberofstories',
                'fireplaceflag',
                'assessmentyear',
                'taxdelinquencyflag',
                'taxdelinquencyyear'
                
            ],
            'num_var': [
                'basementsqft',
                'finishedfloor1squarefeet',
                'calculatedfinishedsquarefeet',
                'finishedsquarefeet12',
                'finishedsquarefeet13',
                'finishedsquarefeet15',
                'finishedsquarefeet50',
                'finishedsquarefeet6',
                'garagetotalsqft',
                'lotsizesquarefeet',
                'poolsizesum',
                'rawcensustractandblock',
                'yardbuildingsqft17',
                'yardbuildingsqft26',
                'structuretaxvaluedollarcnt',
                'taxvaluedollarcnt',
                'landtaxvaluedollarcnt',
                'taxamount',
                'censustractandblock'
            ],
            'location': [
                'latitude',
                'longitude'
            ],
            'dep_var': 'logerror'}

In [6]:
df_data = dataObject(df_properties)

# Variables Cat/ Num

In [7]:
impute = Imputer(strategy='median')
onehotencoder = OneHotEncoder()

In [8]:
df_modeling = df_properties

In [9]:
mapper={}
df=pd.DataFrame(columns=df_modeling[df_data.vars['cat_vars']].columns)
for col in df_modeling[df_data.vars['cat_vars']]:
    mapper[col] = LabelEncoder()
    df[col] = mapper[col].fit_transform(df_modeling[col].apply(lambda x: str(x)))

In [10]:
dummies = pd.get_dummies(df_modeling[df_data.vars['cat_vars']])

In [11]:
train_dummy_cols = dummies.columns

In [12]:
df = onehotencoder.fit_transform(df)

In [13]:
df_cat_train, df_cat_test, df_num_train, df_num_test, df_y_train, df_y_test = train_test_split(df, df_modeling[df_data.vars['num_var']], df_modeling[['parcelid','logerror']], train_size=0.8)

In [14]:
df_num_train = impute.fit_transform(df_num_train)
df_num_test = impute.transform(df_num_test)

In [15]:
df_train = sp.sparse.hstack((df_num_train, df_cat_train))
df_test = sp.sparse.hstack((df_num_test, df_cat_test))

# Model

## RF

In [20]:
rf = RandomForestRegressor(n_estimators=100, max_depth=15, n_jobs=-1)

In [21]:
rf.fit(df_train, df_y_train['logerror'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [22]:
print('Train ', mean_absolute_error(df_y_train['logerror'], rf.predict(df_train)))

Train  0.0676099095336


In [24]:
print('Test ', mean_absolute_error(df_y_test['logerror'], rf.predict(df_test)))

Test  0.0674274689077


## GBT

In [16]:
gb = GradientBoostingRegressor()

In [17]:
gb.fit(df_train, df_y_train['logerror'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [18]:
print('Train ', mean_absolute_error(df_y_train['logerror'], gb.predict(df_train.toarray())))

Train  0.0678087949836


In [19]:
print('Test ', mean_absolute_error(df_y_test['logerror'], gb.predict(df_test.toarray())))

Test  0.0670288422892


## Final Scoring

In [25]:
properties = pd.read_csv('D:/Kaggle/zillow data/properties_2016.csv', low_memory=False)

In [39]:
test_dummy = pd.get_dummies(properties[df_data.vars['cat_vars']], sparse=True)

In [40]:
test_dummy = pd.SparseDataFrame(test_dummy[train_dummy_cols])

MemoryError: 

In [134]:
final_dummies = properties[df_data.vars['num_var']].isnull()*1.0
final_num = impute.transform(properties[df_data.vars['num_var']])
final = np.concatenate([final_num, final_dummies], axis=1)

In [137]:
col_header = ['ParcelId','201610','201611','201612','201710','201711','201712']

output = pd.DataFrame([properties['parcelid'], final_predictions, final_predictions, final_predictions,
                       final_predictions, final_predictions, final_predictions], index=col_header).T

output['ParcelId'] = output['ParcelId'].apply(lambda x: int(x))

In [154]:
output.to_csv('test2.csv', index=False)