In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

from IPython.display import display

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import add_dummy_feature, Normalizer, PolynomialFeatures, StandardScaler,\
                                    OneHotEncoder, LabelEncoder, Imputer, LabelBinarizer

from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import AgglomerativeClustering

from sklearn.externals import joblib

import matplotlib.pyplot as plt
import seaborn as sns

import functions

import gc

In [None]:
# Load train_2016_v2.csv and properties_2016.csv, then attach the property
# columns to every row of the training file (left join on parcelid).
train_data = pd.read_csv('D:/Kaggle/zillow data/train_2016_v2.csv')
properties_to_score = pd.read_csv(
    'D:/Kaggle/zillow data/properties_2016.csv',
    low_memory=False,
)
df_properties = train_data.merge(properties_to_score, how='left', on='parcelid')

# The un-merged training frame is not used again; release it.
del train_data
gc.collect()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed selection of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Used as ``X[attribute_names]`` inside :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as the array given by ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
class myLabelBinarizer(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame with its own ``LabelBinarizer``.

    Missing values are replaced by the string ``'nan'`` and all values are cast
    to ``str`` before encoding, so NaN becomes a category of its own.

    Parameters
    ----------
    prefix_sep : str, default '_'
        Separator placed between the column name and the class label when
        building the names stored in ``classes_``.
    sparse_output : bool, default False
        Forwarded to each ``LabelBinarizer``. When true, :meth:`transform`
        returns a SciPy CSR matrix instead of a dense array.

    Attributes
    ----------
    lbDict : dict
        Maps column name -> fitted ``LabelBinarizer`` (in fit order).
    classes_ : list of str
        ``<column><prefix_sep><label>`` for every label seen during ``fit``.
        NOTE(review): scikit-learn's ``LabelBinarizer`` emits a single column
        for a two-class input, so for such columns ``classes_`` lists more
        names than ``transform`` produces columns - confirm before using
        ``classes_`` as column headers.
    """

    def __init__(self, prefix_sep='_', sparse_output=False):
        self.prefix_sep = prefix_sep
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Fit one ``LabelBinarizer`` per column of ``X``.

        ``y`` is accepted (and ignored) so the transformer can be used as a
        step of a scikit-learn ``Pipeline`` that passes targets through.
        """
        self.lbDict = {}
        self.classes_ = []
        for var in X:
            lb = LabelBinarizer(sparse_output=self.sparse_output)
            lb.fit(self._as_labels(X[var]))
            self.lbDict[var] = lb
            self.classes_ += [var + self.prefix_sep + label for label in lb.classes_]
        return self

    def transform(self, X):
        """Return the horizontally stacked encodings of all fitted columns.

        The result has ``X.shape[0]`` rows. It is a float array when
        ``sparse_output`` is false and a CSR matrix otherwise.
        """
        n_rows = X.shape[0]
        blocks = [
            lb.transform(self._as_labels(X[var]))
            for var, lb in self.lbDict.items()
        ]

        if self.sparse_output:
            # Sparse blocks cannot be joined with NumPy; use SciPy's stacker.
            from scipy import sparse
            if not blocks:
                return sparse.csr_matrix((n_rows, 0))
            return sparse.hstack(blocks, format='csr')

        # Start from a zero-width float block: unlike the previous
        # ``np.empty((n, 1))`` seed this adds no uninitialised leading column,
        # while keeping the float result dtype and handling "no columns".
        return np.hstack([np.empty((n_rows, 0))] + blocks)

    @staticmethod
    def _as_labels(column):
        """Cast a column to the string labels used for fitting and transforming."""
        return column.fillna('nan').astype('str')

In [None]:
class df_data:
    """Column bookkeeping for the merged property table.

    ``columns`` keeps the column index of the frame passed in. ``vars`` groups
    column names by role (``cat_vars``, ``num_var``, ``location``) and names
    the dependent variable under ``dep_var``.
    """

    def __init__(self, df):
        self.columns = df.columns

        # Columns treated as categorical.
        categorical = [
            'airconditioningtypeid',
            'architecturalstyletypeid',
            'bathroomcnt',
            'bedroomcnt',
            'buildingclasstypeid',
            'buildingqualitytypeid',
            'calculatedbathnbr',
            'decktypeid',
            'fireplacecnt',
            'fullbathcnt',
            'garagecarcnt',
            'fips',
            'hashottuborspa',
            'heatingorsystemtypeid',
            'poolcnt',
            'pooltypeid10',
            'pooltypeid2',
            'pooltypeid7',
            'propertycountylandusecode',
            'propertylandusetypeid',
            'propertyzoningdesc',
            'regionidcity',
            'regionidzip',
            'regionidneighborhood',
            'regionidcounty',
            'roomcnt',
            'storytypeid',
            'threequarterbathnbr',
            'typeconstructiontypeid',
            'unitcnt',
            'yearbuilt',
            'numberofstories',
            'fireplaceflag',
            'assessmentyear',
            'taxdelinquencyflag',
            'taxdelinquencyyear',
        ]

        # Columns treated as numeric.
        numerical = [
            'basementsqft',
            'finishedfloor1squarefeet',
            'calculatedfinishedsquarefeet',
            'finishedsquarefeet12',
            'finishedsquarefeet13',
            'finishedsquarefeet15',
            'finishedsquarefeet50',
            'finishedsquarefeet6',
            'garagetotalsqft',
            'lotsizesquarefeet',
            'poolsizesum',
            'rawcensustractandblock',
            'yardbuildingsqft17',
            'yardbuildingsqft26',
            'structuretaxvaluedollarcnt',
            'taxvaluedollarcnt',
            'landtaxvaluedollarcnt',
            'taxamount',
            'censustractandblock',
        ]

        coordinates = ['latitude', 'longitude']

        self.vars = {
            'cat_vars': categorical,
            'num_var': numerical,
            'location': coordinates,
            'dep_var': 'logerror',
        }

In [None]:
# Build the column-group container. The instance deliberately takes over the
# class's name because later cells read `df_data.vars`. The guard keeps this
# cell re-runnable: without it a second run would try to call the instance and
# raise TypeError.
if isinstance(df_data, type):
    df_data = df_data(df_properties)

# Categorical Vars

## Add Dummy Variables

In [None]:
# Dummy-encode the categorical column group into a sparse-backed frame.
# NOTE(review): with no `columns=` argument pandas only expands object/category
# columns; categorical ids that were read as numbers pass through unchanged.
categorical_frame = df_properties[df_data.vars['cat_vars']]
transformed = pd.get_dummies(categorical_frame, sparse=True)
transformed.head()

## Add Additional Categorical Variables

In [None]:
# Empty container; the missing-value indicator columns are added to it by a later cell.
engineered_features = pd.DataFrame([])

# Numerical Vars

## Feature Engineering

In [None]:
# One 0/1 indicator per numeric column marking where the value is missing,
# stored in engineered_features under "<column>_nan".
null_flags = df_properties[df_data.vars['num_var']].isnull() * 1
flag_names = [name + '_nan' for name in null_flags.columns]
engineered_features[flag_names] = null_flags

# The temporaries are not needed once the columns have been copied over.
del null_flags, flag_names
gc.collect()

In [None]:
# Hand-crafted columns, added to df_properties in place.
# NOTE(review): none of these columns is part of the modeling matrices built in
# the visible cells (those use only the dummy-encoded categoricals, the *_nan
# indicators and the raw numeric columns).
df_properties['location'] = df_properties['latitude'] + df_properties['longitude']
df_properties['location2'] = df_properties['latitude'] * df_properties['longitude']

df_properties['diffFinUnfin'] = df_properties['finishedsquarefeet12'] - df_properties['finishedsquarefeet6']
df_properties['totalRooms'] = df_properties['bathroomcnt'] + df_properties['bedroomcnt']

# Absolute gap between the two finished-area columns. The signed difference was
# previously assigned to the same column one line earlier and immediately
# overwritten; that dead assignment is removed.
df_properties['finishedsquarefeetError'] = (
    df_properties['calculatedfinishedsquarefeet'] - df_properties['finishedsquarefeet12']
).abs()

# Average unit size = area / number of units. The operands used to be the other
# way round (units per square foot). Zero unit counts are mapped to NaN so the
# division cannot produce +/-inf.
unit_count = df_properties['unitcnt'].replace(0, np.nan)
df_properties['avgUnitSizeCalc'] = df_properties['calculatedfinishedsquarefeet'] / unit_count
df_properties['avgUnitSize'] = df_properties['finishedsquarefeet12'] / unit_count
del unit_count

df_properties['tanfinishedsqarefeetError'] = np.tanh(
    df_properties['calculatedfinishedsquarefeet'] / df_properties['finishedsquarefeet12']
)

# Merge Data

In [None]:
# Column layout of the modeling matrix, left to right:
# dummy-encoded categoricals | *_nan indicators | raw numeric columns.
df_modeling = np.hstack([
    transformed,
    engineered_features,
    df_properties[df_data.vars['num_var']],
])

## Imputing

In [None]:
# Median imputer for missing values; it is fitted on the training split in a later cell.
imputer = Imputer(strategy='median')

In [None]:
# Hold out 20% of the rows. A fixed random_state makes the split, and every
# metric computed from it, reproducible across kernel restarts.
df_train, df_test, df_y_train, df_y_test = train_test_split(
    df_modeling,
    df_properties[['parcelid', 'logerror']],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the per-column medians from the training split only, then fill the
# gaps in both splits with those medians.
imputer.fit(df_train)
df_train = imputer.transform(df_train)
df_test = imputer.transform(df_test)

In [None]:
# The unsplit matrix is no longer needed once the train/test pieces exist;
# drop the name and trigger a garbage-collection pass.
del df_modeling
gc.collect()

# Model

## RF

In [None]:
from sklearn.model_selection import KFold

In [None]:
# K-fold splitter with default settings.
# NOTE(review): `kf` is not referenced by any later cell in this notebook.
kf = KFold()

In [None]:
# Random-forest baseline on the full feature matrix; the target is the
# 'logerror' column (the second column of the y frames).
rf = RandomForestRegressor(
    n_estimators=100,
    criterion='mse',
    max_depth=8,
    n_jobs=-1,
)
rf.fit(df_train, df_y_train['logerror'])

In [None]:
# Mean absolute error of the random forest on both splits.
rf_train_mae = mean_absolute_error(df_y_train['logerror'], rf.predict(df_train))
rf_test_mae = mean_absolute_error(df_y_test['logerror'], rf.predict(df_test))
print('Train', rf_train_mae)
print('Test', rf_test_mae)

In [None]:
# Feature labels must follow the column layout of the matrix `rf` was trained
# on (dummy columns | *_nan indicators | numeric columns). Leaving out
# `engineered_features` made this index shorter than rf.feature_importances_,
# so building the importance table failed with a length mismatch.
allVariables = pd.concat(
    [transformed, engineered_features, df_properties[df_data.vars['num_var']]],
    axis=1,
).columns

In [None]:
# Horizontal bar chart of the 20 largest random-forest feature importances.
rf_importance = pd.DataFrame(
    rf.feature_importances_,
    columns=['variables'],
    index=allVariables,
)
rf_top20 = rf_importance.sort_values('variables', ascending=False).head(20)
rf_top20.plot(kind='barh', figsize=(13, 7.5))

## GBT

In [None]:
# Small gradient-boosting model on the full feature matrix; its importances
# are used further down to pick the top-20 features.
gb = GradientBoostingRegressor(
    loss='lad',
    learning_rate=0.1,
    n_estimators=10,
    max_depth=5,
)

gb.fit(df_train, df_y_train['logerror'])

In [None]:
# Mean absolute error of the gradient-boosting model on both splits.
gb_train_mae = mean_absolute_error(df_y_train['logerror'], gb.predict(df_train))
gb_test_mae = mean_absolute_error(df_y_test['logerror'], gb.predict(df_test))
print('Train ', gb_train_mae)
print('Test ', gb_test_mae)

In [None]:
# Label the gradient-boosting importances with the column names of the
# modeling matrix, rank them, and plot the 20 largest.
allVariables = pd.concat(
    [transformed, engineered_features, df_properties[df_data.vars['num_var']]],
    axis=1,
).columns
feature_importance = (
    pd.DataFrame(gb.feature_importances_, columns=['variables'], index=allVariables)
    .sort_values('variables', ascending=False)
)
feature_importance.head(20).plot(kind='barh', figsize=(13, 7.5))

# Model with top 20 features

In [None]:
# Display the labels of the 20 highest-ranked features.
feature_importance.head(20).index

In [None]:
# Rebuild the labelled feature frame and keep only the 20 highest-ranked
# columns, in ranking order.
top20_features = feature_importance.head(20).index
df_modeling = pd.concat(
    [transformed, engineered_features, df_properties[df_data.vars['num_var']]],
    axis=1,
)[top20_features]

## Imputing

In [None]:
# Fresh median imputer for the reduced (top-20) feature matrix; this replaces
# the imputer that was fitted on the full matrix.
imputer = Imputer(strategy='median')

In [None]:
# Hold out 20% of the rows of the reduced matrix. A fixed random_state makes
# the split, and every metric computed from it, reproducible across restarts.
df_train, df_test, df_y_train, df_y_test = train_test_split(
    df_modeling,
    df_properties[['parcelid', 'logerror']],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the medians on the training split only, then fill both splits.
imputer.fit(df_train)
df_train = imputer.transform(df_train)
df_test = imputer.transform(df_test)

In [None]:
# The unsplit reduced matrix is no longer needed; drop the name and trigger a
# garbage-collection pass.
del df_modeling
gc.collect()

In [None]:
# Larger, slower-learning gradient-boosting model trained on the top-20
# feature matrix.
gb = GradientBoostingRegressor(
    loss='lad',
    learning_rate=0.01,
    n_estimators=500,
    max_depth=8,
)

gb.fit(df_train, df_y_train['logerror'])

In [None]:
# Mean absolute error of the top-20 model on both splits.
gb_train_mae = mean_absolute_error(df_y_train['logerror'], gb.predict(df_train))
gb_test_mae = mean_absolute_error(df_y_test['logerror'], gb.predict(df_test))
print('Train ', gb_train_mae)
print('Test ', gb_test_mae)

In [None]:
# Same evaluation as the preceding cell (the cell is duplicated in the notebook).
gb_train_mae = mean_absolute_error(df_y_train['logerror'], gb.predict(df_train))
gb_test_mae = mean_absolute_error(df_y_test['logerror'], gb.predict(df_test))
print('Train ', gb_train_mae)
print('Test ', gb_test_mae)

In [None]:
# Same evaluation as the two preceding cells (the cell is duplicated in the notebook).
gb_train_mae = mean_absolute_error(df_y_train['logerror'], gb.predict(df_train))
gb_test_mae = mean_absolute_error(df_y_test['logerror'], gb.predict(df_test))
print('Train ', gb_train_mae)
print('Test ', gb_test_mae)

In [None]:
# Histogram (log-scaled counts) of the test-set residuals: actual minus predicted.
residuals = df_y_test['logerror'] - gb.predict(df_test)
residuals.plot(kind='hist', bins=25, log=True)

In [None]:
# The top-20 model was trained on the columns `feature_importance.head(20).index`
# in exactly that order, so those labels line up with gb.feature_importances_.
allVariables = feature_importance.head(20).index

# Keep the refit ranking under its own name. Overwriting `feature_importance`
# here re-sorted the table, so running this cell a second time attached the
# labels to the wrong importances, and later cells lost the training column order.
feature_importance_top20 = (
    pd.DataFrame(gb.feature_importances_, columns=['variables'], index=allVariables)
    .sort_values('variables', ascending=False)
)
feature_importance_top20.plot(kind='barh', figsize=(13, 7.5))

### Save

In [None]:
# Persist the fitted model to gb_model.pkl in the current working directory.
joblib.dump(gb, 'gb_model.pkl')

In [None]:
# Reload the model written by the previous cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load files you created or trust.
gb = joblib.load('gb_model.pkl')

## Final Model

# GridSearch

In [None]:
from sklearn.metrics import make_scorer

In [None]:
# Grid search over random-forest depth, scored by (negated) mean absolute
# error so that larger scores are better.
mae = make_scorer(mean_absolute_error, greater_is_better=False)
param_grid = dict(n_estimators=[8], max_depth=[5, 10])
gs = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1),
    param_grid=param_grid,
    scoring=mae,
    cv=3,
    n_jobs=2,
    verbose=10,
)

In [None]:
# `df_train` is whatever imputer.fit_transform returned. For the dense input
# used in this notebook that is a NumPy array, which has no `.toarray()`;
# densify only when the matrix really is sparse.
X_grid = df_train.toarray() if hasattr(df_train, 'toarray') else df_train
gs.fit(X_grid, df_y_train['logerror'])

In [None]:
# Display the estimator that the grid search refit with the best parameters.
gs.best_estimator_

## Final Scoring

In [None]:
def scoreData():
    """Stream properties_2016.csv in chunks and predict each chunk with ``gb``.

    NOTE(review): this definition is replaced by the ``scoreData`` defined in
    the next cell before any visible call, so it never runs as written.
    NOTE(review): the matrix built here holds only the dummy columns and the raw
    numeric columns, whereas the training matrices also contain the ``*_nan``
    indicator columns - the column count passed to ``imputer`` will not match.
    NOTE(review): the loop reads the dependent-variable column ('logerror')
    from the properties file; confirm that column exists there.
    """
    final_scores = pd.DataFrame()
    chunk_size = 10000
    print('Begin')
    for chunk in pd.read_csv('D:/Kaggle/zillow data/properties_2016.csv', chunksize=chunk_size, low_memory=False):
        # Prepare chunk for scoring
#         cat_vars = lb.transform(chunk[df_data.vars['cat_vars']])
        cat_vars = pd.get_dummies(chunk[df_data.vars['cat_vars']])
        # Align the chunk's dummy columns with the training dummy columns;
        # categories absent from this chunk become all-zero columns.
        cat_vars = cat_vars.reindex(columns=transformed.columns, fill_value=0)
        # Shape diagnostics.
        print(chunk[df_data.vars['cat_vars']].shape)
        print(cat_vars.shape)
        print(chunk[df_data.vars['num_var']].shape)
        # Dummy columns followed by numeric columns, then median imputation.
        all_vars = imputer.transform(np.append(np.array(cat_vars), chunk[df_data.vars['num_var']], axis=1))
        print(all_vars.shape)
        del cat_vars
        gc.collect()
    #     # Score chunk
        scores = gb.predict(all_vars)
        # NOTE(review): growing a DataFrame with append inside the loop copies
        # the accumulated frame on every iteration.
        final_scores = final_scores.append([chunk[df_data.vars['dep_var']], scores])
    return final_scores

In [None]:
def scoreData(feature_names=None, chunk_size=500000):
    """Score every row of properties_2016.csv with the fitted ``gb`` model.

    The CSV is streamed in chunks. For each chunk the feature frame is rebuilt
    the same way the training frame was (dummy-encoded categoricals, ``*_nan``
    missing-value indicators, raw numeric columns), reduced to ``feature_names``
    in exactly that order, imputed with the fitted ``imputer`` and passed to
    ``gb.predict``.

    The previous implementation dropped the ``*_nan`` indicators, dropped
    object-typed categoricals (it compared raw column names with dummy names),
    and laid the columns out in a different order from the training matrix.

    Parameters
    ----------
    feature_names : sequence of str, optional
        Column labels in the order the model was trained on. Defaults to the
        notebook-level ``allVariables``, which must hold the training column
        order of the model currently bound to ``gb``.
    chunk_size : int, default 500000
        Number of CSV rows read per iteration.

    Returns
    -------
    (final_scores, final_id) : tuple of lists
        One prediction array and one ``parcelid`` Series per chunk.
    """
    if feature_names is None:
        feature_names = allVariables
    feature_names = list(feature_names)

    cat_columns = df_data.vars['cat_vars']
    num_columns = df_data.vars['num_var']

    # Only encode raw categoricals that can contribute a requested feature,
    # either as the column itself or as one of its "<column>_<value>" dummies.
    # This keeps the per-chunk dummy frame small.
    needed_cat = [
        var for var in cat_columns
        if any(name == var or name.startswith(var + '_') for name in feature_names)
    ]

    final_scores = []
    final_id = []
    print('Begin')
    for chunk in pd.read_csv('D:/Kaggle/zillow data/properties_2016.csv', chunksize=chunk_size, low_memory=False):
        # Prepare chunk for scoring
        parts = []
        if needed_cat:
            parts.append(pd.get_dummies(chunk[needed_cat]))
        parts.append((chunk[num_columns].isnull() * 1).add_suffix('_nan'))
        parts.append(chunk[num_columns])

        # Select the training columns in training order; dummy columns whose
        # category does not occur in this chunk are created as all zeros.
        features = pd.concat(parts, axis=1).reindex(columns=feature_names, fill_value=0)

        all_vars = imputer.transform(features.astype('float64').values)
        print(all_vars.shape)
        del parts, features
        gc.collect()

        # Score chunk
        print('Scoring....')
        scores = gb.predict(all_vars)
        final_scores.append(scores)
        final_id.append(chunk['parcelid'])
    return final_scores, final_id

In [None]:
# Run the scoring pass over the whole property file (this calls the most
# recently defined scoreData).
final_scores, final_id = scoreData()

In [None]:
# Join the per-chunk results into one flat array of scores and one of ids.
final_scores = np.hstack(final_scores)
final_id = np.hstack(final_id)

In [None]:
# Submission layout: the parcel id followed by six period columns that all
# carry the same prediction.
col_header = ['ParcelId','201610','201611','201612','201710','201711','201712']

submission_columns = {'ParcelId': final_id}
submission_columns.update({period: final_scores for period in col_header[1:]})
output = pd.DataFrame(submission_columns, columns=col_header)

# Ids are written as integers.
output['ParcelId'] = output['ParcelId'].astype('int64')

In [None]:
# Write the submission table to test4.csv in the working directory, without the row index.
output.to_csv('test4.csv', index=False)