In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import operator
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import random
import datetime as dt

def K_NN_Residuals(data, x,k = 5,n1 = 'geocode_latitude',n2 = 'geocode_longitude',target = 'res'):
    tmp_arr = data[[n1,n2]]-np.array([x[n1],x[n2]])
    tmp_arr = np.square(tmp_arr).sum(axis = 1)
    ranks = tmp_arr.rank()
    return np.sum(data[ranks<=k])[0]


def model_fill(data,model,target):
    
    truth_val = pd.isna(data[target])
    test = data[truth_val]
    train = data[~truth_val]
    if test.shape[0] == 0 or train.shape[0] == 0:
        return data
    train = train.fillna(0)
    test  = test.fillna(0)
    model.fit(train.drop(target,axis = 1),train[target])
    data.loc[truth_val,target] = model.predict(test.drop(target,axis = 1))
    
    return data

def make_one_hot_encoding(data,number_of_top_values,target):
    top_names = list(dict(data[target].value_counts()).keys())[0:number_of_top_values]
    for name in top_names:
        data[name] = 0
        data.loc[data[target] == name, name] = 1
    data.drop([target],axis = 1, inplace = True)
    return data

def change_style_attribute(x):
    if x.isdigit():
        return x + ".0"
    else:
        return x
    
def transform_date(data,target):
    data.loc[:, target + '_int'] = pd.to_datetime(data[target]).dt.strftime("%Y%m%d").astype(int)
    data.drop(target,axis = 1, inplace = True)
    return data

def make_ordinal(data,target,ordered_list):
    dict_ordered = {}
    i = len(ordered_list)
    
    name_mod = target + '_ordinal'
    data[name_mod] = 0
    for name in ordered_list:
        data.loc[data[target] == name, name_mod] = i
        i -= 1
    data.drop(target,axis = 1, inplace = True)
    return data



In [2]:
# Import Data
train = pd.read_csv('../data/datathon_propattributes.csv').sample(frac = 1e-1)

  interactivity=interactivity, compiler=compiler, result=result)


KeyboardInterrupt: 

In [None]:
dropped_cols = ['irregular_lot_flg','prop_house_number_suffix','apn',
                'IsTraining',
                'fips_cd',
                'prop_house_number',
                'prop_house_number_2'  ,
                'prop_house_number_suffix' ,
                'prop_direction_left'  ,
                'prop_street_name'  ,     
                'prop_suffix'  ,
                'prop_direction_right' ,
                'prop_unit_number' ,
                'prop_city' ,
                'prop_state',
                'prop_zip_code' ,
                'prop_zip_plus_4',
               'census_tract',
               'irregular_lot_flg',
               'tax_cd_area',
                'tax_year']

one_hots = [['dwelling_type',10],
            ['prop_unit_type',4], # maybe drop
            ['zoning',15],
            ['roof_type',9],
            ['roof_cover',13],
            ['garage_type',14],
            ['construction_type',13],
            ['basement_cd',7],
            ['style',20],
            ['stories_cd',5],
            ['mobile_home_ind',1],
            ['timeshare_ind',1],
            ['distressed_sale_flg',1]]

date_cols = ['transaction_date']


condition= ['Excellent','Good','Average','Fair','Poor','Unsound']
Construction=['A+','A','A-','B+','B','B-','C+','C','C-','D+','D','D-','E+','E','E-']
Air_conditioning=['Central','Refrigeration','Chilled Water','Geo-Thermal','Packaged Unit','Wall','Window\\Unit','Evaporative Cooler','Yes','Ventilation','Partial','Other','None']
Heating_Type= ['Central','Zone','Geo-thermal','Solar','Forced air unit','Heat Pump','Hot Water','Electric','Steam','Floor/Wall','Space/Suspended','Baseboard','Radiant','Propane','Gas','Oil','Coal','Gravity','Other','Wood Burning','Yes','Vent','None']

ordinals = [['condition',condition],
            ['construction_quality',Construction],
            ['air_conditioning',Air_conditioning],
            ['heating_type',Heating_Type]]


candidates_fill = ['assessed_total_value',
                   'assessed_land_value',
                   'assessed_improvement_value',
                   'market_total_value',
                   'market_land_value',
                   'market_improvement_value',
                   'tax_amt',
                    'avm_final_value0',
                    'avm_std_deviation0',
                    'avm_final_value1',
                    'avm_std_deviation1',
                    'avm_final_value2'   ,            
                    'avm_std_deviation2',
                    'avm_final_value3'   ,            
                    'avm_std_deviation3',
                    'avm_final_value4',
                    'avm_std_deviation4' ]

# Transform attributes
train['style'] = train['style'].astype(str)
train['style'] = train['style'].apply(change_style_attribute)
train['stories_cd'] = train['stories_cd'].astype(str)

##CONVERT STRINGS TO NUMERICAL VARIABLES##
for target_column, number_of_top_names in one_hots:
    make_one_hot_encoding(train,number_of_top_names,target_column)
    
for target_column, ordered_names in ordinals:
    make_ordinal(train,target_column,ordered_names)
    
for target_column in date_cols:
    transform_date(train,target_column)

In [None]:
#train.apply(pd.to_numeric)

##FILL MISSING DATA IN SELECT COLUMNS##
fill_data = train.drop(dropped_cols.extend(['sale_amt','geocode_longitude','geocode_latitude']),axis = 1)
for candidate in candidates_fill:
    model_fill(fill_data, RandomForestRegressor(), candidate)
    
test = train[train['IsTraining'] != 1]
train = train[train['IsTraining'] == 1]
train.drop(dropped_cols,axis =1 , inplace = True)
del fill_data

gc.collect()

In [None]:
# which columns contain NaN's
train.fillna(method='bfill',inplace = True)
print(train.isna().any())


In [6]:
# Model Training



##  XGBoost   ##

x_train_first_stage = train.drop(['sale_amt','geocode_longitude','geocode_latitude'],axis = 1)
first_stage = train[['sale_amt','geocode_longitude','geocode_latitude']]


##### RUN XGBOOST

# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': np.mean(first_stage.sale_amt),
    'silent': 1
}

dtrain_first_stage = xgb.DMatrix(x_train_first_stage, first_stage.sale_amt)
num_boost_rounds = 250
# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain_first_stage, num_boost_round=num_boost_rounds)

first_stage['pred'] = model.predict(dtrain_first_stage)
first_stage['res'] = first_stage['pred'] - first_stage['sale_amt']

demo_sample = first_stage.sample(frac = 1e-3)
apply_knn = lambda x : K_NN_Residuals(demo_sample,x, k = 5)
train['KNN'] = first_stage.apply(apply_knn,axis = 1)
test['KNN'] = test.apply(apply_knn,axis = 1)

test.drop(['geocode_longitude','geocode_latitude'],axis = 1,inplace = True)
train.drop(['geocode_longitude','geocode_latitude'],axis = 1,inplace = True)
del first_stage
del dtrain_first_stage
del model
del x_train_first_stage
gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/aaell/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-27f3eeabb4c0>", line 35, in <module>
    train['KNN'] = first_stage.apply(apply_knn,axis = 1)
  File "/Users/aaell/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 6014, in apply
    return op.get_result()
  File "/Users/aaell/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py", line 142, in get_result
    return self.apply_standard()
  File "/Users/aaell/anaconda3/lib/python3.7/site-packages/pandas/core/apply.py", line 242, in apply_standard
    labels=labels)
  File "pandas/_libs/reduction.pyx", line 637, in pandas._libs.reduction.reduce
  File "pandas/_libs/reduction.pyx", line 149, in pandas._libs.reduction.Reducer.get_result
  File "<ipython-input-6-27f3eeabb4c0>", line 34, in <lambda>
    apply_knn = lambda x : K_NN_Residuals(fi

KeyboardInterrupt: 

In [None]:
x_train = train.drop(['sale_amt'],axis = 1)
y_train = train.sale_amt

x_test = test.drop(['sale_amt'],axis = 1)
y_test = test.sale_amt
#Create Location Param#
##### RUN XGBOOST AGAIN

# xgboost params
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': np.mean(y_train),
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

num_boost_rounds = 150

model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
xgb_pred = model.predict(dtest)

In [None]:
def MAE(y, ypred):
    #logerror=log(Zestimate)−log(SalePrice)
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)

print(MAE(y_test, xgb_pred))



In [None]:


##### COMBINE XGBOOST RESULTS
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2

del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2 
gc.collect()


##    OLS     ##

np.random.seed(17)
random.seed(17)

train = pd.read_csv("../input/train_2016_v2.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../input/properties_2016.csv")
submission = pd.read_csv("../input/sample_submission.csv")
print(len(train),len(properties),len(submission))

def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df



train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])

reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = [];  y = [] #memory

test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

# Parameters
XGB_WEIGHT = 0.6000
BASELINE_WEIGHT = 0.0000
OLS_WEIGHT = 0.0600
XGB1_WEIGHT = 0.8000





##### COMBINE PREDICTIONS

print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - OLS_WEIGHT 
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
pred0 = xgb_weight0*xgb_pred + baseline_weight0*BASELINE_PRED + lgb_weight0*p_test

print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )

print( "\nPredicting with OLS and combining with XGB/LGB/baseline predicitons: ..." )
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)

In [8]:
# Model Inference
train.dtypes

fips_cd                         int64
apn                            object
IsTraining                      int64
prop_house_number              object
prop_house_number_2            object
prop_house_number_suffix      float64
prop_direction_left            object
prop_street_name               object
prop_suffix                    object
prop_direction_right           object
prop_unit_type                 object
prop_unit_number               object
prop_city                      object
prop_state                     object
prop_zip_code                 float64
prop_zip_plus_4               float64
dwelling_type                  object
zoning                         object
census_tract                  float64
mobile_home_ind                object
timeshare_ind                  object
acres                         float64
land_square_footage             int64
irregular_lot_flg             float64
assessed_total_value          float64
assessed_land_value           float64
assessed_imp

In [10]:
# Validation
properties = train.columns
numeric_columns = properties[train.dtypes == 'float64']

In [14]:
# Visualizations
numeric_columns = numeric_train.columns


Index(['fips_cd', 'IsTraining', 'land_square_footage', 'tax_year',
       'delinquent_tax_year', 'assessed_year', 'building_square_feet',
       'total_living_square_feet', 'total_ground_floor_square_feet',
       'total_basement_square_feet', 'total_garage_parking_square_feet',
       'year_built', 'effective_year_built', 'bedrooms', 'total_rooms',
       'total_baths_calculated', 'fireplace_num'],
      dtype='object')

In [14]:
train.distressed_sale_flg.value_counts()

Y    1962567
Name: distressed_sale_flg, dtype: int64

In [7]:
dwelling_type_top_25 = ['Single Family Residential', 'Condominium (Residential)', 'Row house (Residential)', 'Residential-Vacant Land', 'Duplex (2 units, any combination)', 'Triplex (3 units, any combination)', 'Townhouse (Residential)', 'Apartments (generic)', 'Residential (General) (Single)', 'Vacant Land (General)', 'Commercial/Office/Residential Mixed Use', 'Mobile home', 'Multi-Family Dwellings (Generic, 2+)', 'Rural Residence (Agricultural)', 'Commercial (General)', 'Unusable Land (Remnant, Steep, etc.)', 'Retail Stores (Personal Services, Photography, Travel)', 'Commercial-Vacant Land', 'Exempt (full or partial)', 'Office Bldg (General)', 'Agricultural-Unimproved Vacant Land', 'Agricultural / Rural', 'Misc. Structures - Ranch, Farm, Fixtures', 'Warehouse (Industrial)', 'Commercial Building', 'Seasonal, Cabin, Vacation Residence', 'Condominium Offices']

In [75]:
train['KNN']

3703055    2005660.0
1993572    1112910.0
1406605    1112910.0
3317860    2005660.0
3629787    2246660.0
899394      876900.0
1457060    1367967.0
2245357    1367967.0
3225126    1367967.0
2415985    1112910.0
2178668    2225010.0
3232299    1367967.0
1330244    2005660.0
3407980    2005660.0
3984122    1741660.0
2277522    2225010.0
24694       945015.0
1628689     876900.0
578614     2225010.0
223583      945015.0
3030833    2300010.0
2242983    2225010.0
Name: KNN, dtype: float64