In [1]:
#from pandas_summary import DataFrameSummary
#from Ipython.Display import display

%load_ext autoreload
%autoreload 2
%matplotlib inline

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os 
import pandas as pd
import numpy as np
import re
import os
from pandas.api.types import is_string_dtype, is_numeric_dtype
import pyarrow
from math import sqrt
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)

In [2]:
# Load the training data from Train.csv in the working directory.
# low_memory=False makes pandas parse the file in one pass rather than in chunks,
# and parse_dates converts the saledate column to datetimes at load time.
df_raw = pd.read_csv("Train.csv", low_memory = False, parse_dates = ["saledate"])

In [3]:
df_raw.head().transpose()

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000,57000,10000,38500,11000
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3,3,3,3,3
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68,4640,2838,3486,722
UsageBand,Low,Low,High,High,Medium
saledate,2006-11-16 00:00:00,2004-03-26 00:00:00,2004-02-26 00:00:00,2011-05-19 00:00:00,2009-07-23 00:00:00


In [4]:
# Replace SalePrice with its natural log, overwriting the column on df_raw.
# NOTE: this cell is not idempotent - running it twice takes the log of the log.
df_raw.SalePrice = np.log(df_raw.SalePrice)

In [4]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    """Expand the date column `fldname` of `df` into calendar feature columns, in place.

    New columns are named by stripping a trailing 'date'/'Date' from `fldname`
    and appending the attribute name (e.g. 'saledate' -> 'saleYear', 'saleMonth', ...),
    plus '<prefix>Elapsed' holding the column's int64 representation floor-divided
    by 10**9.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the column to expand. If it is not already a datetime column it
        is converted with `pd.to_datetime` and the converted values are written
        back to `df[fldname]`.
    drop : bool, default True
        Drop the source column after the features have been added.
    time : bool, default False
        Also add 'Hour', 'Minute' and 'Second' columns.
    errors : str, default "raise"
        Forwarded to `pd.to_datetime`.

    Returns
    -------
    None
        The function only mutates `df`.
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    # np.issubdtype cannot interpret the tz-aware pandas dtype, so treat it as
    # a plain datetime64 for the check below.
    if isinstance(fld_dtype, pd.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        # infer_datetime_format is deprecated (and a no-op) from pandas 2.0 on,
        # so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week':
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
            # same ISO week number. It comes back as a nullable UInt32, so cast to
            # the plain dtype the old accessor produced (float when values are missing).
            week = fld.dt.isocalendar().week
            values = week.astype('float64') if week.isna().any() else week.astype('int64')
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [5]:
def train_cats(df):
    """Convert every string-typed column of `df` to an ordered categorical, in place.

    Columns for which `is_string_dtype` is false are left untouched.
    Returns None.
    """
    for col_name, column in df.items():
        if not is_string_dtype(column):
            continue
        as_category = column.astype('category')
        df[col_name] = as_category.cat.as_ordered()

In [7]:
add_datepart(df_raw, 'saledate')

In [8]:
train_cats(df_raw)

In [9]:
df_raw.isnull().sum().sort_index()/len(df_raw)

Backhoe_Mounting            0.803872
Blade_Extension             0.937129
Blade_Type                  0.800977
Blade_Width                 0.937129
Coupler                     0.466620
Coupler_System              0.891660
Differential_Type           0.826959
Drive_System                0.739829
Enclosure                   0.000810
Enclosure_Type              0.937129
Engine_Horsepower           0.937129
Forks                       0.521154
Grouser_Tracks              0.891899
Grouser_Type                0.752813
Hydraulics                  0.200823
Hydraulics_Flow             0.891899
MachineHoursCurrentMeter    0.644089
MachineID                   0.000000
ModelID                     0.000000
Pad_Type                    0.802720
Pattern_Changer             0.752651
ProductGroup                0.000000
ProductGroupDesc            0.000000
ProductSize                 0.525460
Pushblock                   0.937129
Ride_Control                0.629527
Ripper                      0.740388
S

In [3]:
# Cache the prepared frame on disk so it can be reloaded with pd.read_feather
# instead of re-parsing the CSV.
# NOTE(review): the recorded output of this cell is a NameError because it was run
# (as In [3]) in a kernel where df_raw had not been defined yet; run the cells above first.
os.makedirs('tmp', exist_ok = True)
# Write df_raw to tmp/raw in Feather format.
df_raw.to_feather('tmp/raw')


NameError: name 'df_raw' is not defined

In [8]:
# Reload the cached frame written by df_raw.to_feather('tmp/raw').
df_raw = pd.read_feather('tmp/raw')

In [9]:
def proc_df(df, y_fld, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Turn a raw frame into an all-numeric feature frame plus a target array.

    Works on a copy of `df` (or on `get_sample(df, subset)` when `subset` is given),
    so the frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    y_fld : str or None
        Name of the target column. When not None it is extracted as `y`
        (non-numeric targets are replaced by their categorical codes) and
        removed from the features.
    skip_flds : list of str, optional
        Columns to drop from the features. The caller's list is not modified.
    ignore_flds : list of str, optional
        Columns set aside before processing and concatenated back, unprocessed,
        in front of the result.
    do_scale : bool, default False
        Run `scale_vars` on the frame and append the mapper to the result.
    na_dict : dict, optional
        Fill values per column name, passed through `fix_missing`. A copy is
        used; the updated copy is returned.
    preproc_fn : callable, optional
        Called as `preproc_fn(df)` on the working copy before the target is split off.
    max_n_cat : int, optional
        Forwarded to `numericalize`; columns it leaves non-numeric are one-hot
        encoded by `pd.get_dummies`.
    subset : int, optional
        Forwarded to `get_sample`.
        NOTE(review): `get_sample` is not defined in the visible part of this
        notebook - confirm it is available before passing `subset`.
    mapper : optional
        Existing mapper forwarded to `scale_vars`.

    Returns
    -------
    list
        `[df, y, na_dict]`, with `mapper` appended when `do_scale` is true.
    """
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        # Build a new list rather than using `+=`, which would append y_fld to the
        # list object owned by the caller every time proc_df is called.
        skip_flds = list(skip_flds) + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # When the caller supplied fill values, drop the '_na' indicator columns for any
    # column that was not in the supplied dict, so the output columns match that dict.
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column in `df` and record the fill value.

    Non-numeric columns are ignored. A numeric column is processed when it has
    at least one null or when `name` already has an entry in `na_dict`: a boolean
    `<name>_na` indicator column is added, nulls are filled with the value from
    `na_dict` (or the column median when there is no entry), and the fill value
    is stored in `na_dict[name]`.

    Returns the (possibly updated) `na_dict`.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    already_known = name in na_dict
    if not (missing.sum() or already_known):
        return na_dict

    df[name + '_na'] = missing
    if already_known:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column of `df` with its integer category codes plus one.

    Numeric columns are left alone. When `max_n_cat` is given, a column is only
    converted if it has more than `max_n_cat` categories; the category count is
    read from `col.cat`, so in that case `col` must be categorical.
    Adding one shifts the code pandas uses for missing values (-1) to 0.
    Returns None; `df` is modified in place.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and len(col.cat.categories) <= max_n_cat:
        return
    codes = pd.Categorical(col).codes
    df[name] = codes + 1

def scale_vars(df, mapper):
    """Scale the numeric columns of `df` in place and return the mapper used.

    When `mapper` is None a new DataFrameMapper is built with one StandardScaler
    per numeric column and fitted on `df`; otherwise the supplied mapper is
    applied as-is. The transformed values are written back to `df` under
    `mapper.transformed_names_`.

    NOTE(review): `warnings`, `sklearn.exceptions`, `StandardScaler` and
    `DataFrameMapper` are not imported in the visible part of this notebook -
    confirm they are in scope before calling with do_scale=True.
    """
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        numeric_cols = [n for n in df.columns if is_numeric_dtype(df[n])]
        col_scalers = [([n], StandardScaler()) for n in numeric_cols]
        mapper = DataFrameMapper(col_scalers).fit(df)
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper

In [10]:
# Build the numeric feature frame (df), the target array (y, taken from SalePrice)
# and the dict of fill values used for missing numeric data (nas).
df, y, nas= proc_df(df_raw, 'SalePrice')

In [11]:
# Fit a 40-tree random forest on the full processed frame, using all cores (n_jobs=-1).
# No random_state is set, so the fitted model and its scores vary between runs.
m = RandomForestRegressor(n_jobs = -1, min_samples_leaf=3,n_estimators=40)
m.fit(df,y)
# Score on the same data the model was fitted on - this is a training score,
# not an estimate of out-of-sample performance.
m.score(df,y)

0.9726711896113837

In [12]:
def rmse(x, y):
    """Return the root of the mean squared difference between `x` and `y`.

    The difference must support `** 2` and `.mean()` (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return sqrt(squared_error.mean())

def print_score(m):
    """Print train/validation RMSE and `m.score` values for a fitted model.

    Reads the notebook globals X_train, y_train, X_valid and y_valid.
    The printed list is [train RMSE, valid RMSE, train score, valid score],
    followed by `m.oob_score_` when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    scores = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [13]:
def split_vals(a, n):
    """Split `a` at position `n` and return copies of the two parts.

    Returns `(a[:n].copy(), a[n:].copy())`, so `a` must support slicing and
    its slices must provide `.copy()`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

# Hold out the last n_valid rows (in the frame's current row order) for validation.
n_valid = 12000 #same as kaggle
n_trn = len(df) - n_valid
# Use the same cut point for the raw frame, the processed features and the target
# so the three stay row-aligned.
raw_train, raw_valid = split_vals(df_raw,n_trn)
X_train, X_valid = split_vals(df,n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Show the resulting shapes as a sanity check.
X_train.shape,y_train.shape,X_valid.shape




((389125, 66), (389125,), (12000, 66))

In [14]:
# NOTE(review): m was fitted with m.fit(df, y) on the full frame, which includes the
# rows later split off as X_valid, so the validation figures printed here are not
# out-of-sample. Refit on X_train, y_train before trusting them.
print_score(m)

[0.1145314162726861, 0.1189248287603722, 0.9725852627157726, 0.9747423440307984]


In [None]:
## Subset data for quicker compute

In [None]:
#df_trn, y_trn, nas= proc_df(df_raw, 'SalePrice', subset = 30000)
#X_trn, _ = split_vals(df_trn, 20000)


In [31]:
def feature_importance(m, df):
    """Return a table of feature importances, most important first.

    Pairs `df.columns` with `m.feature_importances_` in a DataFrame with
    columns 'Feature' and 'Importance', sorted by 'Importance' descending.
    The original positional index is kept.
    """
    table = pd.DataFrame({
        'Feature': df.columns,
        'Importance': m.feature_importances_,
    })
    ranked = table.sort_values('Importance', ascending=False)
    return ranked

feature_importance(m,df)

Unnamed: 0,Feature,Importance
37,Coupler_System,0.207001
5,YearMade,0.175481
13,ProductSize,0.108571
14,fiProductClassDesc,0.107038
2,ModelID,0.093234
63,saleElapsed,0.077853
19,Enclosure,0.037821
8,fiModelDesc,0.030444
10,fiSecondaryDesc,0.027085
0,SalesID,0.02339
