# Big Iron

Go to Kaggle and search for the 'Blue Book for Bulldozers' competition.

In [72]:
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

# Get the data

In [73]:
#if you have the Kaggle api installed then use it
# !kaggle competitions download -c bluebook-for-bulldozers

In [74]:
# Kaggle warns when the API key file is readable by other users; restrict it to the owner
# !chmod 600 ~/.kaggle/kaggle.json

# EDA

In [75]:
# Load the combined training + validation CSV. low_memory=False makes pandas
# parse the file in one pass instead of chunks, avoiding mixed-type inference.
df = pd.read_csv(
    './data/TrainAndValid.csv',
    low_memory=False,
)
# (rows, columns) of the raw frame
print(df.shape)

(412698, 53)


In [76]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000.0,999089,3157,121,3.0,2004,68.0,Low,11/16/2006 0:00,...,,,,,,,,,Standard,Conventional
1,1139248,57000.0,117657,77,121,3.0,1996,4640.0,Low,3/26/2004 0:00,...,,,,,,,,,Standard,Conventional
2,1139249,10000.0,434808,7009,121,3.0,2001,2838.0,High,2/26/2004 0:00,...,,,,,,,,,,
3,1139251,38500.0,1026470,332,121,3.0,2001,3486.0,High,5/19/2011 0:00,...,,,,,,,,,,
4,1139253,11000.0,1057373,17311,121,3.0,2007,722.0,Medium,7/23/2009 0:00,...,,,,,,,,,,


### Note the sale date: this is a time series problem, which changes how we split the data into train and validation sets (we cannot use a random splitter)

To split: Older data is training data, Newer data is validation data.  This keeps information from leaking from the validation set back to the training set.

In [77]:
# List every column name in the raw frame (head() elides the middle columns)
df.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')

### Let's see what the date range is

In [78]:
# Convert the sale timestamps from strings to datetimes.
# The raw values look like '11/16/2006 0:00', so give pandas the exact
# month/day/year format: this avoids per-value format guessing, removes any
# month-vs-day ambiguity, and makes a malformed value fail loudly.
df['saledate'] = pd.to_datetime(df['saledate'], format='%m/%d/%Y %H:%M')

In [79]:
# Report the first and last sale dates covered by the dataset
print(f"earliest date= {df['saledate'].min()}, latest date={df['saledate'].max()}")

earliest date= 1989-01-17 00:00:00, latest date=2012-04-28 00:00:00


### Sort the dataframe by saledate in preparation for splitting into train and validation sets

In [83]:
# Order rows chronologically (oldest sale first) ahead of the train/validation split.
# Reassign the sorted frame instead of using inplace=True: the result is the same,
# but the data lineage is explicit and the call can be chained.
# The original index labels are kept (no reset_index).
df = df.sort_values(by='saledate')

### Split Dates into more useful features

It's hard for a random forest to use a datetime object directly, since it encodes a lot of information (the year, the month, the day of the week, weekday, weekend, holiday, end of quarter, etc.).  We can slog through and manually create these features, or use fastai, a library that already does this.

<mark>Fastai is a set of <a href="https://www.fast.ai/posts/2022-07-21-dl-coders-22.html">online courses</a>, a <a href="https://github.com/fastai/fastbook">book</a>, and a library (see below) designed to show regular people how to apply advanced machine and deep learning algorithms to real-world problems.  It's extremely applied and, fortunately, much of the complexity is handled by the library itself.  It is the best course I know of for teaching AI, Machine Learning and Data Science.</mark>

In [67]:
#install fastai library
# !pip install fastai

In [86]:
# Use fastai's helper to expand 'saledate' into date-part features
# (saleYear, saleMonth, saleWeek, ..., saleElapsed).
from fastai.tabular import core

# After the expansion the frame no longer has a 'saledate' column, so only run
# it while the column is still present; this keeps the cell safe to re-run.
if 'saledate' in df.columns:
    df = core.add_datepart(df, 'saledate')

In [87]:
# Re-list the columns to see the new sale* date-part features
df.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries',
       'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc', 'state',
       'ProductGroup', 'ProductGroupDesc', 'Drive_System', 'Enclosure',
       'Forks', 'Pad_Type', 'Ride_Control', 'Stick', 'Transmission',
       'Turbocharged', 'Blade_Extension', 'Blade_Width', 'Enclosure_Type',
       'Engine_Horsepower', 'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier',
       'Tip_Control', 'Tire_Size', 'Coupler', 'Coupler_System',
       'Grouser_Tracks', 'Hydraulics_Flow', 'Track_Type',
       'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb', 'Pattern_Changer',
       'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls',
       'Differential_Type', 'Steering_Controls', 'saleYear', 'saleMonth',
       'saleWeek', 'saleDay', 'saleDayofweek', 'saleDayofyear',


In [88]:
# Show the generated date-part columns. The substring match is case-sensitive,
# so 'SalesID' and 'SalePrice' are not picked up.
df.columns[df.columns.str.contains('sale', regex=False)].tolist()

['saleYear',
 'saleMonth',
 'saleWeek',
 'saleDay',
 'saleDayofweek',
 'saleDayofyear',
 'saleIs_month_end',
 'saleIs_month_start',
 'saleIs_quarter_end',
 'saleIs_quarter_start',
 'saleIs_year_end',
 'saleIs_year_start',
 'saleElapsed']

### Handle categorical variables

### Handle Nulls

What if NaN means that that particular bit of equipment isn't present?

In [19]:
# Count the missing values in each column
df.isna().sum()

SalesID                          0
SalePrice                        0
MachineID                        0
ModelID                          0
datasource                       0
auctioneerID                 20136
YearMade                         0
MachineHoursCurrentMeter    265194
UsageBand                   339028
saledate                         0
fiModelDesc                      0
fiBaseModel                      0
fiSecondaryDesc             140727
fiModelSeries               354031
fiModelDescriptor           337882
ProductSize                 216605
fiProductClassDesc               0
state                            0
ProductGroup                     0
ProductGroupDesc                 0
Drive_System                305611
Enclosure                      334
Forks                       214983
Pad_Type                    331602
Ride_Control                259970
Stick                       331602
Transmission                224691
Turbocharged                331602
Blade_Extension     

### Independent and dependent variables

1. What are we trying to predict? (SalePrice)
2. The rest of the columns are independent (predictor) variables


## Split the dataset