In [45]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor 
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

In [2]:
# Load the raw training data. "saledate" is parsed to datetimes while reading;
# low_memory=False makes pandas process the file in one go instead of in chunks.
train = pd.read_csv(
    "./data/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)

In [129]:
# Preview the first rows of the frame.
# NOTE(review): the saved output below already shows a log-scaled SalePrice and
# the sale_year / sale_month / sale_day / sale_timestamp columns, so it was
# captured after the later transformation cells had run (execution count 129).
train.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,sale_year,sale_month,sale_day,sale_timestamp
0,1139246,11.09741,999089,3157,121,3.0,2004,68.0,Low,521D,...,,,,,Standard,Conventional,2006,11,16,1163635200
1,1139248,10.950807,117657,77,121,3.0,1996,4640.0,Low,950FII,...,,,,,Standard,Conventional,2004,3,26,1080259200
2,1139249,9.21034,434808,7009,121,3.0,2001,2838.0,High,226,...,,,,,,,2004,2,26,1077753600
3,1139251,10.558414,1026470,332,121,3.0,2001,3486.0,High,PC120-6E,...,,,,,,,2011,5,19,1305763200
4,1139253,9.305651,1057373,17311,121,3.0,2007,722.0,Medium,S175,...,,,,,,,2009,7,23,1248307200


In [7]:
# Model the logarithm of the sale price instead of the raw price.
# The attrs flag makes this cell idempotent: re-running it on the same frame
# no longer applies the logarithm a second time. Reloading `train` from disk
# creates a new frame without the flag, so the transform is applied again.
if not train.attrs.get("SalePrice_is_log", False):
    train["SalePrice"] = np.log(train["SalePrice"])
    train.attrs["SalePrice_is_log"] = True

In [125]:
def split_date(data_frame, col_name, prefix="sale"):
    '''Expand a date column into numeric parts, modifying the frame in place.

    Adds the columns ``{prefix}_year``, ``{prefix}_month``, ``{prefix}_day``
    and ``{prefix}_timestamp`` (whole seconds since 1970-01-01) to
    ``data_frame`` and then drops the raw ``col_name`` column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to modify in place.
    col_name : str
        Name of the date column. Values that are not already datetimes are
        converted with ``pd.to_datetime``.
    prefix : str, default "sale"
        Prefix used for the generated column names.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``data_frame``. Because the raw
        column is dropped at the end, this is also what a second call on the
        same frame raises.
    '''

    # date properties to extract; kept as a function attribute because the
    # original implementation exposed it that way
    split_date.props = ["year", "month", "day"]

    # extract date column
    col_date = data_frame[col_name]

    # tolerate a column that was not parsed into datetimes at load time
    if not pd.api.types.is_datetime64_any_dtype(col_date):
        col_date = pd.to_datetime(col_date)

    # extracting date properties and storing into individual columns
    for d in split_date.props:
        data_frame[f"{prefix}_{d}"] = getattr(col_date.dt, d)

    # Epoch seconds via timedelta floor division. Unlike casting to int64 and
    # dividing by 10**9, this does not assume nanosecond resolution, and a
    # missing date (NaT) becomes NaN instead of a huge negative integer.
    epoch = pd.Timestamp("1970-01-01", tz="UTC" if col_date.dt.tz is not None else None)
    data_frame[f"{prefix}_timestamp"] = (col_date - epoch) // pd.Timedelta(seconds=1)

    # removing the raw column
    data_frame.drop(columns=[col_name], inplace=True)
    
    
def trans_categorical(data_frame):
    '''Convert the text columns of ``data_frame`` to the ``category`` dtype, in place.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose object/string columns are replaced by categorical columns.
        All other columns are left untouched.

    Returns
    -------
    None
    '''

    # loop through each col
    for label, column in data_frame.items():

        # Check the dtype rather than the values: when given an object array,
        # newer pandas versions infer from the elements, so a text column that
        # also holds missing values may not be recognised as a string column.
        if is_string_dtype(column.dtype):
            data_frame[label] = column.astype("category")
            
            
def trans_numerical(data_frame, target):
    ''' Build an all-numeric feature table and the target array from a frame.

    Works on a copy, so ``data_frame`` itself is not modified. Numeric columns
    that contain nulls get them replaced by the column median. Every
    non-numeric column is converted to ``category`` and replaced by its
    integer codes shifted by one, so the -1 code used for missing values
    becomes 0.

    Returns a two-element list: the frame without the ``target`` column, and
    the values of the ``target`` column as an array. '''

    numeric_frame = data_frame.copy()

    # one pass: a column is either numeric (impute) or not (encode)
    for name, series in numeric_frame.items():

        if is_numeric_dtype(series):
            # only touch numeric columns that actually have gaps
            if series.isna().any():
                numeric_frame[name] = series.fillna(series.median())
        else:
            # category codes start at -1 for missing, hence the shift by one
            encoded = series.astype("category").cat.codes
            numeric_frame[name] = encoded + 1

    feature_frame = numeric_frame.drop(columns = [ target ])
    target_values = numeric_frame[target].values

    return [ feature_frame, target_values ]

In [27]:
# Expand "saledate" into sale_year / sale_month / sale_day / sale_timestamp.
# NOTE: this mutates `train` in place and drops "saledate", so running the cell
# a second time on the same frame raises KeyError.
split_date(train, "saledate")

In [52]:
# Convert the text columns of `train` to the pandas "category" dtype (in place).
trans_categorical(train)

In [58]:
# Sanity check that the conversion above took effect. The original call to
# is_categorical_dtype is deprecated in recent pandas, and its result was never
# shown because only a cell's last expression is displayed; assert instead.
assert isinstance(train["ProductSize"].dtype, pd.CategoricalDtype)

train.ProductSize.cat.categories

Index(['Compact', 'Large', 'Large / Medium', 'Medium', 'Mini', 'Small'], dtype='object')

In [84]:
# Integer codes behind the categories; -1 is the code pandas uses for a missing
# value (rows 0, 2 and 4 in the output below).
train.ProductSize.cat.codes.head()

0   -1
1    3
2   -1
3    5
4   -1
dtype: int8

In [83]:
# Null count per column. NOTE: .head() limits the display to the first five
# columns, so this does not show whether later columns contain nulls.
train.isnull().sum().head()

SalesID       0
SalePrice     0
MachineID     0
ModelID       0
datasource    0
dtype: int64

In [126]:
# Build the numeric feature table and the target array; trans_numerical works
# on a copy, so `train` itself is left unchanged by this cell.
features, targets = trans_numerical(train, "SalePrice")

In [128]:
# Fix the seed so the fitted forest (and the score) is reproducible across
# runs, and use all cores to keep the fit time reasonable.
regressor = RandomForestRegressor(n_jobs=-1, random_state=42)
regressor.fit(features, targets)

# NOTE: this is R^2 on the same rows the model was trained on, so it is an
# optimistic figure and not an estimate of performance on unseen sales.
regressor.score(features, targets)

0.9830852858760372