# Objective 

<p>
    <span style='font-family:Arial'>
    Predict the sale price of a particular piece of heavy equipment at an auction based on its usage, equipment type, and configuration.
    </span>
</p> 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Load the Data

In [None]:
# First look at the raw CSV. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids mixed-type
# columns (and the accompanying DtypeWarning) and matches the later reload.
df = pd.read_csv('../input/bluebook-for-bulldozers/TrainAndValid.csv', low_memory=False)
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

# Change the date format

In [None]:
# Re-read the file with low_memory=False and with 'saledate' parsed as a
# datetime column, so the .dt accessor can be used on it in later cells.
df = pd.read_csv('../input/bluebook-for-bulldozers/TrainAndValid.csv',low_memory = False,parse_dates = ['saledate'])

In [None]:
# Transposed preview: each column of the first rows is shown as a row.
df.head().T

In [None]:
# Put the rows in chronological order of sale, then show the sorted dates.
df = df.sort_values(by='saledate', ascending=True)
df['saledate']

In [None]:
# Derive calendar features from the sale date through the .dt accessor.
# Keys are the new column names, values the datetime attribute to read.
sale_dt = df['saledate'].dt
date_parts = {
    'SaleYear': 'year',
    'SaleMonth': 'month',
    'SaleDay': 'day',
    'SaleDayOfWeek': 'dayofweek',
    'SaleDayOfyear': 'dayofyear',
}
for new_column, attribute in date_parts.items():
    df[new_column] = getattr(sale_dt, attribute)

In [None]:
# Transposed preview again; the newly added Sale* columns appear as the last rows.
df.head().T

In [None]:
# The raw timestamp is no longer needed once the Sale* parts have been extracted.
df = df.drop(columns=['saledate'])

In [None]:
# Distribution of the target column.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot — confirm the installed seaborn version before switching.
sns.distplot(df.SalePrice)

In [None]:
# Re-inspect dtypes now that 'saledate' has been replaced by the Sale* columns.
df.info()

## Drop the columns with many unique categories

In [None]:
# Remove non-numeric columns that have more than `max_cardinality` distinct
# values (missing values are not counted).
max_cardinality = 100
non_numeric_nunique = df.select_dtypes(exclude=np.number).nunique()
high_cardinality = non_numeric_nunique[non_numeric_nunique > max_cardinality].index.tolist()
df = df.drop(columns=high_cardinality)
df.info()

In [None]:
# Pairwise correlations of the numeric columns only. The frame still contains
# string columns at this point; pandas >= 2.0 no longer drops them silently in
# DataFrame.corr() and raises instead, so select the numeric columns explicitly.
corr = df.select_dtypes(include=np.number).corr()
corr

# Preprocessing
* Change the data format to numeric type
* Deal with missing values

In [None]:
# List the columns that still need to be encoded. "Not numeric" is used instead
# of is_string_dtype because, on pandas >= 2.0, is_string_dtype returns False
# for object columns that contain missing values, which would hide them here.
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
# Replace every non-numeric column by integer category codes.
# "Not numeric" (rather than is_string_dtype) also catches object columns with
# missing values on pandas >= 2.0, matching the check used in processed_data.
non_numeric_labels = [label for label, content in df.items()
                      if not pd.api.types.is_numeric_dtype(content)]
for label in non_numeric_labels:
    # Categorical codes start at 0 and use -1 for missing values;
    # adding 1 makes missing values 0 and real categories 1..n.
    df[label] = pd.Categorical(df[label]).codes + 1

In [None]:
# Check the dtypes after the string columns were replaced by integer codes.
df.info()

In [None]:
# Print the name of every numeric column that has at least one missing value.
for column_name, column in df.items():
    if pd.api.types.is_numeric_dtype(column) and column.isna().any():
        print(column_name)

In [None]:
# Fill missing numeric values with the median of the training period only.
# Rows with SaleYear == 2012 become the validation set in the split cell below,
# so they must not contribute to the fill statistic (validation leakage).
in_training_period = df['SaleYear'] != 2012
labels_with_nan = [label for label, content in df.items()
                   if pd.api.types.is_numeric_dtype(content) and content.isna().any()]
for label in labels_with_nan:
    content = df[label]
    fill_value = content[in_training_period].median()
    if pd.isna(fill_value):
        # No observed value in the training period: fall back to the overall median.
        fill_value = content.median()
    df[label] = content.fillna(fill_value)

In [None]:
# Check the non-null counts after the median fill.
df.info()

In [None]:
# Number of missing values left in each column.
df.isna().sum()

# Split the data

In [None]:
# Time-based split: sales from 2012 form the validation set, all others train.
sold_in_2012 = df['SaleYear'] == 2012
val_df = df[sold_in_2012]
train_df = df[~sold_in_2012]

In [None]:
# Separate the target column from the features for both splits.
target = 'SalePrice'
x_train, y_train = train_df.drop(columns=[target]), train_df[target]
x_valid, y_valid = val_df.drop(columns=[target]), val_df[target]

In [None]:
# Shapes of the feature and target splits.
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

# Build a scoring function

In [None]:
def rmsle(y_test,y_preds):
    """Return the root mean squared log error between true and predicted values.

    Computed as the square root of ``mean_squared_log_error(y_test, y_preds)``.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

def scores(model):
    """Evaluate a fitted regressor on the notebook's train and validation splits.

    Reads the module-level ``x_train``, ``y_train``, ``x_valid`` and ``y_valid``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        MAE, RMSLE and R^2, each for the training and the validation split.
    """
    # Predict once per split and reuse the predictions for every metric.
    train_preds = model.predict(x_train)
    val_preds = model.predict(x_valid)
    return {
        "Training MAE": mean_absolute_error(y_train, train_preds),
        "Valid MAE": mean_absolute_error(y_valid, val_preds),
        "Training RMSLE": rmsle(y_train, train_preds),
        "Valid RMSLE": rmsle(y_valid, val_preds),
        # r2_score on the existing predictions gives the value a regressor's
        # .score() returns, without running predict a second time per split.
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, val_preds),
    }

# Model

In [None]:
%%time
# Baseline: random forest with default hyper-parameters, using all cores
# (n_jobs=-1) and a fixed seed so the fit is repeatable.
model = RandomForestRegressor(n_jobs = -1,
                              random_state = 42)
model.fit(x_train,y_train)

In [None]:
# Train/validation metrics of the baseline model.
scores(model)

# Hyperparameter tuning with RandomizedSearchCV

In [None]:
%%time

# Search space for the random forest. max_samples=20000 limits how many rows
# each tree is fitted on (presumably to keep the 40 x 5 fits affordable).
grid = {"n_estimators": np.arange(10, 100, 10),
        "max_depth": [None, 3, 5, 10, 15, 20],
        "min_samples_split": np.arange(2, 20, 2),
        "min_samples_leaf": np.arange(1, 20, 2),
        # 1.0 (float) = all features. This is what the "auto" alias meant for
        # regressors before scikit-learn removed it; the integer 1 means a
        # single feature per split.
        "max_features": [0.5, 1, "sqrt", 1.0],
        "max_samples": [20000]}

# Seed both the forest and the parameter sampler so the search is repeatable,
# and let each forest use all cores, as the other models in this notebook do.
model1 = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                            param_distributions=grid,
                            n_iter=40,
                            cv=5,
                            random_state=42,
                            verbose=True)

model1.fit(x_train, y_train)

In [None]:
# Train/validation metrics of the randomized-search model.
scores(model1)

In [None]:
# Hyper-parameter combination selected by the randomized search.
model1.best_params_

# Train model with best parameters

In [None]:
%%time

# Final model, fitted without max_samples so every tree sees the full training
# set. random_state is fixed, like for the baseline, so scores and the
# submission are repeatable.
# NOTE(review): the hyper-parameters are hard-coded — presumably copied from an
# earlier model1.best_params_ output; confirm they match the current search.
tuned_model = RandomForestRegressor(n_estimators=60,
                                    min_samples_leaf=1,
                                    min_samples_split=12,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    random_state=42)
tuned_model.fit(x_train, y_train)

In [None]:
# Train/validation metrics of the tuned model.
scores(tuned_model)

# Load the test data set

In [None]:
# Load the test set with the same options as the training data (whole-file
# dtype inference, parsed sale date) so both frames get consistent dtypes.
Test_data = pd.read_csv('../input/bluebook-for-bulldozers/Test.csv',
                        low_memory=False,
                        parse_dates=['saledate'])
Test_data.head()

# Preprocessing
Apply the same preprocessing steps that were applied to the training data.

In [None]:
def processed_data(df):
    """Prepare a raw frame for the model, modifying it in place.

    Steps:
      1. If a ``saledate`` datetime column is present, add the ``SaleYear``,
         ``SaleMonth``, ``SaleDay``, ``SaleDayOfWeek`` and ``SaleDayOfyear``
         columns and drop ``saledate``. Skipping this step when the column is
         already gone makes the function safe to re-run on the same frame.
      2. Fill missing values in numeric columns with that column's median.
      3. Replace every non-numeric column by integer category codes
         (0 = missing, 1..n = categories).

    NOTE(review): medians and category codes are derived from the frame passed
    in, so the codes are not guaranteed to match those produced for another
    frame (e.g. the training data).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    if "saledate" in df.columns:
        sale_dt = df["saledate"].dt
        df["SaleYear"] = sale_dt.year
        df["SaleMonth"] = sale_dt.month
        df["SaleDay"] = sale_dt.day
        df["SaleDayOfWeek"] = sale_dt.dayofweek
        df["SaleDayOfyear"] = sale_dt.dayofyear
        df.drop(columns="saledate", inplace=True)

    # Iterate over a snapshot of the labels because columns are reassigned below.
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            if content.isna().any():
                df[label] = content.fillna(content.median())
        else:
            # Categorical codes use -1 for missing values; +1 shifts them to 0.
            df[label] = pd.Categorical(content).codes + 1

    return df


In [None]:
# processed_data modifies its argument in place and returns the same object;
# rebinding the name makes the data flow explicit, and head() keeps the cell
# output to a short preview instead of the whole frame.
Test_data = processed_data(Test_data)
Test_data.head()

In [None]:
# Drop non-numeric test columns with more than `max_cardinality` distinct values.
# NOTE(review): processed_data has already replaced every non-numeric column of
# Test_data by integer codes, so this selection is expected to be empty and to
# drop nothing — the explicit drop in the later cell does the real alignment.
# It also rebinds `high_cardinality`, replacing the list computed on the training data.
max_cardinality = 100
non_numeric_test = Test_data.select_dtypes(exclude=np.number)
high_cardinality = [name for name in non_numeric_test.columns
                    if non_numeric_test[name].nunique() > max_cardinality]
Test_data = Test_data.drop(columns=high_cardinality)
Test_data.info()

In [None]:
# Columns that exist in the test frame but not in the training features.
set(Test_data.columns).difference(x_train.columns)

In [None]:
# Keep exactly the features the model was trained on, in training order.
# Selecting by x_train.columns replaces the hard-coded list of dropped names
# and raises a KeyError if the test frame lacks a training feature.
Test_data = Test_data[x_train.columns]

# Predict on test data

In [None]:
# Predict sale prices for the prepared test rows with the tuned model.
test_preds = tuned_model.predict(Test_data)

# Submission

In [None]:
# Submission table: one row per test sale with its predicted price.
sub = pd.DataFrame({
    "SalesID": Test_data["SalesID"],
    "SalePrice": test_preds,
})
sub