# In [None]:
# Blue Book for Bulldozers competition from Kaggle. The datasets used here were taken directly from Kaggle.
# Import libraries.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

# Import the data (training and validation sets combined in one file).
# `parse_dates` makes pandas read the sale date column as datetimes so that
# its components can be extracted with the `.dt` accessor below.
df = pd.read_csv("TrainAndValid.csv", low_memory=False, parse_dates=["saledate"])
df.saledate.dtype  # checking the type (expected to be a datetime dtype)

# Sort the dataframe chronologically; `inplace=True` reorders `df` itself
# instead of returning a sorted copy.
df.sort_values(by=["saledate"], inplace=True, ascending=True)
df.saledate.head()  # `head` must be called; a bare `.head` is only a method reference

df_tmp = df.copy()  # work on a copy so that the original frame stays untouched

# Feature engineering: break the sale date down into plain numeric parts so
# the model does not need to handle a datetime column.
df_tmp["saleYear"] = df_tmp.saledate.dt.year
df_tmp["saleMonth"] = df_tmp.saledate.dt.month
df_tmp["saleDay"] = df_tmp.saledate.dt.day
df_tmp["saleDayOfWeek"] = df_tmp.saledate.dt.dayofweek
df_tmp["saleDayOfYear"] = df_tmp.saledate.dt.dayofyear

# The raw sale date is no longer needed once its parts are extracted.
df_tmp.drop(["saledate"], axis=1, inplace=True)

# Number of rows in the reshaped dataframe.
len(df_tmp)

# Model-driven EDA: convert the data into numbers.
# Find every text column, print its name, and turn it into an ordered
# categorical.
#
# `is_string_dtype` alone is not enough: on pandas >= 2.0 it returns False for
# object columns that contain missing values (not every element is a string),
# so such columns would be silently skipped. Object dtype is therefore accepted
# as well. Already-converted categorical columns match neither test.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)
        df_tmp[label] = content.astype("category").cat.as_ordered()

df_tmp.info()  # the text columns should now be reported as `category`
# Categoricals keep an integer code per value (`.cat.codes`), with -1 for
# missing entries. The missing values themselves still have to be handled.

# Save the current preprocessed dataframe.
df_tmp.to_csv("train_temp.csv", index=False)

#----------------------------------------------------------------------------------------------------------------------------

# Missing-value handling.
# NOTE(review): the medians below are taken over all of df_tmp, i.e. before the
# frame is split by sale year further down, so validation rows contribute to
# the fill values. Confirm this is acceptable.

# List every numeric column.
for column_name in df_tmp.columns:
    if pd.api.types.is_numeric_dtype(df_tmp[column_name]):
        print(column_name)

# List only the numeric columns that actually contain missing values.
for column_name, values in df_tmp.items():
    if pd.api.types.is_numeric_dtype(values) and values.isna().any():
        print(column_name)

# Fill numeric gaps with the column median (less sensitive to outliers than
# the mean) and remember which rows were filled in a boolean indicator column.
for column_name, values in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(values):
        continue
    absent = values.isna()
    if not absent.any():
        continue
    df_tmp[column_name + "_is_missing"] = absent
    df_tmp[column_name] = values.fillna(values.median())

# Sanity check: nothing should be printed if every numeric gap was filled.
for column_name, values in df_tmp.items():
    if pd.api.types.is_numeric_dtype(values) and values.isna().any():
        print(column_name)

# Non-numeric columns: always add the indicator column, then replace the
# values with their category codes shifted by one, so that missing entries
# (code -1) become 0.
for column_name, values in df_tmp.items():
    if pd.api.types.is_numeric_dtype(values):
        continue
    df_tmp[column_name + "_is_missing"] = values.isna()
    df_tmp[column_name] = pd.Categorical(values).codes + 1

#----------------------------------------------------------------------------------------------------------------------
# Hold out every sale made in 2012 for validation; all other years are used
# for training.
in_validation_year = df_tmp["saleYear"] == 2012
df_val = df_tmp[in_validation_year]
df_train = df_tmp[~in_validation_year]

len(df_val), len(df_train)

# Separate the features (X) from the target column (y) for both subsets.
X_train = df_train.drop(columns="SalePrice")
y_train = df_train["SalePrice"]
X_valid = df_val.drop(columns="SalePrice")
y_valid = df_val["SalePrice"]

#building an evaluation metric
##create an evaluation function so that we can reuse it many times
##import some metrics
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    """
    Return the root mean squared log error of the predictions.

    y_test holds the true labels and y_preds the predicted values; both are
    passed unchanged to sklearn's mean_squared_log_error, and the square root
    of its result is returned.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Helper to evaluate a fitted model on both the training and validation data.
# Training scores are normally somewhat better than validation scores; a large
# gap between the two suggests overfitting.
def show_scores(model):
    """
    Score `model` on the module-level training and validation sets.

    Predictions are made for X_train and X_valid and compared against y_train
    and y_valid. Returns a dict with the mean absolute error and the RMSLE
    for each of the two sets.
    """
    fitted_on_train = model.predict(X_train)
    fitted_on_valid = model.predict(X_valid)

    scores = {}
    scores["Traing MAE"] = mean_absolute_error(y_train, fitted_on_train)
    scores["Valid MAE:"] = mean_absolute_error(y_valid, fitted_on_valid)
    scores["Training RMSLE"] = rmsle(y_train, fitted_on_train)
    scores["Valid RMSLE"] = rmsle(y_valid, fitted_on_valid)
    return scores

# Model building.
# The dataset is large, so each tree is trained on a subset of the rows to
# keep run time down while experimenting and tuning hyperparameters.
# With bootstrapping enabled (the default), `max_samples=10000` makes every
# tree in the forest fit on 10,000 rows drawn from X_train.
model = RandomForestRegressor(n_jobs=-1, random_state=42, max_samples=10000)
model.fit(X_train, y_train)

# Evaluate performance of the baseline model.
show_scores(model)

# Can we improve these metrics?
# Hyperparameter tuning with RandomizedSearchCV. It may improve or worsen the
# scores, because only a few random candidates are tried.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate RandomForestRegressor hyperparameters.
rf_grid = {
    # 10, 20, ..., 90. The previous step of 1000 produced the single value
    # [10], so the number of trees was never actually varied.
    "n_estimators": np.arange(10, 100, 10),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
    # The int 1 means "one feature per split" and the float 1.0 means "all
    # features". 1.0 replaces "auto", which meant the same thing for
    # regressors and was removed in scikit-learn 1.3.
    "max_features": [0.5, 1, "sqrt", 1.0],
    "max_samples": [10000],
}

# Instantiate the randomized search: 5 sampled candidates, 5-fold CV each.
# `random_state` fixes which candidates are drawn so that runs are repeatable.
rs_model = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_distributions=rf_grid,
    cv=5,
    n_iter=5,
    verbose=True,
    random_state=42,
)

# Fit the randomized search model.
rs_model.fit(X_train, y_train)

# Best hyperparameters found, and the scores of the refitted best model.
rs_model.best_params_
show_scores(rs_model)

#----------------------------------------------------------------------------------------------------------------------------------------------------------
#test dataset
#create function to also preprocess our test dataset same way we did for train and validation
def preprocess_data(df):
    """
    Apply the training-time transformations to `df` and return the result.

    Steps:
      1. sort the rows by `saledate`;
      2. add saleYear, saleMonth, saleDay, saleDayOfWeek and saleDayOfYear
         columns and drop `saledate`;
      3. for each numeric column that has missing values, add a boolean
         `<column>_is_missing` column and fill the gaps with that column's
         median;
      4. for each non-numeric column, always add `<column>_is_missing` and
         replace the values with category codes + 1 (missing values become 0).

    Args:
        df: DataFrame with a datetime `saledate` column. It is not modified;
            the function works on a sorted copy.

    Returns:
        A new, fully numeric DataFrame with the original index labels in
        sale-date order.

    Note:
        Medians and category codes are derived from `df` itself, not from
        the training data. Numeric indicator columns only appear when `df`
        has gaps in that column, so the output columns can differ from those
        of the training frame.
    """
    # sort_values without inplace returns a new frame, which keeps the
    # caller's DataFrame untouched by everything that follows.
    df = df.sort_values(by=["saledate"], ascending=True)

    sale_dates = df["saledate"].dt
    df["saleYear"] = sale_dates.year
    df["saleMonth"] = sale_dates.month
    df["saleDay"] = sale_dates.day
    df["saleDayOfWeek"] = sale_dates.dayofweek
    df["saleDayOfYear"] = sale_dates.dayofyear

    # drop actual sale date column
    df = df.drop("saledate", axis=1)

    # Snapshot the columns first: indicator columns are appended inside the
    # loop and must not be visited themselves.
    for label, content in list(df.items()):
        if pd.api.types.is_numeric_dtype(content):
            missing = content.isna()
            if missing.any():
                # binary column which records where data was missing
                df[label + "_is_missing"] = missing
                # fill missing numeric values with the median
                df[label] = content.fillna(content.median())
        else:
            # binary column which records where data was missing
            df[label + "_is_missing"] = content.isna()
            # Category codes use -1 for missing values; +1 shifts that to 0.
            df[label] = pd.Categorical(content).codes + 1

    return df

# Import the test dataset, parsing the sale date the same way as for training.
df_test = pd.read_csv("Test.csv", low_memory=False, parse_dates=["saledate"])
df_test.head().T

# Process the test data with the same steps used for train/validation.
df_test = preprocess_data(df_test)
df_test.head()

# The processed test frame can lack "<column>_is_missing" indicators that
# exist in the training features, because numeric indicators are only created
# when the frame has gaps in that column (the original run was missing
# auctioneerID_is_missing). Predicting before aligning the columns raises an
# error, so the missing indicators are added first, all set to False.
missing_columns = set(X_train.columns) - set(df_test.columns)
print(missing_columns)
for column in sorted(missing_columns):
    if column.endswith("_is_missing"):
        df_test[column] = False

# Put the features in exactly the training order. A column appended at the
# end would otherwise sit in a different position than during fit. This also
# raises a KeyError if a non-indicator training feature is still absent.
df_test = df_test[X_train.columns]
df_test.head()

# Test and training frames now have the same features, so we can predict.
test_preds = rs_model.predict(df_test)

# Format the predictions for the Kaggle submission. The price column is named
# "SalePrice" to match the target column used for training.
df_preds = pd.DataFrame({"SalesID": df_test["SalesID"], "SalePrice": test_preds})
df_preds