# Below are my best models and preprocessing steps for the ASHRAE Great Energy Predictor Kaggle competition
## The goal of the competition was to predict the energy consumption of the buildings in the provided test set
## The provided training data consisted of 3 data files that were joined on building_id, site_id, and timestamp


### Import the Needed Libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from util import *
import time 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import explained_variance_score
from sklearn.svm import LinearSVR
import xgboost as xgb
from sklearn.decomposition import PCA 
import pickle 
from sklearn.pipeline import Pipeline
import lightgbm as lgb 
from catboost import CatBoostRegressor

### Preprocessing Functions 

In [None]:
def run_it(file, rows):
    """Run the whole preprocessing flow for one CSV file.

    Reads the first ``rows`` rows of ``file`` through ``get_data`` and passes
    the resulting frame to ``process_data`` with a PCA level of 0.95.

    Returns:
        The ``(x_train, x_test, y_train, y_test)`` tuple produced by
        ``process_data``.
    """
    raw_frame = get_data(file, rows)
    train_features, test_features, train_target, test_target = process_data(raw_frame, 0.95)
    return train_features, test_features, train_target, test_target

def feature_engine(df):
    """Build the model feature frame from the joined raw data.

    Side effect: adds ``Year``, ``Month``, ``Day`` and ``building_age``
    columns to ``df`` in place.

    Steps:
      1. Slice year / month / day out of the ``timestamp`` strings and derive
         ``building_age`` as ``Year - year_built``.
      2. Drop the target, the saved-index column, and the columns that were
         only needed to derive other features.
      3. One-hot encode ``primary_use``, ``meter`` and ``site_id``.
      4. Fill missing ``floor_count`` with the mean of its square-foot bin and
         missing ``building_age`` with the mean of its building.

    Returns:
        The encoded feature DataFrame, without ``building_id``.
    """
    # Timestamps are sliced positionally: chars 0-3 year, 5-6 month, 8-9 day.
    df['Year'] = df['timestamp'].str[:4].astype('int64')
    df['building_age'] = df['Year'] - df['year_built']
    df['Month'] = df['timestamp'].str[5:7].astype('int64')
    df['Day'] = df['timestamp'].str[8:10].astype('int64')

    # Remove the target, unique identifiers and helper columns. list.remove is
    # used on purpose: a missing column raises ValueError instead of being
    # silently ignored. building_id, site_id and Month are deliberately kept.
    feature_cols = list(df.columns)
    for unwanted in ('meter_reading', 'Unnamed: 0', 'timestamp', 'year_built', 'Year'):
        feature_cols.remove(unwanted)

    # One-hot encode the categorical features.
    data_x = pd.get_dummies(df[feature_cols], columns=['primary_use', 'meter', 'site_id'])

    def fill_with_group_mean(values):
        # Replace the nulls of one group with that group's own mean.
        return values.fillna(values.mean())

    # EDA showed floor_count correlates with square footage, so null floor
    # counts are filled with the mean floor count of the building's
    # square-foot quartile bin. The bin column is temporary.
    data_x['binned_sqft'] = bin_sqft(data_x)
    data_x['floor_count'] = (
        data_x.groupby('binned_sqft')['floor_count'].transform(fill_with_group_mean)
    )
    data_x.drop(columns=['binned_sqft'], inplace=True)

    # Null building ages are filled per building, grouping on a temporary
    # categorical copy of building_id; both id columns are dropped afterwards.
    data_x['building_id_v2'] = data_x['building_id'].astype('category')
    data_x['building_age'] = (
        data_x.groupby('building_id_v2')['building_age'].transform(fill_with_group_mean)
    )
    data_x.drop(columns=['building_id_v2'], inplace=True)
    data_x.drop(columns=['building_id'], inplace=True)

    return data_x

def bin_sqft(df):
    """Label every row of ``df`` with the quartile bin of its ``square_feet``.

    Bins (boundaries are the 25th / 50th / 75th percentiles of the column;
    a value exactly on a boundary goes to the higher bin):

        ``'Low'``    : value <  25%
        ``'Middle'`` : 25% <= value < 50%
        ``'High'``   : 50% <= value < 75%
        ``'Top'``    : value >= 75%

    Args:
        df: DataFrame with a numeric ``square_feet`` column.

    Returns:
        An object-dtype ``pd.Series`` of labels with the same length and index
        as ``df``. Rows whose ``square_feet`` is missing get NaN, so the result
        always stays aligned with ``df``.
    """
    sqft = df['square_feet']
    q25, q50, q75 = sqft.quantile([0.25, 0.50, 0.75])

    # Compare on the raw array so the result is positional and independent of
    # whatever index labels the frame carries.
    values = sqft.to_numpy()
    labels = np.full(len(values), np.nan, dtype=object)

    # Assign from the lowest bin upwards; each later mask overrides the one
    # before it, mirroring an if/elif chain that tests the highest bin first.
    # Comparisons against NaN are False, so missing values stay NaN.
    labels[values < q25] = 'Low'
    labels[values >= q25] = 'Middle'
    labels[values >= q50] = 'High'
    labels[values >= q75] = 'Top'

    return pd.Series(labels, index=df.index)

def get_data(filename, row_num):
    """Read at most ``row_num`` rows of the CSV ``filename`` into a DataFrame."""
    return pd.read_csv(filename, nrows=row_num)

def process_data(df, pca_level):
    """Turn the raw joined frame into train/test matrices ready for modelling.

    Called by ``run_it``. Builds the features with ``feature_engine``, takes
    ``meter_reading`` as the target, makes a 70/30 train/test split
    (``random_state=4``) and runs the features through a preprocessing
    pipeline: mean imputation of nulls, min-max scaling, then PCA.

    Args:
        df: Raw DataFrame; must contain ``meter_reading`` plus every column
            ``feature_engine`` reads.
        pca_level: Passed to ``PCA`` as ``n_components``.

    Returns:
        ``(x_train_pp, x_test_pp, y_train, y_test)`` - the transformed train
        and test features followed by the untouched train and test targets.
    """
    features = feature_engine(df)
    target = df['meter_reading']

    # Split first so the pipeline below only learns from the training rows.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=4)

    preprocessor = Pipeline([
        ('Imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('Scaler', preprocessing.MinMaxScaler()),
        ('PCA', PCA(n_components=pca_level)),
    ])

    # Fit on the training features, then apply the fitted steps to the test set.
    x_train_pp = preprocessor.fit_transform(x_train)
    x_test_pp = preprocessor.transform(x_test)

    for message in ('\n', 'Completed Preprocessing and Dimensionality Reduction', '\n'):
        print(message)

    return x_train_pp, x_test_pp, y_train, y_test


# Preprocess the first 5,000,000 rows of the joined training file. The four
# resulting splits are module-level inputs for the model cells below.
x_train_pp, x_test_pp, y_train, y_test = run_it(file='Final_Data.csv', rows=5_000_000)

## XGBoost Model

In [4]:
# Hyperparameters (kept as module-level names; later cells read them)
n_est, max_depth = 25, 20
alpha, learning_rate = 11, 0.3

# XGBoost Random Forest Regressor model
# NOTE(review): XGBoost's random-forest documentation appears to recommend
# learning_rate=1 for the XGBRF* estimators - confirm whether 0.3 is intended.
xgb_reg_model = xgb.XGBRFRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    n_estimators=n_est,
    max_depth=max_depth,
    learning_rate=learning_rate,
    alpha=alpha,
    min_child_weight=2,
    colsample_bytree=1,
)

print("Beginning to Train XGBoost Model")

# Wall-clock the fit so training time can be compared across models
start = time.time()
xgb_reg_model.fit(x_train_pp, y_train)
end = time.time()
print('\n')
if (end - start) < 60:
    print("Training took approx. " + str(end - start) + " seconds")
else:
    print("Training took approx. " + str((end - start) / 60) + " minutes")

# Predict on the held-out split; absolute values keep the predictions
# non-negative before the log-based metric is computed
preds = np.absolute(xgb_reg_model.predict(x_test_pp))

# Report the competition metric (RMSLE) and the settings that produced it
print('RMSLE: ', np.sqrt(mean_squared_log_error(y_test, preds)))
print("N_estimators :", n_est)
print("Max_Depth: ", max_depth)
print("Alpha Val: ", alpha)
print("Learning Rate: ", learning_rate)

Beginning to Train XGBoost Model


  if getattr(data, 'base', None) is not None and \




Training took approx. 5.824481654167175 minutes
RMSLE:  1.6101240105571846
N_estimators : 25
Max_Depth:  20
Alpha Val:  11
Learning Rate:  0.3


## LightGBM Model

In [7]:
# LightGBM hyperparameters
lgb_n_est, lgb_depth = 20, 10
num_leaves, min_data_in_leaf = 100, 500

# LightGBM gradient-boosted regression model
LightGBM = lgb.sklearn.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=lgb_n_est,
    max_depth=lgb_depth,
    num_leaves=num_leaves,
    min_data_in_leaf=min_data_in_leaf,
)

print('Beginning to Train LightGBM Model')

# Wall-clock the fit so training time can be compared across models
start = time.time()
LightGBM.fit(x_train_pp, y_train)
end = time.time()

print('\n')
if (end - start) < 60:
    print("Training took approx. " + str(end - start) + " seconds")
else:
    print("Training took approx. " + str((end - start) / 60) + " minutes")

# Predict on the held-out split; absolute values keep the predictions
# non-negative before the log-based metric is computed
Light_Preds = np.absolute(LightGBM.predict(x_test_pp))

# Report the competition metric (RMSLE) for the LightGBM predictions
print('RMSLE: ', np.sqrt(mean_squared_log_error(y_test, Light_Preds)))
print("N_estimators :", lgb_n_est)
print("Max_Depth: ", lgb_depth)

Beginning to Train LightGBM Model


Training took approx. 5.595957279205322 seconds
RMSLE:  3.0893955867738305
N_estimators : 20
Max_Depth:  20
Alpha Val:  11
Learning Rate:  0.4


## CatBoost Model

In [None]:
# CatBoost hyperparameters, held in their own names so the summary printed at
# the end of this cell reports this model's settings rather than the XGBoost
# cell's n_est / max_depth / alpha / learning_rate globals.
cat_iterations = 10
cat_learning_rate = 0.4
cat_depth = 10
cat_l2_leaf_reg = 11

# CatBoost regression model trained with an RMSE loss
CatB_Reg = CatBoostRegressor(iterations=cat_iterations, learning_rate=cat_learning_rate,
                             depth=cat_depth, loss_function='RMSE',
                             l2_leaf_reg=cat_l2_leaf_reg)

print("Beginning to Train CatBoost Model")

# Wall-clock the fit so training time can be compared across models
start = time.time()
CatB_Reg.fit(x_train_pp, y_train)
end = time.time()

print('\n')
if (end-start)>=60:
    print("Training took approx. " + str((end-start)/60) + " minutes")
else:
    print("Training took approx. " + str(end-start) + " seconds")

# Make predictions on the testing set with the CatBoost model
Cat_Preds = CatB_Reg.predict(x_test_pp)
# mean_squared_log_error raises on negative predictions, so force them
# non-negative the same way the XGBoost and LightGBM cells do; this also keeps
# the three RMSLE figures comparable.
Cat_Preds = np.absolute(Cat_Preds)

# Compute the target metric for the CatBoost predictions
print('RMSLE: ', np.sqrt(mean_squared_log_error(y_test, Cat_Preds)))
print("N_estimators :", cat_iterations)
print("Max_Depth: ", cat_depth)
print("L2 Leaf Reg: ", cat_l2_leaf_reg)
print("Learning Rate: ", cat_learning_rate)

## Voting Ensemble Model