In [0]:
# Required libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import lightgbm as lgb
warnings.simplefilter('ignore')
matplotlib.rcParams['figure.dpi'] = 100
sns.set()
%matplotlib inline

In [4]:
#Start by connecting gdrive into the google colab
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#Loading data
building = pd.read_csv('gdrive/My Drive/ASHRAE/building_metadata.csv')
weather_train = pd.read_csv('gdrive/My Drive/ASHRAE/weather_train.csv')
weather_test = pd.read_csv('gdrive/My Drive/ASHRAE/weather_test.csv')
train = pd.read_csv('gdrive/My Drive/ASHRAE/train.csv')
test = pd.read_csv('gdrive/My Drive/ASHRAE/test.csv')

In [0]:
#Merging everything into two datasets: train and test
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')
gc.collect();

In [0]:
train["timestamp"] = pd.to_datetime(train["timestamp"])
train["hour"] = train["timestamp"].dt.hour
train["day"] = train["timestamp"].dt.day
train["weekend"] = train["timestamp"].dt.weekday
train["month"] = train["timestamp"].dt.month

In [0]:
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour
test["day"] = test["timestamp"].dt.day
test["weekend"] = test["timestamp"].dt.weekday
test["month"] = test["timestamp"].dt.month

In [0]:
train = train.drop("timestamp", axis = 1)
test = test.drop("timestamp", axis = 1)

In [0]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])
test["primary_use"] = le.fit_transform(test["primary_use"])

In [11]:
categoricals = ["building_id", "primary_use", "hour", "day", "weekend", "month", "meter"]
numericals = ["square_feet", "year_built", "air_temperature", "cloud_coverage","dew_temperature"]
feat_cols = categoricals + numericals
train[categoricals + numericals]
test[categoricals + numericals]

Unnamed: 0,building_id,primary_use,hour,day,weekend,month,meter,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature
0,0,0,0,1,6,1,0,7432,2008.0,17.8,4.0,11.7
1,1,0,0,1,6,1,0,2720,2004.0,17.8,4.0,11.7
2,2,0,0,1,6,1,0,5376,1991.0,17.8,4.0,11.7
3,3,0,0,1,6,1,0,23685,2002.0,17.8,4.0,11.7
4,4,0,0,1,6,1,0,116607,1975.0,17.8,4.0,11.7
...,...,...,...,...,...,...,...,...,...,...,...,...
41697595,1444,1,7,9,2,5,0,19619,1914.0,,,
41697596,1445,0,7,9,2,5,0,4298,,,,
41697597,1446,1,7,9,2,5,0,11265,1997.0,,,
41697598,1447,4,7,9,2,5,0,29775,2001.0,,,


In [0]:
target = np.log1p(train["meter_reading"])

In [0]:
train = train.drop(["site_id","floor_count"], axis = 1)
test = test.drop(["site_id","floor_count"], axis = 1)

In [0]:
#Based on this great kernel https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(df):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in df.columns:
        if df[col].dtype != object:  # Exclude strings            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",df[col].dtype)            
            # make variables for Int, max and min
            IsInt = False
            mx = df[col].max()
            mn = df[col].min()
            print("min for this col: ",mn)
            print("max for this col: ",mx)
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = df[col].fillna(0).astype(np.int64)
            result = (df[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    
            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",df[col].dtype)
            print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [15]:
train, NAlist = reduce_mem_usage(train)
test, NAlist = reduce_mem_usage(test)

Memory usage of properties dataframe is : 2776.2588500976562  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  meter_reading
dtype before:  float64
min for this col:  0.0
max for this col:  21904700.0
dtype after:  float32
******************************
******************************
Column:  primary_use
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  square_feet
dtype before:  int64
min for this col:  283
max for this col:  875000
dtype after:  uint32
******************************
******************************
Column:  year_built
dtype before:  float64
min for

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np

In [0]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=12,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
    verbose=0,
    warm_start=False,
)

In [23]:
rf_model.fit(train,target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features=12, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=True, random_state=0, verbose=0,
                      warm_start=False)

In [24]:
rf_model.feature_importances_

array([4.15728289e-03, 7.59651291e-03, 9.38214748e-01, 1.40478512e-04,
       4.95175723e-02, 7.64681345e-05, 1.22183046e-04, 0.00000000e+00,
       2.60255957e-05, 2.00070789e-05, 4.39790434e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       8.47425561e-05])

In [0]:
rf_predictions = rf_model.predict(test)

In [0]:
sub = pd.read_csv("gdrive/My Drive/ASHRAE/sample_submission.csv")

In [0]:
sub["meter_reading"] = rf_predictions

In [0]:
sub.to_csv("gdrive/My Drive/ASHRAE/submission-rf.csv", index = False)