In [82]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from sklearn.model_selection import StratifiedShuffleSplit
import os
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.model_selection import train_test_split

In [76]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

def check_dependency(df, ref_col, other_col):
    """
    This method takes in a dataframe and two column names. Then it runs
    a permutation test and returns a p value on whether the two columns are
    dependent.
    """
    #observed value
    gpA = df.loc[df[ref_col].isnull(), other_col]
    gpB = df.loc[df[ref_col].notnull(), other_col]
    obs = ks_2samp(gpA, gpB).statistic

    #permutation
    copy = df.copy()
    perm_results = []
    for i in range(100):
        copy[ref_col] = df[ref_col].sample(frac = 1, replace = False).reset_index(drop = True)
        gpA = copy.loc[copy[ref_col].isnull(), other_col]
        gpB = copy.loc[copy[ref_col].notnull(), other_col]
        perm_results.append(ks_2samp(gpA, gpB).statistic)
    pval = np.mean(np.array(perm_results) >= obs)
    return pval

def fill_floor_count(row, dict):
    """
    This method fill nans in floor_count.
    """
    if np.isnan(row.loc['floor_count']):
        return dict[row.loc['site_id']]
    return row.loc['floor_count']

def fill_year_built(row, dict):
    """
    This method fill nans in year_built.
    """
    if np.isnan(row.loc['year_built']):
        return dict[row.loc['site_id']]
    return row.loc['year_built']

def select_with_lin(lin_reg, all_combined, y):
    """
    This method selects the best feature according to the linear model.
    """
    r_sqr = {}
    for feat in all_combined.columns:
        X = all_combined[feat]
        lin_reg.fit(np.array(X).reshape(-1, 1), y)
        r_sqr[feat] = lin_reg.score(all_combined[[feat]], y)
    best_feat = max(r_sqr, key = r_sqr.get)
    return (best_feat, r_sqr[best_feat])

def feat_engi_test(test_train, weather_train, building_meta):
    """
    This method takes in three dataframes and conduct feature selection and
    engineering.
    """
    copy = test_train.copy()
    weather_building = weather_train.merge(building_meta, on = 'site_id', how = 'left')
    all_combined = copy.merge(weather_building, on = ['building_id', 'timestamp'], how = 'left')
    all_combined = all_combined.dropna()
    #test_meter = all_combined['meter_reading']
    X_y = all_combined[['meter_reading', 'air_temperature', 'square_feet', 'sea_level_pressure', 'wind_direction', 'dew_temperature']].dropna()
    X = X_y[['air_temperature', 'square_feet', 'sea_level_pressure', 'wind_direction', 'dew_temperature']]
    test_meter = X_y['meter_reading']

    return X, test_meter

def process_test(test, weather, building):
    test = test.reset_index(drop = True)
    test['timestamp'] = pd.to_datetime(test['timestamp'], format = "%Y-%m-%d %H:%M:%S")
    weather['timestamp'] = pd.to_datetime(weather['timestamp'], format = "%Y-%m-%d %H:%M:%S")
    return test, weather

def tree_reg_perf(X_train, y_train, X_test, y_test):
    result = []
    for i in range(21, 30):
        dtr = DecisionTreeRegressor(max_depth = i)
        dtr.fit(X_train, y_train)

        #train_err
        preds = dtr.predict(X_train)
        train_rmse = np.sqrt(np.mean((preds - y_train)**2))

        #test_err
        preds = dtr.predict(X_test)
        test_rmse = np.sqrt(np.mean((preds - y_test)**2))

        result.append([i, train_rmse, test_rmse])
    result = pd.DataFrame(result).set_index(0)
    result.columns = ['train_err', 'test_err']
    return result

In [6]:
# function for reducing df size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Loading Datasets

In [7]:
from subprocess import check_output
print(check_output(["ls", "../input/ashrae-energy-prediction"]).decode("utf8"))

building_metadata.csv
sample_submission.csv
test.csv
train.csv
weather_test.csv
weather_train.csv



In [8]:
fp = os.path.join('../input/ashrae-energy-prediction', 'train.csv')
train = pd.read_csv(fp)
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


We take 70% of the dataset as our train dataset, and 30% of it as our test dataset.

In [9]:
#form our smaller training dataset
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3)
for train_index, test_index in split.split(train, train['timestamp']):
    small_train = train.loc[train_index]
    test_train = train.loc[test_index]
display(small_train.head())
display(test_train.head())

Unnamed: 0,building_id,meter,timestamp,meter_reading
14352373,519,0,2016-09-18 14:00:00,1439.76
16405498,600,0,2016-10-24 22:00:00,64.5
6244425,926,1,2016-04-27 01:00:00,20.7494
8328546,1377,2,2016-06-03 02:00:00,350.443
7429238,218,0,2016-05-18 03:00:00,59.94


Unnamed: 0,building_id,meter,timestamp,meter_reading
15005955,185,0,2016-09-30 04:00:00,57.06
18083009,1331,2,2016-11-23 19:00:00,4713.28
1610669,220,0,2016-01-30 05:00:00,251.07
11079134,1097,0,2016-07-21 23:00:00,170.573
15795425,987,1,2016-10-14 02:00:00,825.334


In [10]:
small_train = small_train.reset_index(drop = True)
test_train = test_train.reset_index(drop = True)
display(small_train.head())
display(test_train.head())

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,519,0,2016-09-18 14:00:00,1439.76
1,600,0,2016-10-24 22:00:00,64.5
2,926,1,2016-04-27 01:00:00,20.7494
3,1377,2,2016-06-03 02:00:00,350.443
4,218,0,2016-05-18 03:00:00,59.94


Unnamed: 0,building_id,meter,timestamp,meter_reading
0,185,0,2016-09-30 04:00:00,57.06
1,1331,2,2016-11-23 19:00:00,4713.28
2,220,0,2016-01-30 05:00:00,251.07
3,1097,0,2016-07-21 23:00:00,170.573
4,987,1,2016-10-14 02:00:00,825.334


In [11]:
print(len(small_train)/len(train))
print(len(test_train)/len(train))

0.7
0.3


In [12]:
fp = os.path.join('../input/ashrae-energy-prediction', 'weather_train.csv')
weather_train = pd.read_csv(fp)
weather_train.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [13]:
fp = os.path.join('../input/ashrae-energy-prediction', 'building_metadata.csv')
building_meta = pd.read_csv(fp)
building_meta.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [14]:
# Reducing memory
small_train = reduce_mem_usage(small_train)
test_train = reduce_mem_usage(test_train)

weather_train = reduce_mem_usage(weather_train)
building_meta = reduce_mem_usage(building_meta)

Mem. usage decreased to 202.44 Mb (53.1% reduction)
Mem. usage decreased to 86.76 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [15]:
small_train['timestamp'] = pd.to_datetime(small_train['timestamp'], format = "%Y-%m-%d %H:%M:%S")
small_train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,519,0,2016-09-18 14:00:00,1439.76001
1,600,0,2016-10-24 22:00:00,64.5
2,926,1,2016-04-27 01:00:00,20.749399
3,1377,2,2016-06-03 02:00:00,350.442993
4,218,0,2016-05-18 03:00:00,59.939999


In [16]:
test_train['timestamp'] = pd.to_datetime(test_train['timestamp'], format = "%Y-%m-%d %H:%M:%S")
test_train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,185,0,2016-09-30 04:00:00,57.060001
1,1331,2,2016-11-23 19:00:00,4713.279785
2,220,0,2016-01-30 05:00:00,251.070007
3,1097,0,2016-07-21 23:00:00,170.572998
4,987,1,2016-10-14 02:00:00,825.333984


In [17]:
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], format = "%Y-%m-%d %H:%M:%S")
weather_train.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


**Missingness Assessment of All_Combined**

In [None]:
weather_building = weather_train.merge(building_meta, on = 'site_id', how = 'left')
weather_building

We form a new dataset: small_weather by merging small_train and weather_train. Then we will analyze the missingness in the dataframe.

In [45]:
#form the dataset
all_combined = small_train.merge(weather_building, on = ['building_id', 'timestamp'], how = 'left')
all_combined.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,primary_use,square_feet,year_built,floor_count
0,519,0,2016-09-18 14:00:00,1439.76001,3.0,24.40625,,22.203125,0.0,1019.0,170.0,4.601562,Public services,461647.0,1976.0,
1,600,0,2016-10-24 22:00:00,64.5,4.0,19.40625,,11.101562,,1014.5,180.0,10.296875,Education,44182.0,1917.0,5.0
2,926,1,2016-04-27 01:00:00,20.749399,9.0,23.90625,,21.703125,0.0,1001.0,160.0,5.699219,Education,26033.0,,
3,1377,2,2016-06-03 02:00:00,350.442993,15.0,20.0,4.0,16.09375,,1016.0,180.0,3.099609,Education,74599.0,1984.0,
4,218,0,2016-05-18 03:00:00,59.939999,2.0,25.59375,,5.0,0.0,1006.5,330.0,12.398438,Public services,16411.0,,


In [48]:
#check the missingness of each column
all_combined.isnull().sum()

building_id                  0
meter                        0
timestamp                    0
meter_reading                0
site_id                  63604
air_temperature          67889
cloud_coverage         6179606
dew_temperature          70291
precip_depth_1_hr      2624738
sea_level_pressure      862384
wind_direction         1013672
wind_speed              100861
primary_use              63604
square_feet              63604
year_built             8543477
floor_count           11703909
dtype: int64

## Feature Selection and Engineering

In [54]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

Building a decision tree model.

In [55]:
#independent columns
all_combined.iloc[:,3].astype(float)
#X = all_combined.iloc[:, [(i != 'meter_reading') & (i != 'primary_use') & (i != 'timestamp') for i in all_combined.columns]]
X_y = all_combined[['meter_reading', 'air_temperature', 'square_feet', 'sea_level_pressure', 'wind_direction', 'dew_temperature']].dropna()
X = X_y[['air_temperature', 'square_feet', 'sea_level_pressure', 'wind_direction', 'dew_temperature']]
display(X.head())

#target column
y = X_y['meter_reading']
display(y.head())

Unnamed: 0,air_temperature,square_feet,sea_level_pressure,wind_direction,dew_temperature
0,24.40625,461647.0,1019.0,170.0,22.203125
1,19.40625,44182.0,1014.5,180.0,11.101562
2,23.90625,26033.0,1001.0,160.0,21.703125
3,20.0,74599.0,1016.0,180.0,16.09375
4,25.59375,16411.0,1006.5,330.0,5.0


0    1439.760010
1      64.500000
2      20.749399
3     350.442993
4      59.939999
Name: meter_reading, dtype: float32

Do feature selection and engineering to test_train as we did to small_train.

In [60]:
test_train_copy, test_meter = feat_engi_test(test_train, weather_train, building_meta)
test_train_copy.head()

Unnamed: 0,air_temperature,square_feet,sea_level_pressure,wind_direction,dew_temperature
50,17.203125,86465.0,1011.5,150.0,12.203125
66,2.199219,220703.0,1029.0,60.0,-1.700195
89,22.796875,68146.0,1015.5,250.0,11.101562
132,13.296875,220703.0,1023.0,0.0,8.296875
204,20.0,15326.0,1012.0,210.0,13.296875


Building a linear model.

In [63]:
lin_reg = LinearRegression()
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [65]:
print(test_train_copy.shape)
test_train['meter_reading'].shape

(96531, 5)


(6064830,)

In [66]:
lin_reg.fit(X, y)
lin_reg.score(test_train_copy, test_meter)

-120.88631269679563

In [67]:
preds = lin_reg.predict(test_train_copy)
rmse = np.sqrt(np.mean((preds - test_meter)**2))
rmse

3156.425923633412

A Decision Tree Regressor.

In [68]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [77]:
tree_reg_perf(X, y, test_train_copy, test_meter)

Unnamed: 0_level_0,train_err,test_err
0,Unnamed: 1_level_1,Unnamed: 2_level_1
21,97829.426619,79.454525
22,97131.200555,75.35898
23,96792.401694,67.562846
24,96571.767716,67.484183
25,96496.799746,70.279251
26,96451.998817,66.659111
27,96436.608848,62.193843
28,96430.927914,58.300894
29,96429.66413,58.310982


24 is the best tree depth, so we adjust our model and fit.

In [78]:
dtr = DecisionTreeRegressor(max_depth = 24)
dtr

DecisionTreeRegressor(criterion='mse', max_depth=24, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [79]:
dtr.fit(X, y)
preds = dtr.predict(test_train_copy)
rmse = np.sqrt(np.mean((preds - test_meter)**2))
rmse

67.06724009412702

In [83]:
eclf = EnsembleVoteClassifier(clfs=[lin_reg, dtr], voting='hard')
eclf

EnsembleVoteClassifier(clfs=[LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=None, normalize=False),
                             DecisionTreeRegressor(criterion='mse',
                                                   max_depth=24,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort=False,
                                                   random_state=None,
                                                   spli

In [85]:
display(X.head())
test_train_copy.head()

Unnamed: 0,air_temperature,square_feet,sea_level_pressure,wind_direction,dew_temperature
0,24.40625,461647.0,1019.0,170.0,22.203125
1,19.40625,44182.0,1014.5,180.0,11.101562
2,23.90625,26033.0,1001.0,160.0,21.703125
3,20.0,74599.0,1016.0,180.0,16.09375
4,25.59375,16411.0,1006.5,330.0,5.0


Unnamed: 0,air_temperature,square_feet,sea_level_pressure,wind_direction,dew_temperature
50,17.203125,86465.0,1011.5,150.0,12.203125
66,2.199219,220703.0,1029.0,60.0,-1.700195
89,22.796875,68146.0,1015.5,250.0,11.101562
132,13.296875,220703.0,1023.0,0.0,8.296875
204,20.0,15326.0,1012.0,210.0,13.296875


In [88]:
eclf.fit(X, y)

EnsembleVoteClassifier(clfs=[LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=None, normalize=False),
                             DecisionTreeRegressor(criterion='mse',
                                                   max_depth=24,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort=False,
                                                   random_state=None,
                                                   spli

In [89]:
preds = eclf.predict(test_train_copy.head(1))
rmse = np.sqrt(np.mean((preds - test_meter)**2))
rmse

TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

Now we start to predict the test data using our fitted decision tree regressor.

In [90]:
import pandas as pd
sample_submission = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
test = pd.read_csv("../input/ashrae-energy-prediction/test.csv")
weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")

In [91]:
display(test.head())
display(weather_test.head())
display(building_meta.head())

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6
1,0,2017-01-01 01:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1
2,0,2017-01-01 02:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1
3,0,2017-01-01 03:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1
4,0,2017-01-01 04:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6


Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [92]:
# Reducing memory
test = reduce_mem_usage(test)
weather_test = reduce_mem_usage(weather_test)

Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)


In [93]:
test, weather = process_test(test, weather_test, building_meta)
display(test.head())
display(weather.head())

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01
1,1,1,0,2017-01-01
2,2,2,0,2017-01-01
3,3,3,0,2017-01-01
4,4,4,0,2017-01-01


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,0,2017-01-01 01:00:00,17.796875,2.0,12.796875,0.0,1022.0,130.0,3.099609
2,0,2017-01-01 02:00:00,16.09375,0.0,12.796875,0.0,1022.0,140.0,3.099609
3,0,2017-01-01 03:00:00,17.203125,0.0,13.296875,0.0,1022.0,140.0,3.099609
4,0,2017-01-01 04:00:00,16.703125,2.0,13.296875,0.0,1022.5,130.0,2.599609


In [94]:
weather_building = weather.merge(building_meta, on = 'site_id', how = 'left')
all_combined = test.merge(weather_building, on = ['building_id', 'timestamp'], how = 'left')
all_combined.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,primary_use,square_feet,year_built,floor_count
0,0,0,0,2017-01-01,0.0,17.796875,4.0,11.703125,,1021.5,100.0,3.599609,Education,7432.0,2008.0,
1,1,1,0,2017-01-01,0.0,17.796875,4.0,11.703125,,1021.5,100.0,3.599609,Education,2720.0,2004.0,
2,2,2,0,2017-01-01,0.0,17.796875,4.0,11.703125,,1021.5,100.0,3.599609,Education,5376.0,1991.0,
3,3,3,0,2017-01-01,0.0,17.796875,4.0,11.703125,,1021.5,100.0,3.599609,Education,23685.0,2002.0,
4,4,4,0,2017-01-01,0.0,17.796875,4.0,11.703125,,1021.5,100.0,3.599609,Education,116607.0,1975.0,


In [95]:
#independent columns
X = all_combined[['air_temperature', 'square_feet', 'sea_level_pressure', 'wind_direction', 'dew_temperature']].dropna()
display(X.head())

Unnamed: 0,air_temperature,square_feet,sea_level_pressure,wind_direction,dew_temperature
0,17.796875,7432.0,1021.5,100.0,11.703125
1,17.796875,2720.0,1021.5,100.0,11.703125
2,17.796875,5376.0,1021.5,100.0,11.703125
3,17.796875,23685.0,1021.5,100.0,11.703125
4,17.796875,116607.0,1021.5,100.0,11.703125


In [99]:
preds = dtr.predict(X)
preds

array([ 10.49923455,   0.        ,   1.05970395, ...,  63.30888244,
       261.30808021,   3.4083213 ])

In [100]:
np.mean(preds > 0)

0.9875540899406282