In [24]:
from data_preprocessing import decompress_pickle, compressed_pickle
import pandas as pd
from sklearn import preprocessing

In [34]:
# Load the bike-rental DataFrame from the compressed pickle under ../data/preprocessed.
# NOTE(review): decompress_pickle is a project helper; if it unpickles, only use it on trusted files.
df = decompress_pickle("../data/preprocessed/BikeRental_complete.pbz2")

In [35]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,datetime,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01 00:00:00,2011-01-01,1.0,0.0,1,0,0.0,5,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,2011-01-01,1.0,0.0,1,1,0.0,5,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,2011-01-01,1.0,0.0,1,2,0.0,5,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,2011-01-01,1.0,0.0,1,3,0.0,5,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,2011-01-01,1.0,0.0,1,4,0.0,5,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0


### drop leakage variables

In [36]:
# casual and registered add up to cnt in the preview rows above,
# so they are removed as leakage variables.
leak_var = ["casual", "registered"]
df = df.drop(columns=leak_var)

In [37]:
# List the remaining columns after dropping the leakage variables.
df.columns

Index(['datetime', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday',
       'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'cnt'],
      dtype='object')

### drop highly correlated variables (temp and atemp have Pearson correlation of 0.99)

In [10]:
# atemp is removed because it is almost perfectly correlated with temp (see heading above).
high_corr_var = ["atemp"]
df = df.drop(columns=high_corr_var)

### drop redundant dteday variable

In [11]:
# In the preview rows dteday only repeats the date part of datetime, so it is removed.
red_var = ["dteday"]
df = df.drop(columns=red_var)

In [12]:
# Inspect the column dtypes before coercing the categorical columns.
df.dtypes

datetime      datetime64[ns]
season               float64
yr                   float64
mnth                   int64
hr                     int64
holiday              float64
weekday                int64
workingday           float64
weathersit           float64
temp                 float64
hum                  float64
windspeed            float64
cnt                  float64
dtype: object

### coerce correct data types

In [13]:
# Columns that hold discrete codes and should be treated as categorical.
cat_var = [
    "season", "yr", "mnth", "hr",
    "holiday", "weekday", "workingday", "weathersit",
]

# Convert all listed columns to the pandas category dtype in a single assignment.
df[cat_var] = df[cat_var].astype("category")

In [14]:
# Re-check the dtypes after the categorical conversion.
df.dtypes

datetime      datetime64[ns]
season              category
yr                  category
mnth                category
hr                  category
holiday             category
weekday             category
workingday          category
weathersit          category
temp                 float64
hum                  float64
windspeed            float64
cnt                  float64
dtype: object

### normalize continuous variables

In [15]:
# Columns that are min-max scaled further down.
# NOTE(review): hum is also float64 but is not listed here — confirm it is intentionally left unscaled.
cont_var = ["temp", "windspeed", "cnt"]

In [19]:
# Per-column minimum of the cont_var columns, shown ahead of the scaling cell.
df[cont_var].min()

temp         0.02
windspeed    0.00
cnt          1.00
dtype: float64

In [18]:
# Per-column maximum of the cont_var columns, shown ahead of the scaling cell.
df[cont_var].max()

temp           1.0000
windspeed      0.8507
cnt          977.0000
dtype: float64

In [22]:
# store cnt max mins
count_var = ["cnt"]
max_count = pd.DataFrame(df[count_var].max())
min_count = pd.DataFrame(df[count_var].min())
max_min_count = pd.concat([max_count, min_count], axis=1)
max_min_count.columns = ["max", "min"]

In [28]:
# Fit the min-max scaler on the continuous columns, then overwrite them with the scaled values.
mm_scaler = preprocessing.MinMaxScaler().fit(df[cont_var])
df[cont_var] = mm_scaler.transform(df[cont_var])

In [30]:
# Per-column minimum after min-max scaling.
df[cont_var].min()

temp         0.0
windspeed    0.0
cnt          0.0
dtype: float64

In [31]:
# Per-column maximum after min-max scaling.
df[cont_var].max()

temp         1.0
windspeed    1.0
cnt          1.0
dtype: float64

In [32]:
# Preview the frame after the transformations above.
df.head()

Unnamed: 0,datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt
0,2011-01-01 00:00:00,1.0,0.0,1,0,0.0,5,0.0,1.0,0.22449,0.81,0.0,0.015369
1,2011-01-01 01:00:00,1.0,0.0,1,1,0.0,5,0.0,1.0,0.204082,0.8,0.0,0.039959
2,2011-01-01 02:00:00,1.0,0.0,1,2,0.0,5,0.0,1.0,0.204082,0.8,0.0,0.031762
3,2011-01-01 03:00:00,1.0,0.0,1,3,0.0,5,0.0,1.0,0.22449,0.75,0.0,0.012295
4,2011-01-01 04:00:00,1.0,0.0,1,4,0.0,5,0.0,1.0,0.22449,0.75,0.0,0.0
