Route: Goal 2 -> 2.1 Feature Engineering#1   
Purpose: to implement feature engineering(full-dummy variables)  
Kernel: R

In [1]:
library(data.table)
library(nlme)
library(magrittr)

In [2]:
get.bike.data = function(data_dir){
  bike = read.csv(sprintf('%s/day.csv', data_dir), stringsAsFactors = FALSE)

  bike$weekday = factor(bike$weekday, levels=0:6
                        , labels = c('SUN', 'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT'))
  bike$holiday = factor(bike$holiday, levels = c(0,1), labels = c('0', '1'))
  bike$workingday = factor(bike$workingday, levels = c(0,1), labels = c('0', '1'))
  bike$season = factor(bike$season, levels = 1:4, labels = c('WINTER', 'SPRING', 'SUMMER', 'FALL'))
  bike$weathersit = factor(bike$weathersit, levels = 1:3, labels = c('GOOD', 'MISTY', 'RAIN/SNOW/STORM'))
  bike$mnth = factor(bike$mnth, levels = 1:12, labels = c('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OKT', 'NOV', 'DEZ'))
  bike$yr[bike$yr == 0] = 2011
  bike$yr[bike$yr == 1] = 2012
  bike$yr = factor(bike$yr)
  bike$days_since_2011 = day_diff(bike$dteday, min(as.Date(bike$dteday)))

  bike$temp = bike$temp * (39 - (-8)) + (-8)
  bike$atemp = bike$atemp * (50 - (16)) + (16)

  bike$windspeed = 67 * bike$windspeed
  bike$hum = 100 * bike$hum

  dplyr::select(bike, -instant, -dteday, -registered, -casual)
}

In [3]:
day_diff = function(date1, date2){
  as.numeric(difftime(as.Date(date1), as.Date(date2), units = 'days'))
}

In [4]:
bike = get.bike.data(sprintf('%s/DataSet', getwd()))
bike <- data.table(bike)

In [5]:
bike_onehot <- setDT(bike) %>%
  .[grepl("SPRING", season), c("season_SPRING","season_SUMMER", "season_FALL", "season_WINTER") := .(1,0,0,0)] %>%
  .[grepl("SUMMER", season), c("season_SPRING","season_SUMMER", "season_FALL", "season_WINTER") := .(0,1,0,0)] %>%
  .[grepl("FALL", season), c("season_SPRING","season_SUMMER", "season_FALL", "season_WINTER") := .(0,0,1,0)] %>%
  .[grepl("WINTER", season), c("season_SPRING","season_SUMMER", "season_FALL", "season_WINTER") := .(0,0,0,1)] 
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(season))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("2011", yr), c("yr_2011","yr_2012") := .(1,0)] %>%
  .[grepl("2012", yr), c("yr_2011","yr_2012") := .(0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(yr))
bike_onehot <- subset(bike_onehot, select = -c(yr_2012))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("JAN", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(1,0,0,0,0,0,0,0,0,0,0,0)] %>%
  .[grepl("FEB", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,1,0,0,0,0,0,0,0,0,0,0)] %>%
  .[grepl("MAR", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,1,0,0,0,0,0,0,0,0,0)] %>%
  .[grepl("APR", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,1,0,0,0,0,0,0,0,0)] %>%
  .[grepl("MAY", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,1,0,0,0,0,0,0,0)] %>%
  .[grepl("JUN", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,1,0,0,0,0,0,0)] %>%
  .[grepl("JUL", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,1,0,0,0,0,0)] %>%
  .[grepl("AUG", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,0,1,0,0,0,0)] %>%
  .[grepl("SEP", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,0,0,1,0,0,0)] %>%
  .[grepl("OKT", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,0,0,0,1,0,0)] %>%
  .[grepl("NOV", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,0,0,0,0,1,0)] %>%
  .[grepl("DEZ", mnth), c("mnth_JAN","mnth_FEB","mnth_MAR","mnth_APR","mnth_MAY","mnth_JUN","mnth_JUL","mnth_AUG","mnth_SEP","mnth_OKT","mnth_NOV","mnth_DEZ") := .(0,0,0,0,0,0,0,0,0,0,0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(mnth))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("0", holiday), c("holiday_NO_HOLIDAY","holiday_HOLIDAY") := .(1,0)] %>%
  .[grepl("1", holiday), c("holiday_NO_HOLIDAY","holiday_HOLIDAY") := .(0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(holiday))
bike_onehot <- subset(bike_onehot, select = -c(holiday_NO_HOLIDAY))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("SUN", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(1,0,0,0,0,0,0)] %>%
  .[grepl("MON", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,1,0,0,0,0,0)] %>%
  .[grepl("TUE", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,0,1,0,0,0,0)] %>%
  .[grepl("WED", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,0,0,1,0,0,0)] %>%
  .[grepl("THU", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,0,0,0,1,0,0)] %>%
  .[grepl("FRI", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,0,0,0,0,1,0)] %>%
  .[grepl("SAT", weekday), c("weekday_SUN","weekday_MON","weekday_TUE","weekday_WED","weekday_THU","weekday_FRI","weekday_SAT") := .(0,0,0,0,0,0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(weekday))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("0", workingday), c("holiday_NO_WORKING_DAY","holiday_WORKING_DAY") := .(1,0)] %>%
  .[grepl("1", workingday), c("holiday_NO_WORKING_DAY","holiday_WORKING_DAY") := .(0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(workingday))
bike_onehot <- subset(bike_onehot, select = -c(holiday_WORKING_DAY))

bike_onehot <- setDT(bike_onehot) %>%
  .[grepl("GOOD", weathersit), c("weathersit_GOOD","weathersit_MISTY","weathersit_BAD") := .(1,0,0)] %>%
  .[grepl("MISTY", weathersit), c("weathersit_GOOD","weathersit_MISTY","weathersit_BAD") := .(0,1,0)] %>%
  .[grepl("RAIN/SNOW/STORM", weathersit), c("weathersit_GOOD","weathersit_MISTY","weathersit_BAD") := .(0,0,1)]
bike_onehot <- data.table(bike_onehot)
bike_onehot <- subset(bike_onehot, select = -c(weathersit))

bike_onehot

temp,atemp,hum,windspeed,cnt,days_since_2011,season_SPRING,season_SUMMER,season_FALL,season_WINTER,⋯,weekday_MON,weekday_TUE,weekday_WED,weekday_THU,weekday_FRI,weekday_SAT,holiday_NO_WORKING_DAY,weathersit_GOOD,weathersit_MISTY,weathersit_BAD
<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
8.175849,28.36325,80.5833,10.749882,985,0,0,0,0,1,⋯,0,0,0,0,0,1,1,0,1,0
9.083466,28.02713,69.6087,16.652113,801,1,0,0,0,1,⋯,0,0,0,0,0,0,1,0,1,0
1.229108,22.43977,43.7273,16.636703,1349,2,0,0,0,1,⋯,1,0,0,0,0,0,0,1,0,0
1.400000,23.21215,59.0435,10.739832,1562,3,0,0,0,1,⋯,0,1,0,0,0,0,0,1,0,0
2.666979,23.79518,43.6957,12.522300,1600,4,0,0,0,1,⋯,0,0,1,0,0,0,0,1,0,0
1.604356,23.92911,51.8261,6.000868,1606,5,0,0,0,1,⋯,0,0,0,1,0,0,0,1,0,0
1.236534,23.10053,49.8696,11.304642,1510,6,0,0,0,1,⋯,0,0,0,0,1,0,0,0,1,0
-0.245000,21.51664,53.5833,17.875868,959,7,0,0,0,1,⋯,0,0,0,0,0,1,1,0,1,0
-1.498349,19.94995,43.4167,24.250650,822,8,0,0,0,1,⋯,0,0,0,0,0,0,1,1,0,0
-0.910849,21.13019,48.2917,14.958889,1321,9,0,0,0,1,⋯,1,0,0,0,0,0,0,1,0,0


In [6]:
# Switch the place of cnt
bike_final <- subset(bike_onehot, select = -c(cnt))
y1 = bike_onehot[,'cnt']
bike_final = cbind(bike_final, y1)
bike_final

temp,atemp,hum,windspeed,days_since_2011,season_SPRING,season_SUMMER,season_FALL,season_WINTER,yr_2011,⋯,weekday_TUE,weekday_WED,weekday_THU,weekday_FRI,weekday_SAT,holiday_NO_WORKING_DAY,weathersit_GOOD,weathersit_MISTY,weathersit_BAD,cnt
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
8.175849,28.36325,80.5833,10.749882,0,0,0,0,1,1,⋯,0,0,0,0,1,1,0,1,0,985
9.083466,28.02713,69.6087,16.652113,1,0,0,0,1,1,⋯,0,0,0,0,0,1,0,1,0,801
1.229108,22.43977,43.7273,16.636703,2,0,0,0,1,1,⋯,0,0,0,0,0,0,1,0,0,1349
1.400000,23.21215,59.0435,10.739832,3,0,0,0,1,1,⋯,1,0,0,0,0,0,1,0,0,1562
2.666979,23.79518,43.6957,12.522300,4,0,0,0,1,1,⋯,0,1,0,0,0,0,1,0,0,1600
1.604356,23.92911,51.8261,6.000868,5,0,0,0,1,1,⋯,0,0,1,0,0,0,1,0,0,1606
1.236534,23.10053,49.8696,11.304642,6,0,0,0,1,1,⋯,0,0,0,1,0,0,0,1,0,1510
-0.245000,21.51664,53.5833,17.875868,7,0,0,0,1,1,⋯,0,0,0,0,1,1,0,1,0,959
-1.498349,19.94995,43.4167,24.250650,8,0,0,0,1,1,⋯,0,0,0,0,0,1,1,0,0,822
-0.910849,21.13019,48.2917,14.958889,9,0,0,0,1,1,⋯,0,0,0,0,0,0,1,0,0,1321


In [7]:
write.csv(bike_final, sprintf('%s/DataSet/bike_FE1.csv', getwd()))