In [6]:
#By Emmanuel Cocom
import pandas as pd
import numpy as np

### READ RAW DATA

In [7]:
#link to problem on kaggle
#https://www.kaggle.com/c/bike-sharing-demand/data

#get untouched dataset
bycle_df = pd.read_csv('train.csv')

#columns
bycle_df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [8]:
#shape
bycle_df.shape

(10886, 12)

In [9]:
#df printed out to see raw data
bycle_df[:350:50]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,1/1/11 0:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
50,1/3/11 5:00,1,0,1,1,6.56,6.82,47,19.0012,0,3,3
100,1/5/11 9:00,1,0,1,1,9.02,9.85,37,22.0028,6,109,115
150,1/7/11 13:00,1,0,1,2,8.2,9.09,37,19.0012,9,64,73
200,1/9/11 15:00,1,0,0,1,9.02,9.85,35,23.9994,5,77,82
250,1/11/11 19:00,1,0,1,3,6.56,11.365,93,0.0,0,51,51
300,1/13/11 23:00,1,0,1,1,4.92,6.82,50,12.998,1,14,15


### MUST DROP BOTH CASUAL AND REGISTERED COLUMNS.  LEAKAGE VARIABLES 

They are another representation of the label we are trying to predict 'count' in two columns

registered + casual = count (label)

we will not know registered, casual data in ever in any real life testing because they are the label 'count' we are trying predict! 

Keeping them jeapordizes the integrity and usefulness of our model as it will rely on those columns to predict the label, but they are the label

In [10]:
#Drop casual
bycle_df.drop('casual', axis=1, inplace=True)

#DROP registered
bycle_df.drop('registered', axis=1, inplace=True)
              
              
bycle_df.columns
#both casual and registered are gone

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'count'],
      dtype='object')

## Unsupported Formats:    Datetime column cannot be used in it's current format:  date hour:00.

#### Data will be extracted from datetime column to form three new columns: 

hour : 0-23

day_of_year: 0 - 365 (but raw data only provides up to 364)

weekday: 0-6 (0-Sun, 6-Sat)

In [11]:
from datetime import datetime

#extract hour from datetime
bycle_df['hour'] = bycle_df.datetime.apply(lambda x : x.split()[1].split(':')[0])

#extract day_of_year from datetime
bycle_df['day_of_year'] = bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%-j'))

#extract weekday from datetime
bycle_df['weekday']= bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%w'))

#bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%w'))

#extract weekday from datetime
bycle_df['year']= bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%Y')).to_frame('year')

#extract weekday from datetime
bycle_df['month']= bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%-m')).to_frame('month')


bycle_df.head()

#adding month year features to see if model improves if we can predict by month or year 
#year_df = bycle_df.datetime.apply(lambda x : datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%Y')).to_frame('year')
#datetime_df = bycle_df.datetime.apply(lambda x: datetime.strptime(x,  '%m/%d/%y %H:00').strftime('%-m')).to_frame('month')
#month_year_df = pd.concat([year_df ,datetime_df ], axis=1)
#month_year_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour,day_of_year,weekday,year,month
0,1/1/11 0:00,1,0,0,1,9.84,14.395,81,0.0,16,0,1,6,2011,1
1,1/1/11 1:00,1,0,0,1,9.02,13.635,80,0.0,40,1,1,6,2011,1
2,1/1/11 2:00,1,0,0,1,9.02,13.635,80,0.0,32,2,1,6,2011,1
3,1/1/11 3:00,1,0,0,1,9.84,14.395,75,0.0,13,3,1,6,2011,1
4,1/1/11 4:00,1,0,0,1,9.84,14.395,75,0.0,1,4,1,6,2011,1


### MISSING VALUES and Corrupt data values

In [12]:
#HELPER FUNCTIONS, TO TEST FOR MISSING OR CORRUPT DATA IN EACH COLUMN/FEATURE

#helper for string data
def check_if_missing_strings(value):
    if value == None or value.strip() == '' or value.lower() == 'nan' or value == np.nan:
        print(value, j, ' is the missing value')
        return np.nan
    else:
        return value
    

    
#Helper for numerical data
def check_if_empty_ints(value):
    if value == '' or value == None or value == 'nan' or value == np.nan:
        print('missing value')
    else: 
        pass
    
def check_if_less_one(value):
    if value <1:
        print('less than one, check it out!')
        
def check_if_greater_than(value, limit):
    if  value > limit:
        print('less than one, check it out!')
def check_if_less_than(value, limit):
    if value < limit:
        print('less than one, check it out!')
def check_if_less_than_x_greater_than_y(value, x_limit, y_limit):
    if value < x_limit:
        print(value)
        print(' is dangerous data and less than ', x_limit)
    if value > y_limit:
        print(value)
        print(' is dangerous data and greater than ', y_limit)

#checking for both missing and corrupt data( data that does not make sense )
#datetime stamp :: clear
bycle_df.datetime.apply(lambda x: check_if_missing_strings(x)) #results show NO missing values

#season :: clear
bycle_df.season.apply(lambda x: check_if_empty_ints(x))#results show NO missing values
bycle_df.season.apply(lambda x: check_if_less_one(x))#results show NO corrupt values

#holiday :: clear
bycle_df.holiday.apply(lambda x: check_if_empty_ints(x))#results show NO missing values
bycle_df.holiday.apply(lambda x: check_if_less_than_x_greater_than_y(x, 0, 1))#results show NO corrupt values
bycle_df.holiday.unique()

#workingday:: clear
bycle_df.workingday.apply(lambda x: check_if_empty_ints(x))#results show NO missing values
bycle_df.workingday.apply(lambda x: check_if_less_than_x_greater_than_y(x, 0, 1))#results show NO corrupt values
bycle_df.workingday.unique()

#weather:: clear
bycle_df.weather.apply(lambda x: check_if_empty_ints(x))#results show NO missing values
bycle_df.weather.apply(lambda x: check_if_less_than_x_greater_than_y(x, 1, 4))#results show NO corrupt values
bycle_df.weather.unique()

#atemp :: clear
bycle_df.atemp.apply(lambda x: check_if_empty_ints(x))#results show NO missing values
bycle_df.atemp.apply(lambda x: check_if_less_than(x, 0.1))#results show NO corrupt values
bycle_df.atemp.unique()

#-humidity :: NOT CLEAR! --> NO MISSING DATA: HOWEVER ---> CORRUPT DATA: YES 
bycle_df.humidity.apply(lambda x: check_if_empty_ints(x)) #results show NO missing values
bycle_df.humidity.unique() #results show YES Corrupt Data, as it's impossible to have 0 humidity on earth

#windspeed :: clear
bycle_df.windspeed.apply(lambda x: check_if_empty_ints(x)) #results show NO missing values
bycle_df.windspeed.unique() #results show No Missing Data, as it is possible to have 0 windspeed

#hour :: clear
bycle_df.hour.apply(lambda x: check_if_empty_ints(x)) #results show NO missing values
bycle_df.hour.apply(lambda x: check_if_less_than_x_greater_than_y(int(x), 0, 23))#results show NO corrupt data values
bycle_df.hour.unique() #results show No Missing Data,

#day_of_year :: clear
bycle_df.day_of_year.apply(lambda x: check_if_empty_ints(x)) #results show NO missing values
bycle_df.day_of_year.apply(lambda x: check_if_less_than_x_greater_than_y(int(x), int(0), int(365)))#results show NO corrupt data values
bycle_df.day_of_year.unique() #results show No Missing Data,

#Weekday :: clear
bycle_df.weekday.apply(lambda x: check_if_empty_ints(x)) #results show NO missing values
bycle_df.weekday.apply(lambda x: check_if_less_than_x_greater_than_y(int(x), int(0), int(6)))#results show NO corrupt data values
bycle_df.weekday.unique() #results show No Missing Data,
print('testing done')


testing done


In [13]:
print(bycle_df.shape)
bycle_df.head()

(10886, 15)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour,day_of_year,weekday,year,month
0,1/1/11 0:00,1,0,0,1,9.84,14.395,81,0.0,16,0,1,6,2011,1
1,1/1/11 1:00,1,0,0,1,9.02,13.635,80,0.0,40,1,1,6,2011,1
2,1/1/11 2:00,1,0,0,1,9.02,13.635,80,0.0,32,2,1,6,2011,1
3,1/1/11 3:00,1,0,0,1,9.84,14.395,75,0.0,13,3,1,6,2011,1
4,1/1/11 4:00,1,0,0,1,9.84,14.395,75,0.0,1,4,1,6,2011,1


### Fixing Corrupt Data - By replacing it with column average

In [14]:
#fixing humidity by adding average of the column to all zero values (as zero is impossible value it means the data is just missing)
import numpy as np
def nan_if_zero(value):
    if value == 0:
        return np.nan
    return value


def mean_if_nan(value, mean):
    if value == 0:
        return mean
    else:
        return value
    
    
#verify corrupt data
bycle_df['humidity'].unique() 

#changes zero corrupt data to numpy.nan value
random_var = bycle_df.humidity.apply(lambda x: nan_if_zero(x))

#finds mean of column, ignores nan values by default
mean_humid = random_var.mean() 


#apply mean to all zero corrupt data values in original df column humidity
bycle_df['humidity'] = bycle_df.humidity.apply(lambda x: mean_if_nan(int(x), int(mean_humid)))

#DATA IS CLEARED NOW --> NO CORRUPTION IN HUMIDITY COLUMN
bycle_df['humidity'].unique()



array([ 81,  80,  75,  86,  76,  77,  72,  82,  88,  87,  94, 100,  71,
        66,  57,  46,  42,  39,  44,  47,  50,  43,  40,  35,  30,  32,
        64,  69,  55,  59,  63,  68,  74,  51,  56,  52,  49,  48,  37,
        33,  28,  38,  36,  93,  29,  53,  34,  54,  41,  45,  92,  62,
        58,  61,  60,  65,  70,  27,  25,  26,  31,  73,  21,  24,  23,
        22,  19,  15,  67,  10,   8,  12,  14,  13,  17,  16,  18,  20,
        85,  83,  84,  78,  79,  89,  97,  90,  96,  91])

In [15]:
#Numerical features - scaled

### One Hot Encode categorical features that are non-binary

### Looking at all possible values for all columns that need one Hot Encoding

In [16]:
#categorical features
cat_features = ['season', 'holiday','workingday', 'weather', 'hour', 'month']


#looking for non-binary categorical features to OneHotEncode
non_binary_cat_features = {}
for x in cat_features:
    if len(bycle_df[x].unique()) > 2: #only if they have 3 or more possible values
        non_binary_cat_features[x] = bycle_df[x].unique()

print('The following need to go through One Hot Encoded Transformation:\n')
for x,y in non_binary_cat_features.items():
    print(x, 'has the features ', y)
    
#features that do no not need to be one hot encoded
non_encoded_features = ['holiday','workingday', 'temp','atemp', 'humidity', 'windspeed', 'day_of_year', 'weekday', 'year']



The following need to go through One Hot Encoded Transformation:

season has the features  [1 2 3 4]
weather has the features  [1 2 3 4]
hour has the features  ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15'
 '16' '17' '18' '19' '20' '21' '22' '23']
month has the features  ['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12']


Column 'Year needs to be changed to binary form'

In [17]:
def year_to_bin(value):
    if value == '2011':
        return 0
    return 1

bycle_df['year']= bycle_df.year.apply(lambda x : year_to_bin(x))

In [18]:
bycle_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour,day_of_year,weekday,year,month
0,1/1/11 0:00,1,0,0,1,9.84,14.395,81,0.0,16,0,1,6,0,1
1,1/1/11 1:00,1,0,0,1,9.02,13.635,80,0.0,40,1,1,6,0,1
2,1/1/11 2:00,1,0,0,1,9.02,13.635,80,0.0,32,2,1,6,0,1
3,1/1/11 3:00,1,0,0,1,9.84,14.395,75,0.0,13,3,1,6,0,1
4,1/1/11 4:00,1,0,0,1,9.84,14.395,75,0.0,1,4,1,6,0,1


### Column 'hour' will be put into 4 bins of 6 hours. This is reduce the number of features that will be created due to OneHotEncoding. Features created will be 4 instead of 24.

In [19]:
#HELPER FUNCTION TO PUT HOUR VALUES INTO BINS
def four_hour_bins(hour):
    hour = int(hour)
    if hour <=5:
        return 1
    elif hour <=11:
        return 2
    elif hour <=17:
        return 3
    else:
        return 4

bycle_df['hour'] = bycle_df.hour.apply(lambda x : four_hour_bins(x))
print('The possible choicse of column hour after putting into bins')
print(bycle_df['hour'].unique())

The possible choicse of column hour after putting into bins
[1 2 3 4]


In [20]:
bycle_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour,day_of_year,weekday,year,month
0,1/1/11 0:00,1,0,0,1,9.84,14.395,81,0.0,16,1,1,6,0,1
1,1/1/11 1:00,1,0,0,1,9.02,13.635,80,0.0,40,1,1,6,0,1
2,1/1/11 2:00,1,0,0,1,9.02,13.635,80,0.0,32,1,1,6,0,1
3,1/1/11 3:00,1,0,0,1,9.84,14.395,75,0.0,13,1,1,6,0,1
4,1/1/11 4:00,1,0,0,1,9.84,14.395,75,0.0,1,1,1,6,0,1


#### Actual OHE Proces

In [21]:
#One hot encoding

from sklearn.preprocessing import OneHotEncoder
#create OneHotEncoder object for each one
one_hot_encod_season = OneHotEncoder()
one_hot_encod_weather = OneHotEncoder()
one_hot_encod_hour = OneHotEncoder()
one_hot_encod_month = OneHotEncoder()



#transform values to OneHotEncoding values with new columns for each feature
x_season = one_hot_encod_season.fit_transform(bycle_df.season.values.reshape(-1,1)).toarray()
x_weather = one_hot_encod_weather.fit_transform(bycle_df.weather.values.reshape(-1,1)).toarray()
x_hour = one_hot_encod_weather.fit_transform(bycle_df.weather.values.reshape(-1,1)).toarray()
x_month = one_hot_encod_month.fit_transform(bycle_df.month.values.reshape(-1,1)).toarray()

print(x_hour)

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [22]:
#Make a data data frame for each categorical feature that was one hot encoded using numpy results and proper column names
df_bycle_ohe_season = pd.DataFrame(x_season, columns = [' spring,', 'summer', 'fall', 'winter'])
df_bycle_ohe_weather = pd.DataFrame(x_weather, columns = [' clear', 'mist', 'light', 'heavy_rain'])
df_bycle_ohe_hour = pd.DataFrame(x_hour, columns = [' EarlyMorning', 'Morning', 'Evening', 'Night'])
df_bycle_ohe_month = pd.DataFrame(x_month, columns = ['Jan', 'Feb', 'March', 'April', 'May', 'June', 'July', 'August', 'Sept','Oct','Nov','Dec'])



print(df_bycle_ohe_season[:300:50])
print('\n\n\n')
print(df_bycle_ohe_weather[:300:50])

print('\n\n\n')
print(df_bycle_ohe_hour[:300:50])

print('\n\n\n')
print(df_bycle_ohe_month[:300:50])

#concatenate all the individual one hot encoded dataframes into one dataframe
df_bycle_ohe_feature_matrix = pd.concat([df_bycle_ohe_season,df_bycle_ohe_weather ], axis=1)
df_bycle_ohe_feature_matrix = pd.concat([df_bycle_ohe_feature_matrix, df_bycle_ohe_hour ], axis=1)
df_bycle_ohe_feature_matrix = pd.concat([df_bycle_ohe_feature_matrix, df_bycle_ohe_month ], axis=1)



#print out new df containing onehotencoding columns and values
print('\n\n\nOneHotEncoded DF:\n\n')
print(df_bycle_ohe_feature_matrix[:300:50])
non_encoded_feature_matrix = bycle_df[non_encoded_features]

      spring,  summer  fall  winter
0         1.0     0.0   0.0     0.0
50        1.0     0.0   0.0     0.0
100       1.0     0.0   0.0     0.0
150       1.0     0.0   0.0     0.0
200       1.0     0.0   0.0     0.0
250       1.0     0.0   0.0     0.0




      clear  mist  light  heavy_rain
0       1.0   0.0    0.0         0.0
50      1.0   0.0    0.0         0.0
100     1.0   0.0    0.0         0.0
150     0.0   1.0    0.0         0.0
200     1.0   0.0    0.0         0.0
250     0.0   0.0    1.0         0.0




      EarlyMorning  Morning  Evening  Night
0              1.0      0.0      0.0    0.0
50             1.0      0.0      0.0    0.0
100            1.0      0.0      0.0    0.0
150            0.0      1.0      0.0    0.0
200            1.0      0.0      0.0    0.0
250            0.0      0.0      1.0    0.0




     Jan  Feb  March  April  May  June  July  August  Sept  Oct  Nov  Dec
0    1.0  0.0    0.0    0.0  0.0   0.0   0.0     0.0   0.0  0.0  0.0  0.0
50   1.0  0.0    0.0 

### Features & Labels

#### New Feature Matrix-- Combining numerical and OneHotEncoded Featuress

In [23]:
#Labels
label = bycle_df['count']
print(label.shape)
print('label is count: \n')
print(label[0:300:50])

#combine non encoded and encoded feature matrices
bycle_feature_matrix = pd.concat([non_encoded_feature_matrix, df_bycle_ohe_feature_matrix], axis = 1)


#columns of new feature matrix
print('\n\nfeature matrix columns')
print('column names are \n', bycle_feature_matrix.columns)

#print(bycle_feature_matrix.head())
#df_to_be_used_later = bycle_feature_matrix.copy()
#df_to_be_used_later = pd.concat([df_to_be_used_later ,df_bycle_ohe_weather ], axis=1)
#df_to_be_used_later.head()

(10886,)
label is count: 

0       16
50       3
100    115
150     73
200     82
250     51
Name: count, dtype: int64


feature matrix columns
column names are 
 Index(['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
       'day_of_year', 'weekday', 'year', ' spring,', 'summer', 'fall',
       'winter', ' clear', 'mist', 'light', 'heavy_rain', ' EarlyMorning',
       'Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May',
       'June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],
      dtype='object')


#### We will make a Models Based Of Each Month instead of a model for every month in the year, so a df split will created for each month

In [24]:
#first bring it back together, so labels are paired off correctly with f matrix as data is filtered and split
df_with_label = pd.concat([bycle_feature_matrix, label], axis = 1)
df_with_label.head()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,day_of_year,weekday,year,"spring,",...,April,May,June,July,August,Sept,Oct,Nov,Dec,count
0,0,0,9.84,14.395,81,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
1,0,0,9.02,13.635,80,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
2,0,0,9.02,13.635,80,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32
3,0,0,9.84,14.395,75,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
4,0,0,9.84,14.395,75,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [25]:
df_splits = []
months = ['Jan', 'Feb', 'March', 'April', 'May', 'June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec']

#0- feature_matrix 1-label
for month in months:
    df_month_fmatrix = df_with_label[df_with_label[month] == 1.0]#filter out rows for only that month

    label_month = df_month_fmatrix['count'] #labels are extracted for filtered rows
    
    del df_month_fmatrix['count'] #label is dropped from feature matrix
    df_splits.append([df_month_fmatrix, label_month]) #feature matrix and labels are put into a list
print('\n\n\n\n')

for x in range(len(df_splits)):
    print('Month: ', months[x])
    print('shape fmatrix', df_splits[x][0].shape)
    print('shape label is', df_splits[x][1].shape)
    print('\n')
    







Month:  Jan
shape fmatrix (884, 33)
shape label is (884,)


Month:  Feb
shape fmatrix (901, 33)
shape label is (901,)


Month:  March
shape fmatrix (901, 33)
shape label is (901,)


Month:  April
shape fmatrix (909, 33)
shape label is (909,)


Month:  May
shape fmatrix (912, 33)
shape label is (912,)


Month:  June
shape fmatrix (912, 33)
shape label is (912,)


Month:  July
shape fmatrix (912, 33)
shape label is (912,)


Month:  August
shape fmatrix (912, 33)
shape label is (912,)


Month:  Sept
shape fmatrix (909, 33)
shape label is (909,)


Month:  Oct
shape fmatrix (911, 33)
shape label is (911,)


Month:  Nov
shape fmatrix (911, 33)
shape label is (911,)


Month:  Dec
shape fmatrix (912, 33)
shape label is (912,)




### NORMALIZING DATA

#### Normalize data for each month

In [26]:
from sklearn import preprocessing

#normalize data
for x in range(len(df_splits)):
    #scale it -> d type changes to numpy array
    scaled_feature_matrix_month_numpyarray = preprocessing.scale(df_splits[x][0])

    #change back to df
    df_month_scaled = pd.DataFrame(scaled_feature_matrix_month_numpyarray, columns = df_splits[x][0].columns)
    
    #store back the scaled data back into list.
    df_splits[x][0] = df_month_scaled

print('sample of list of stored monthly dataframes and label\n\n')
print(df_splits[0][0].head())
print(df_splits[0][1][:5:])

sample of list of stored monthly dataframes and label


    holiday  workingday          temp     atemp  humidity  windspeed  \
0 -0.295518   -1.283241 -4.389351e-16  0.479102  1.436178    -1.7049   
1 -0.295518   -1.283241 -2.026208e-01  0.321980  1.378933    -1.7049   
2 -0.295518   -1.283241 -2.026208e-01  0.321980  1.378933    -1.7049   
3 -0.295518   -1.283241 -4.389351e-16  0.479102  1.092707    -1.7049   
4 -0.295518   -1.283241 -4.389351e-16  0.479102  1.092707    -1.7049   

   day_of_year   weekday      year   spring, ...   March  April  May  June  \
0    -1.640371  1.600362 -1.025204       0.0 ...     0.0    0.0  0.0   0.0   
1    -1.640371  1.600362 -1.025204       0.0 ...     0.0    0.0  0.0   0.0   
2    -1.640371  1.600362 -1.025204       0.0 ...     0.0    0.0  0.0   0.0   
3    -1.640371  1.600362 -1.025204       0.0 ...     0.0    0.0  0.0   0.0   
4    -1.640371  1.600362 -1.025204       0.0 ...     0.0    0.0  0.0   0.0   

   July  August  Sept  Oct  Nov  Dec  
0  

## ALGORITHM 1: Linear Regression

In [27]:
bycle_feature_matrix.head()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,day_of_year,weekday,year,"spring,",...,March,April,May,June,July,August,Sept,Oct,Nov,Dec
0,0,0,9.84,14.395,81,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,9.02,13.635,80,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,9.02,13.635,80,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,9.84,14.395,75,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,9.84,14.395,75,0.0,1,6,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print(bycle_feature_matrix.columns)

Index(['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
       'day_of_year', 'weekday', 'year', ' spring,', 'summer', 'fall',
       'winter', ' clear', 'mist', 'light', 'heavy_rain', ' EarlyMorning',
       'Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May',
       'June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],
      dtype='object')


### X_train, X_test, y_train, y_test for each month of year - going to be used from here on out, for any individual runs

In [29]:
from sklearn.model_selection import train_test_split
monthly_train_test_splits = []
for month in df_splits:
    X_train, X_test, y_train, y_test = train_test_split(month[0], month[1], test_size=0.25, 
                                                    random_state=4)
    monthly_train_test_splits.append([X_train, X_test, y_train, y_test])


In [30]:
print('sample of monthly split for first month')
print('X_train shape is', monthly_train_test_splits[0][0].shape)
print('y_train shape is',monthly_train_test_splits[0][2].shape )

print('X_test shape is', monthly_train_test_splits[0][1].shape)
print('y_test shape is', monthly_train_test_splits[0][3].shape)

sample of monthly split for first month
X_train shape is (663, 33)
y_train shape is (663,)
X_test shape is (221, 33)
y_test shape is (221,)


### INDIVIDUAL RUN

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
from sklearn import metrics
import numpy as np

#kaggle requested metric

def rmsle(y, y_):
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))


#rmse, feature importance, predictions below

monthly_linreg_rmse = {}
monthly_f_importance = {}
monthly_predictions = {}
monthly_linreg_rmsle = {}


for x in range(len(monthly_train_test_splits)):
    
    #grab split data for each month
    X_train = monthly_train_test_splits[x][0]
    y_train = monthly_train_test_splits[x][2]

    X_test = monthly_train_test_splits[x][1]
    y_test = monthly_train_test_splits[x][3]
    
    #build model
    bycle_linreg = LinearRegression()
    bycle_linreg.fit(X_train, y_train);
    predictions = bycle_linreg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    monthly_linreg_rmse[months[x]] = rmse
    
    #save feature importance for each month
    monthly_f_importance[months[x]]  = pd.Series(bycle_linreg.coef_,index=bycle_feature_matrix.columns).sort_values(ascending=False)
    #monthly_predictions[months[x]] = predictions
    #monthly_linreg_rmsle[months[x]] = rmsle(y_test, predictions)
    #save predictions


#### Feature Importance

In [40]:
#print('\n\n\n')
print('The following are the feature importance generated by random forest model for each month\n\n')
for x,y in monthly_f_importance.items():
    print('month: ', x, '\n\nfeature_importance:\n\n', y)
    print('\n\n')


The following are the feature importance generated by random forest model for each month


month:  Jan 

feature_importance:

 fall             4.300189e+15
 spring,         2.227659e+15
summer           1.185847e+15
 EarlyMorning    1.978747e+14
light            1.487239e+14
temp             5.951250e+01
year             2.526871e+01
workingday       1.306216e+01
day_of_year      1.115595e+01
weekday          5.009303e+00
Nov              0.000000e+00
Dec              0.000000e+00
Feb              0.000000e+00
March            0.000000e+00
April            0.000000e+00
May              0.000000e+00
June             0.000000e+00
July             0.000000e+00
August           0.000000e+00
Sept             0.000000e+00
Oct              0.000000e+00
Jan              0.000000e+00
holiday         -1.590841e+00
windspeed       -1.052922e+01
atemp           -2.157055e+01
humidity        -2.354292e+01
Night           -4.309657e+13
heavy_rain      -4.309657e+13
Morning         -2.794156e+14
Eve

### Evaluating our results

#### Using RMSE Metric for evaulation

In [35]:
for x,y in monthly_linreg_rmse.items():
    print('month: ', x, ' rmse: ', y)


month:  Jan  rmse:  78.12547280174611
month:  Feb  rmse:  90.22789263037794
month:  March  rmse:  115.19562539528414
month:  April  rmse:  155.09382319471806
month:  May  rmse:  148.1150960524006
month:  June  rmse:  165.33814249763066
month:  July  rmse:  149.8270107386066
month:  August  rmse:  165.09932595068847
month:  Sept  rmse:  156.7198773792116
month:  Oct  rmse:  194.44283893928585
month:  Nov  rmse:  137.90105180752727
month:  Dec  rmse:  139.73712857878448


### CROSS VALIDATION RUN

In [44]:
from sklearn.cross_validation import cross_val_score

crossv_montly_lingreg_rmse_list = {}
crossv_montly_lingreg_rmse = {}


for month in range(len(df_splits)):
    #get entire feature matrix, not split for each month to fit
    feature_matrix_month = df_splits[month][0] 
    label_matrix = df_splits[month][1]
    
    linreg_cv = LinearRegression()
    mse_list= cross_val_score(linreg_cv,  feature_matrix_month, label_matrix, cv=10, scoring='neg_mean_squared_error')
    mse_list_positive = -mse_list
    rmse_list = np.sqrt(mse_list_positive)
    rmse_mean = rmse_list.mean()
    
    #save monthly rmse list
    crossv_montly_lingreg_rmse_list[months[month]] = rmse_list 
    
    #save monthly rmse mean
    crossv_montly_lingreg_rmse[months[month]] = rmse_mean
    
for x,y in crossv_montly_lingreg_rmse_list.items():
      print('\nmonth: ', x, '\nrmse_list: ', y)

for x,y in crossv_montly_lingreg_rmse.items():
      print('month: ', x, 'rmse: ', y)


month:  Jan 
rmse_list:  [4.23810483e+01 5.22501880e+01 4.61247971e+01 4.67036666e+01
 4.84760730e+01 8.71498224e+01 1.22336718e+02 1.31822405e+13
 8.65615359e+01 1.12460028e+02]

month:  Feb 
rmse_list:  [ 52.8493232   53.99261619  59.23828308  62.35929754  99.88326896
 126.55540899 126.12860562 102.11866293 127.82848786 128.10999846]

month:  March 
rmse_list:  [ 76.53253783  57.99664459  61.62916678  71.91980604 117.02023817
 132.1186043  147.71533399 143.23393094 188.94712142 207.74471477]

month:  April 
rmse_list:  [ 91.54059079 110.41326814 102.48417358  90.35949534 109.40050777
 194.98183854 190.38581753 179.91869528 179.57105372 198.58531322]

month:  May 
rmse_list:  [146.14929593 114.11194244 120.76213059 115.29622476 127.89813198
 189.82794144 179.98924817 168.32386307 155.5171911  191.46650132]

month:  June 
rmse_list:  [144.44029458 149.08046525 122.7400799  144.48614995 135.48074289
 189.81909565 226.10949302 190.29708258 201.6936396  181.44611049]

month:  July 
rmse_

### IMPROVING ACCURACY ATTEMPT - FEATURE REDUCTION -FAILED TO IMPROVE ACCURACY

#### Manual Feature Reduction... Checking RMSE With up to 7 best features.

#FEATURE IMPORTANCE- Best features below
    
#best 5 features for each month model
####     ['humidity','atemp','temp','windspeed','year' ] #1 - Jan
####     ['humidity','temp','windspeed','atemp','day_of_year']#Feb
####     ['temp','atemp','humidity','windspeed','year']#3 March
####    ['humidity','windspeed','temp','atemp','day_of_year']#April
####     ['atempt','humidity','windspeed','day_of_year','temp'],#May
####     ['humidity','windspeed','year', 'day_of_year', 'temp',],#6 June
####     ['temp','humidity','day_of_year','windspeed','atemp'],#July
####     ['temp','humidity','windspeed','day_of_year','year'],#August
####     ['humidity','atemp','windspeed','temp','day_of_year'],#9 Sept
####     ['humidity','windspeed','atemp','temp','day_of_year'],#10 - Oct
####     ['humidity','temp','windspeed','day_of_year','temp',],#11 - #Nov
####     ['humidity','temp','windspeed','day_of_year','atemp',],#12 - December

#### cross validation and feature reduction

In [50]:
#best 5 features for each month model    
best_features = [
    ['fall',' spring,','summer',' clear','winter' ],#1 - Jan
    [' spring,','fall','winter' ,' clear','Morning'],#Feb
    [' clear',' EarlyMorning','Morning','Evening','fall'],#3 March
    ['winter','Morning',' EarlyMorning','summer','mist'],#April
    [' EarlyMorning','fall','light','summer','winter'],#May
    [' spring,','mist',' clear', 'summer', 'winter'],#6 June
    ['winter',' clear','fall',' EarlyMorning','Evening'],#July
    [' EarlyMorning','mist','light','Morning',' clear'],#August
    [' EarlyMorning','Morning','light',' spring,',' clear'],#9 Sept
    ['summer','Morning','light',' clear',' EarlyMorning'],#10 - Oct
    [' spring,','winter','light',' clear','Morning'],#11 - Nov
    [' clear','mist',' spring,',' EarlyMorning','Morning']#12 - December
    
]


trial_features = [
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#1 - Jan
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#Feb
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#3 March
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#April
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#May
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#6 June
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#July
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#August
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#9 Sept
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#10 - Oct
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec'],#11 - Nov
    ['Morning', 'Evening', 'Night', 'Jan', 'Feb', 'March', 'April', 'May','June', 'July', 'August', 'Sept', 'Oct', 'Nov', 'Dec']#12 - December
    
]





In [53]:
from sklearn.cross_validation import cross_val_score

crossv_montly_linreg_feature_reduction_rmse_list = {}
crossv_montly_linreg_feature_reduction_rmse = {}

for month in range(len(df_splits)):
    linreg = LinearRegression()
    #line below filters out all other columsn other than 5 best for each model of each year
    mse_list= cross_val_score(linreg, df_splits[month][0][best_features[0]], df_splits[month][1], cv=10, scoring='neg_mean_squared_error')
    mse_list_positive = -mse_list
    rmse_list = np.sqrt(mse_list_positive)
    rmse_mean = rmse_list.mean()
    
    #save monthly rmse list
    crossv_montly_linreg_feature_reduction_rmse_list[months[month]] = rmse_list 
    
    #save monthly rmse mean
    crossv_montly_linreg_feature_reduction_rmse[months[month]] = rmse_mean
    
#for x,y in crossv_montly_forests_feature_reduction_rmse_list.items():
#      print('\nmonth: ', x, '\nrmse_list: ', y)

for x,y in crossv_montly_linreg_feature_reduction_rmse.items():
      print('month: ', x, 'rmse: ', y)

month:  Jan rmse:  90.57277837192207
month:  Feb rmse:  106.92361346654866
month:  March rmse:  148.54941413678984
month:  April rmse:  173.75996029096774
month:  May rmse:  185.56572652708155
month:  June rmse:  197.60406347197977
month:  July rmse:  183.06906287882663
month:  August rmse:  195.75255610839605
month:  Sept rmse:  204.5396578919325
month:  Oct rmse:  200.6493199724837
month:  Nov rmse:  164.28554518958583
month:  Dec rmse:  154.61531679540198


### IMPROVING ACCURACY ATTEMPT -ADA BOOST - SUCCESS

In [56]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [57]:
ada_boost_montly_rmse = {}
for x in range(len(monthly_train_test_splits)):

    ada_boost_reg = AdaBoostRegressor(LinearRegression(), n_estimators=100, random_state=3)
    ada_boost_reg.fit(monthly_train_test_splits[x][0], monthly_train_test_splits[x][2])
    abr_predictions = ada_boost_reg.predict(monthly_train_test_splits[x][1])

    mse = metrics.mean_squared_error(monthly_train_test_splits[x][3], abr_predictions)
    rmse = np.sqrt(mse)
    ada_boost_montly_rmse[months[x]] = rmse
    
    
for x,y in ada_boost_montly_rmse.items():
    print('\nmonth: \n', x , ' rmse: ', y, '\n')


month: 
 Jan  rmse:  92.41390202319975 


month: 
 Feb  rmse:  95.948115491617 


month: 
 March  rmse:  121.62460025061145 


month: 
 April  rmse:  156.20454410153874 


month: 
 May  rmse:  155.31635341927847 


month: 
 June  rmse:  165.93367853560795 


month: 
 July  rmse:  157.0769356016501 


month: 
 August  rmse:  164.69304090405916 


month: 
 Sept  rmse:  160.2289984760574 


month: 
 Oct  rmse:  196.85501358545656 


month: 
 Nov  rmse:  140.60682516042826 


month: 
 Dec  rmse:  143.36435547460036 



In [58]:
from statistics import mean
mean(ada_boost_montly_rmse.values())
print('averaging the rmse of all 12 models we get', mean(ada_boost_montly_rmse.values()))

averaging the rmse of all 12 models we get 145.85553025200878


### IMPROVING ACCURACY ATTEMPT -ADA BOOST CROSS VALIDATION- SUCCESS 

In [59]:
from sklearn.cross_validation import cross_val_score

crossv_adab_montly_forests_rmse_list = {}
crossv_adab_montly_forests_rmse = {}

for month in range(len(df_splits)):
    ada_boost = AdaBoostRegressor(LinearRegression(), n_estimators = 100, random_state = 6)
    mse_ada = cross_val_score(ada_boost, df_splits[month][0], df_splits[month][1], cv=10, scoring='neg_mean_squared_error')

    mse_ada_positive = - mse_ada
    rmse_ada_list = np.sqrt(mse_ada_positive)
    #print(rmse_ada)
    rmse_cv_ada= rmse_ada_list.mean()
    #print(accuracy_cv_ada)


    #save monthly rmse list
    crossv_adab_montly_forests_rmse_list[months[month]] = rmse_ada_list
    
    #save monthly rmse mean
    crossv_adab_montly_forests_rmse[months[month]] = rmse_cv_ada
    
#for x,y in crossv_adab_montly_forests_rmse_list.items():
#      print('month: ', x, '\nrmsle_list: ', y)

for x,y in crossv_adab_montly_forests_rmse.items():
      print('month: ', x, 'rmsle: ', y)

month:  Jan rmsle:  825661317259621.2
month:  Feb rmsle:  95.73515303980358
month:  March rmsle:  123.94493991913914
month:  April rmsle:  145.39844419977558
month:  May rmsle:  151.5715017888404
month:  June rmsle:  168.3275683480636
month:  July rmsle:  161.59782796386312
month:  August rmsle:  172.61403508418022
month:  Sept rmsle:  164.85482823790437
month:  Oct rmsle:  171.39311665642614
month:  Nov rmsle:  143.73667174810873
month:  Dec rmsle:  137.49693422146623


In [60]:
mean(crossv_adab_montly_forests_rmse.values())

68805109771771.49

### IMPROVING ACCURACY ATTEMPT - PCA - DIMENSIONALITY REDUCTION - SUCCESS

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca_montly_rmse = {}
for x in range(len(monthly_train_test_splits)):
    
    #grabbing data from split done early on
    X_train = monthly_train_test_splits[x][0]
    y_train = monthly_train_test_splits[x][2]
    
    X_test = monthly_train_test_splits[x][1]
    y_test = monthly_train_test_splits[x][3]
    
    
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)
    
    
    
    pca = PCA() #create pca object
    pca.fit(X_train)
    
    cpts = pd.DataFrame(pca.transform(X_train))
    
    #print(cpts) #still same amount of columns as training
    x_axis = np.arange(1, pca.n_components_+1)
    #print(x_axis) # still same amount of columns as training
    
    pca_scaled = PCA()
    pca_scaled.fit(X_train_scaled)
    #print(pca_scaled)
    cpts_scaled = pd.DataFrame(pca.transform(X_train_scaled))

    
    #from sklearn.ensemble import RandomForestRegressor
    
    linreg = LinearRegression()
    #print(X_train.head())
    linreg.fit(X_train, y_train)
    #linreg.fit(cpts, y_train)
    predicted_train = linreg.predict(X_train)
    predicted_test = linreg.predict(X_test)

    linreg_rmse= sqrt(mean_squared_error(y_test, predicted_test))
    
    pca_montly_rmse[months[x]] = linreg_rmse



for x,y in pca_montly_rmse.items():
    print('month is: ', x, ' rmse is ', y)
    #rf_ab= RandomForestRegressor(n_estimators = 100, random_state = 3)
    #ada_boost_reg = AdaBoostRegressor(RandomForestRegressor(n_estimators = 100, random_state = 3), n_estimators=100, random_state=3)
    #rf_ab.fit(X_train, y_train)
    #ada_boost_reg.fit(monthly_train_test_splits[x][0], monthly_train_test_splits[x][2])
    
    #rf_predictions = rf_ab.predict(X_test)
    #abr_predictions = ada_boost_reg.predict(monthly_train_test_splits[x][1])
    
    #rf_rmse= sqrt(mean_squared_error(y_test, rf_predictions))
    # abr_predictions_rmse =  sqrt(mean_squared_error(monthly_train_test_splits[x][3], abr_predictions))
    
    #mse = metrics.mean_squared_error(monthly_train_test_splits[x][3], abr_predictions)
    #rmse = np.sqrt(mse)
    #ada_boost_montly_rmse[months[x]] = rmse
    
    
    #for x,y in ada_boost_montly_rmse.items():
    #    print('\nmonth: \n', x , ' rmse: ', y, '\n')




month is:  Jan  rmse is  78.12547280174611
month is:  Feb  rmse is  90.22789263037794
month is:  March  rmse is  115.19562539528414
month is:  April  rmse is  155.09382319471806
month is:  May  rmse is  148.1150960524006
month is:  June  rmse is  165.33814249763066
month is:  July  rmse is  149.8270107386066
month is:  August  rmse is  165.09932595068847
month is:  Sept  rmse is  156.7198773792116
month is:  Oct  rmse is  194.44283893928585
month is:  Nov  rmse is  137.90105180752727
month is:  Dec  rmse is  139.73712857878448
