In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<h2>Kaggle Bike Sharing Demand Dataset</h2>
<h4>Hands-on: Linear Regression with AWS Machine Learning Service</h4>
<h4>To download the dataset, sign in to Kaggle and download it from this link: https://www.kaggle.com/c/bike-sharing-demand/data</h4>
<br>
Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek']<br>
Target Feature: ['count']<br>
Objective: <blockquote>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</blockquote>

In [None]:
# Column order used for the exported files: the target ('count') comes first,
# followed by the input features.
columns = [
    'count',
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'day', 'dayofweek',
]

In [None]:
# Load the training and test CSV files from the working directory.
# parse_dates asks pandas to parse the 'datetime' column into timestamps
# so the .dt accessor can be used on it later.
df = pd.read_csv('train.csv', parse_dates=['datetime'])
df_test = pd.read_csv('test.csv', parse_dates=['datetime'])

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Inspect the column dtypes of the training data as loaded.
df.dtypes

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# We need to convert datetime to numeric for training.
# Let's extract key features into separate numeric columns
def add_features(df):
    """Add numeric calendar columns derived from ``df['datetime']``.

    The columns 'year', 'month', 'day' and 'dayofweek' are written directly
    onto ``df`` (the frame is modified in place); nothing is returned.
    """
    stamps = df['datetime'].dt
    for part in ('year', 'month', 'day', 'dayofweek'):
        df[part] = getattr(stamps, part)

In [None]:
# Add the calendar columns to both frames; add_features modifies each frame in place.
add_features(df)
add_features(df_test)

In [None]:
# Confirm the year/month/day/dayofweek columns were added to the training data.
df.head()

In [None]:
# Confirm the year/month/day/dayofweek columns were added to the test data.
df_test.head()

In [None]:
# Correlation of every numeric column with the target 'count'.
# Numeric columns are selected explicitly because the frame still contains the
# raw 'datetime' column: recent pandas releases no longer drop non-numeric
# columns silently in DataFrame.corr() and can raise instead.
df.select_dtypes(include='number').corr()['count']

In [None]:
# Group the training rows by calendar month (rows from all years fall in the same group).
group_month = df.groupby(['month'])

In [None]:
# Mean rental count for each month.
average_by_month = group_month['count'].mean()

In [None]:
# Plot the mean rental count for each calendar month.
plt.plot(average_by_month.index, average_by_month)
plt.xlabel('Month')
plt.ylabel('Count')
# Month values come from .dt.month and run 1..12, so place the ticks on 1..12
# (np.arange(12) would label 0..11 and leave December without a tick).
plt.xticks(np.arange(1, 13))
plt.grid(True)
plt.title('Rental Count by Month')

In [None]:
# Group by (year, month) so each year's months are kept separate.
group_year_month = df.groupby(['year','month'])

In [None]:
# Mean rental count for each (year, month) pair; the result is indexed by both keys.
average_year_month = group_year_month['count'].mean()

In [None]:
# Draw one line per year so the monthly averages can be compared across years.
years = average_year_month.index.levels[0]
for year in years:
    monthly = average_year_month[year]
    plt.plot(monthly.index, monthly, label=year)

plt.legend()
plt.xlabel('month')
plt.ylabel('Count')
plt.grid(True)
plt.title('Rental Count by Year,Month')

In [None]:
# Scatter of rental count against temperature.
plt.scatter(df['temp'], df['count'], label='Temperature')
plt.xlabel('Temperature')
plt.grid(True)
plt.ylabel('Count')

In [None]:
# Scatter of rental count against humidity.
plt.scatter(df['humidity'], df['count'], label='Humidity')
plt.xlabel('Humidity')
plt.grid(True)
plt.ylabel('Count')

In [None]:
# Re-check the column dtypes before exporting.
df.dtypes

In [None]:
# Save all data: every row, restricted to `columns`, with a header row and no index
df.to_csv('bike_all.csv', columns=columns, index=False)

## Training and Validation Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset with a fixed seed so the split is reproducible.
# Restoring index order first makes this cell safe to re-run: without it, a
# second run would shuffle the already-shuffled frame and change the split.
df = df.sort_index()
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
# NOTE: .iloc is positional, so this relies on the index being the default
# 0..n-1 range that read_csv produces.
df = df.iloc[l]

In [None]:
# Row counts for the 70/30 split.
# The validation set is written as everything after the first `train` rows,
# so its size is the remainder. Truncating .3 * rows separately can lose a row,
# so that train + test no longer adds up to rows.
rows = df.shape[0]
train = int(.7 * rows)
test = rows - train

In [None]:
# Show the total row count and the two split sizes.
rows, train, test

In [None]:
# Display the column order used for the exported files.
columns

In [None]:
# Write Training Set: the first `train` rows, without index or header row
df.iloc[:train].to_csv('bike_train.csv', columns=columns, header=False, index=False)

In [None]:
# Write Validation Set: all rows after the first `train`, without index or header row
df.iloc[train:].to_csv('bike_validation.csv', columns=columns, header=False, index=False)

In [None]:
# Test Data has only input features
# No `columns` argument is given, so every column of df_test is written,
# with a header row and without the index.
df_test.to_csv('bike_test.csv',index=False)

In [None]:
# Preview the comma-separated column list.
','.join(columns)

In [None]:
# Write Column List: the column names joined by commas, in export order
column_list = ','.join(columns)
with open('bike_train_column_list.txt', 'w') as f:
    f.write(column_list)