In [66]:
import pandas as pd
import numpy as np

# Load the training set from a CSV file located in the working directory.
train = pd.read_csv("train.csv")

# Preview the first rows of the raw frame.
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0,0,1,1


In [67]:
# Candidate feature columns; normalizedata() reads this module-level list.
predictors = ["season", "holiday", "weather", "temp", "atemp", "humidity", "windspeed"]
# Columns of the training file that hold rental counts (the values to predict).
targets = ["casual", "registered", "count"]

In [104]:
def splitDatetime(data, columndate):
    """Add calendar feature columns derived from a datetime column.

    ``data[columndate]`` is parsed with ``pd.to_datetime`` and the columns
    ``year``, ``month``, ``day``, ``dayoftheweek`` and ``hour`` are written
    into ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; it is modified in place.
    columndate : str
        Name of the column holding the timestamps (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, to allow chaining.
    """
    timestamps = pd.to_datetime(data[columndate])
    data['year'] = timestamps.dt.year
    data['month'] = timestamps.dt.month
    # Day of the month. This used to read ``.dt.month``, which made the
    # ``day`` column an exact copy of ``month``.
    data['day'] = timestamps.dt.day
    # pandas convention: Monday == 0 ... Sunday == 6.
    data['dayoftheweek'] = timestamps.dt.dayofweek
    data['hour'] = timestamps.dt.hour
    return data

def harmonize(data):
    """Round the ``temp``, ``atemp`` and ``windspeed`` columns to two decimals.

    The rounded values are written back into ``data`` in place; the function
    itself returns ``None``.
    """
    rounded_cols = ['temp', 'atemp', 'windspeed']
    rounded = data.loc[:, rounded_cols].round(2)
    data.loc[:, rounded_cols] = rounded
    return None

def normalizedata(data, columns=None, verbose=True):
    """Standardise (z-score) the larger-valued feature columns of ``data`` in place.

    Every selected column whose mean is greater than 3 is replaced by
    ``(x - mean) / std``; all other columns are left untouched.
    NOTE(review): the ``mean > 3`` cut-off presumably exists to leave small
    integer codes (season, weather, ...) alone -- confirm the intent.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to rescale; it is modified in place.
    columns : iterable of str, optional
        Columns to consider. Defaults to the module-level ``predictors`` list,
        which is what the original implementation always used.
    verbose : bool, default True
        When true, print the maximum and the name of each column right before
        it is rescaled (the diagnostic the original always printed).

    Returns
    -------
    None
    """
    if columns is None:
        columns = predictors

    for col in columns:
        # Skip what cannot be standardised explicitly, instead of hiding every
        # possible error behind a bare ``except: pass``.
        if col not in data.columns:
            continue
        values = data[col]
        if not pd.api.types.is_numeric_dtype(values):
            continue

        mean = values.mean()
        if not mean > 3:
            continue

        std = values.std()
        # A constant (or single-row) column has std 0 / NaN; dividing by it
        # would fill the column with NaN or inf, so leave it as it is.
        if not np.isfinite(std) or std == 0:
            continue

        if verbose:
            print(values.max(), col)

        # Replace the whole column rather than assigning through ``.loc``:
        # the result is float even when the source column is integer, and a
        # slice assignment with an incompatible dtype is rejected by recent
        # pandas versions.
        data[col] = (values - mean) / std


def RMSLE(predictions, targets):
    """Root mean squared logarithmic error.

    Computes ``sqrt(mean((log(1 + predictions) - log(1 + targets)) ** 2))``.

    Parameters
    ----------
    predictions : array-like
        Predicted values (lists, NumPy arrays and pandas Series are accepted).
        Values must be greater than -1 for the logarithm to be defined.
    targets : array-like
        Observed values, same shape as ``predictions``.

    Returns
    -------
    numpy.float64
        The error; 0 when both inputs are identical.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(predictions, dtype=float)
    actual = np.asarray(targets, dtype=float)

    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and targets must have the same shape, got %s and %s"
            % (predicted.shape, actual.shape)
        )
    if predicted.size == 0:
        raise ValueError("RMSLE is undefined for empty input")

    # log1p(x) is log(1 + x) but stays accurate for x close to 0.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    # np.mean avoids the ``1/len(...)`` factor (integer division on Python 2)
    # and the slow builtin ``sum`` over a NumPy array.
    return np.sqrt(np.mean(np.square(log_diff)))
    
        
# Order matters: z-score the features first, then round the scaled values to
# two decimals, then derive the calendar columns. All three calls modify
# `train` in place, so re-running this cell re-processes already processed data.
normalizedata(train)
harmonize(train)
splitDatetime(train, "datetime")
# NOTE(review): only the last expression of a cell is displayed, so the result
# of this describe() call is computed and then discarded.
train.describe()
train.head()   



Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayoftheweek,hour
0,2011-01-01 00:00:00,1,0,0,1,-1.33,-1.09,0.993167,-1.57,3,13,16,2011,1,1,5,0
1,2011-01-01 01:00:00,1,0,0,1,-1.44,-1.18,0.941206,-1.57,8,32,40,2011,1,1,5,1
2,2011-01-01 02:00:00,1,0,0,1,-1.44,-1.18,0.941206,-1.57,5,27,32,2011,1,1,5,2
3,2011-01-01 03:00:00,1,0,0,1,-1.33,-1.09,0.681399,-1.57,3,10,13,2011,1,1,5,3
4,2011-01-01 04:00:00,1,0,0,1,-1.33,-1.09,0.681399,-1.57,0,1,1,2011,1,1,5,4


# Feature selection

In [105]:
predictors = ["season", "holiday", "weather", "temp", "atemp", "humidity", "windspeed"]
targets = ["casual", "registered", "count"]

print(train.columns)
# Build a NEW list for the correlation study. `corr = predictors` followed by
# `corr.append("count")` only aliased the list, so the target column was
# silently appended to `predictors` as well.
corr = predictors + ["count"]
print(corr)
# Kendall rank correlation between the features and `count`. It is kept in a
# variable because only the last expression of a cell is displayed; put
# `kendall_corr` on the last line of a cell to inspect it.
kendall_corr = train.loc[:, corr].corr(method="kendall")
train.loc[:, corr].describe()

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'dayoftheweek', 'hour'],
      dtype='object')
['season', 'holiday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'count']


Unnamed: 0,season,holiday,weather,temp,atemp,humidity,windspeed,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,1.418427,0.000244,-6.5e-05,-1.423678e-16,-0.001191,191.574132
std,1.116174,0.166599,0.633839,0.999982,0.999597,1.0,1.000332,181.144454
min,1.0,0.0,1.0,-2.49,-2.7,-3.215711,-1.57,1.0
25%,2.0,0.0,1.0,-0.81,-0.82,-0.7735222,-0.71,42.0
50%,3.0,0.0,1.0,0.03,0.07,0.005899721,0.02,145.0
75%,4.0,0.0,2.0,0.77,0.87,0.7853216,0.51,284.0
max,4.0,1.0,4.0,2.67,2.57,1.980435,5.41,977.0


# Linear regression

In [103]:
# `sklearn.cross_validation` was removed from scikit-learn; KFold now lives in
# `sklearn.model_selection` and yields index pairs through `.split()`.
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
import numpy as np

predictors = ["season", "holiday", "weather", "temp", "atemp", "humidity", "windspeed"]
targets = ["count"]

trainingdata = pd.concat([train[predictors], train[targets]], axis=1)

# Three contiguous, unshuffled folds (same partition as before). The former
# `random_state=1` had no effect without shuffling, so it is not passed.
kf = KFold(n_splits=3)

# Initialize our algorithm class
alg = LinearRegression()

predictions = []
for training, test in kf.split(trainingdata):
    print(len(training), len(test))
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = trainingdata[predictors].iloc[training, :]
    # The target we're using to train the algorithm.
    train_target = trainingdata["count"].iloc[training]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(trainingdata[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The test folds are contiguous and visited in order, so the concatenated
# predictions line up row by row with `train`.
predictions = np.concatenate(predictions, axis=0)
predictions = np.round(predictions, 0)

# A rental count cannot be negative (and log(1 + x) needs x > -1).
predictions[predictions < 0] = 0

print(RMSLE(predictions, train['count'].values))
# Sanity check: the error of the targets against themselves must be 0.
print(RMSLE(train['count'].values, train['count'].values))

7257 3629
7257 3629
7258 3628
25005.3957375
1.51559338498
0.0
0.0


In [36]:
# Preview the modelling frame built above (feature columns plus `count`).
trainingdata.head()

Unnamed: 0,season,holiday,weather,temp,atemp,humidity,windspeed,count
0,1,0,1,-1.33,-1.09,0.993167,-1.57,16
1,1,0,1,-1.44,-1.18,0.941206,-1.57,40
2,1,0,1,-1.44,-1.18,0.941206,-1.57,32
3,1,0,1,-1.33,-1.09,0.681399,-1.57,13
4,1,0,1,-1.33,-1.09,0.681399,-1.57,1


# Data preprocessing: removing outliers, etc.

In [37]:
print(train.loc[train['hour']==0, 'count'].mean())

# A count is treated as an outlier when it exceeds the mean of its hour by
# more than this many standard deviations.
n_std = 2

frames = []
for hour in range(24):
    hour_rows = train.loc[train['hour'] == hour, :]
    threshold = hour_rows['count'].mean() + n_std * hour_rows['count'].std()
    # Keep only THIS hour's rows below THIS hour's threshold. The filter used
    # to be applied to the whole of `train`, so each frame contained rows from
    # every hour and concatenating the frames would have duplicated rows.
    frames.append(hour_rows.loc[hour_rows['count'] < threshold, :])

# Left disabled as in the original: enabling it overwrites `train` with the
# filtered rows, grouped by hour rather than in their original order.
# train = pd.concat(frames)


55.13846153846154
