In [2]:
import numpy as np
import pandas as pd

## Load Data

In [4]:
# Read the training set, parsing the "datetime" column into timestamps.
train = pd.read_csv(
    "./datas/train.csv",
    parse_dates=["datetime"],
)

# Report (rows, columns), then preview the first five rows as the cell output.
print(train.shape)
train.head(5)

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [50]:
# Sanity check: month component of the last five training timestamps.
train.loc[:, "datetime"].dt.month.tail(5)

10881    12
10882    12
10883    12
10884    12
10885    12
Name: datetime, dtype: int64

In [11]:
# Read the test set, parsing the "datetime" column into timestamps.
test = pd.read_csv(
    "./datas/test.csv",
    parse_dates=["datetime"],
)

# Report (rows, columns), then preview the first five rows as the cell output.
print(test.shape)
test.head(5)

(6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [48]:
# Sanity check: day-of-month component of the first five test timestamps.
test.loc[:, "datetime"].dt.day.head(5)

0    20
1    20
2    20
3    20
4    20
Name: datetime, dtype: int64

## Train

In [15]:
# Columns fed to the model as input features; the same list is used to slice
# both the train and the test frame.
feature_names = [
    "season",
    "holiday",
    "workingday",
    "weather",
    "temp",
    "atemp",
    "humidity",
    "windspeed",
]

feature_names

['season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed']

In [17]:
# Training feature matrix: every row of `train`, restricted to the feature columns.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head(5)

(10886, 8)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,0,1,9.84,14.395,81,0.0
1,1,0,0,1,9.02,13.635,80,0.0
2,1,0,0,1,9.02,13.635,80,0.0
3,1,0,0,1,9.84,14.395,75,0.0
4,1,0,0,1,9.84,14.395,75,0.0


In [20]:
# Test feature matrix: every row of `test`, restricted to the same feature columns.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head(5)

(6493, 8)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,1,1,10.66,11.365,56,26.0027
1,1,0,1,1,10.66,13.635,56,0.0
2,1,0,1,1,10.66,13.635,56,0.0
3,1,0,1,1,10.66,12.88,56,11.0014
4,1,0,1,1,10.66,12.88,56,11.0014


In [26]:
# Name of the target column in `train` that the model is fitted to predict.
label_name = "count"

In [29]:
# Training target: the label column of `train` as a one-dimensional Series.
Y_train = train.loc[:, label_name]

print(Y_train.shape)
Y_train.head(5)

(10886,)


0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

In [112]:
# scikit-learn is imported under the package name `sklearn`.
# Earlier single-tree baseline, kept for reference:
#   from sklearn.tree import DecisionTreeRegressor
#   model = DecisionTreeRegressor(random_state=3)

from sklearn.ensemble import RandomForestRegressor

# Only the random seed is set explicitly; every other hyperparameter is left
# at the library default.
forest_options = {"random_state": 3}
model = RandomForestRegressor(**forest_options)

model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=3, verbose=0, warm_start=False)

## Score (important!)

In [114]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `cross_val_score` lives in `sklearn.model_selection`, so import it from there.
from sklearn.model_selection import cross_val_score

# 20-fold cross-validation scored with negated mean absolute error; the
# per-fold scores are averaged into a single number.
score = cross_val_score(model, X_train, Y_train, cv=20, scoring="neg_mean_absolute_error").mean()

# The closer the score is to 0 the better; the farther from 0 the worse.
print("Score = {0:5f}".format(score))
# The score is negative because the scorer reports the *negated* MAE. That is
# fine; to print a positive value, take the absolute value or multiply by -1.

# DecisionTreeRegressor : about -152
# RandomForestRegressor : about -129

Score = -129.615747


In [115]:
# Fit the model on the full training set; the fitted estimator is the cell output.
model.fit(X=X_train, y=Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=3, verbose=0, warm_start=False)

In [116]:
# One prediction per row of the test feature matrix.
predictions = model.predict(X=X_test)

# Report the shape, then preview the first ten predictions as the cell output.
print(predictions.shape)
predictions[:10]

(6493,)


array([ 201.98333333,   74.3       ,   74.3       ,  111.6       ,
        111.6       ,   69.36666667,  136.        ,   59.9       ,
         99.        ,  100.24666667])

## Submit

In [117]:
# Start from the sample submission file and overwrite its "count" column with
# the model's predictions.
submission = (
    pd.read_csv("./datas/sampleSubmission.csv")
    .assign(count=predictions)
)

print(submission.shape)
submission.head(5)

(6493, 2)


Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,201.983333
1,2011-01-20 01:00:00,74.3
2,2011-01-20 02:00:00,74.3
3,2011-01-20 03:00:00,111.6
4,2011-01-20 04:00:00,111.6


In [118]:
import os

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first (a fresh checkout may not have it).
submission_path = "./submission/baseline-script.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# index=False drops the DataFrame index, which would otherwise be written as
# an extra leading column of the CSV file.
submission.to_csv(submission_path, index=False)

In [136]:
# import datetime

# NOTE: the `.dt` accessor has no `format` method, so the call below raises
# AttributeError. `.dt.strftime("%Y%m%d")` is the pandas call that formats
# timestamps as strings.
# train["datetime"].dt.format("%Y%m%d")

AttributeError: 'DatetimeProperties' object has no attribute 'format'

In [135]:
# Split the parsed timestamp into one column per component, named
# "datetime_<component>", added to `train` in this order.
for component in ("year", "month", "day", "hour", "minute", "second"):
    train["datetime_" + component] = getattr(train["datetime"].dt, component)

# Report the new shape, then preview the first five rows as the cell output.
print(train.shape)
train.head(5)

(10886, 19)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,converted_datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_minute,datetime_second
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0 2011\n1 2011\n2 2011\n3...,2011,1,1,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,0 2011\n1 2011\n2 2011\n3...,2011,1,1,1,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,0 2011\n1 2011\n2 2011\n3...,2011,1,1,2,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,0 2011\n1 2011\n2 2011\n3...,2011,1,1,3,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,0 2011\n1 2011\n2 2011\n3...,2011,1,1,4,0,0


0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
10856    0
10857    0
10858    0
10859    0
10860    0
10861    0
10862    0
10863    0
10864    0
10865    0
10866    0
10867    0
10868    0
10869    0
10870    0
10871    0
10872    0
10873    0
10874    0
10875    0
10876    0
10877    0
10878    0
10879    0
10880    0
10881    0
10882    0
10883    0
10884    0
10885    0
Name: datetime, Length: 10886, dtype: int64