https://www.kaggle.com/c/bike-sharing-demand/data?select=train.csv
# 자전거 대여량 예측 모델

datetime - 시간별 날짜 + 타임스탬프   
season - 1 = 봄, 2 = 여름, 3 = 가을, 4 = 겨울   
holiday - 해당 요일이 휴일인지 여부   
workingday - 해당 요일이 주말이나 휴일이 아닌지 여부    
weather - 1: 맑음, 구름 약간, 부분적으로 구름, 부분적으로 구름   
2: 안개 + 구름, 안개 + 깨진 구름, 안개 + 구름 약간, 안개   
3: 가벼운 눈, 가벼운 비 + 뇌우 + 흩어진 구름, 가벼운 비 + 흩어진 구름   
4: 폭우 + 얼음판 + 뇌우 + 안개, 눈 + 안개    
temp - 섭씨 온도   
atemp - 체감 온도   
섭씨 humid - 상대 습도   
windspeed - 풍속   
casual - 미등록된 사용자가 시작한 대여 수   
registered - 등록된 사용자가 시작한 대여 수   
count - 총 대여 수   

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

In [2]:
df = pd.read_csv("BikeSharingDemand/train.csv")
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


datetime 컬럼의 속성이 string 이어서 datetime으로 변경.   
그리고 연, 월, 일, 시 속성으로 나누어 관리

In [3]:
import datetime
df["datetime"] = pd.to_datetime(df["datetime"])

In [4]:
df["year"] = df["datetime"].dt.year
df["month"] = df["datetime"].dt.month
df["day"] = df["datetime"].dt.day
df["hour"] = df["datetime"].dt.hour

df = df[['year', 'month', 'day', 'hour', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']]

In [5]:
df

Unnamed: 0,year,month,day,hour,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011,1,1,0,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011,1,1,1,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011,1,1,2,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011,1,1,3,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011,1,1,4,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012,12,19,19,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012,12,19,20,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012,12,19,21,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012,12,19,22,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


casual, registered 의 합은 count 이므로 삭제.   
만약 나중에 사용해도 두 개의 데이터 프레임으로 분리하여 회원 비회원 이용자 별 특성을 확인 하는 용도

In [6]:
# 제거

print(sum(df["casual"]) + sum(df["registered"]) == sum(df["count"]))
df = df.drop(["casual", "registered"], axis=1)


True


holiday - 해당 요일이 휴일인지 여부   
workingday - 해당 요일이 주말이나 휴일이 아닌지 여부    
이므로 둘 중 하나의 값만 있으면 됨.
여기서는 holiday를 삭제하여 쉬는날, 안쉬는날로 구분 할 예정

In [7]:
df = df.drop("holiday", axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        10886 non-null  int32  
 1   month       10886 non-null  int32  
 2   day         10886 non-null  int32  
 3   hour        10886 non-null  int32  
 4   season      10886 non-null  int64  
 5   workingday  10886 non-null  int64  
 6   weather     10886 non-null  int64  
 7   temp        10886 non-null  float64
 8   atemp       10886 non-null  float64
 9   humidity    10886 non-null  int64  
 10  windspeed   10886 non-null  float64
 11  count       10886 non-null  int64  
dtypes: float64(3), int32(4), int64(5)
memory usage: 850.6 KB


In [9]:
df.describe()

Unnamed: 0,year,month,day,hour,season,workingday,weather,temp,atemp,humidity,windspeed,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2011.501929,6.521495,9.992559,11.541613,2.506614,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,191.574132
std,0.500019,3.444373,5.476608,6.915838,1.116174,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,181.144454
min,2011.0,1.0,1.0,0.0,1.0,0.0,1.0,0.82,0.76,0.0,0.0,1.0
25%,2011.0,4.0,5.0,6.0,2.0,0.0,1.0,13.94,16.665,47.0,7.0015,42.0
50%,2012.0,7.0,10.0,12.0,3.0,1.0,1.0,20.5,24.24,62.0,12.998,145.0
75%,2012.0,10.0,15.0,18.0,4.0,1.0,2.0,26.24,31.06,77.0,16.9979,284.0
max,2012.0,12.0,19.0,23.0,4.0,1.0,4.0,41.0,45.455,100.0,56.9969,977.0


In [10]:
df.corr()

Unnamed: 0,year,month,day,hour,season,workingday,weather,temp,atemp,humidity,windspeed,count
year,1.0,-0.004932,0.0018,-0.004234,-0.004797,-0.002482,-0.012548,0.061226,0.05854,-0.078606,-0.015221,0.260403
month,-0.004932,1.0,0.001974,-0.006818,0.971524,-0.003394,0.012144,0.257589,0.264173,0.204537,-0.150192,0.166862
day,0.0018,0.001974,1.0,0.001132,0.001729,0.009829,-0.00789,0.015551,0.011866,-0.011335,0.036157,0.019826
hour,-0.004234,-0.006818,0.001132,1.0,-0.006546,0.00278,-0.02274,0.14543,0.140343,-0.278011,0.146631,0.400601
season,-0.004797,0.971524,0.001729,-0.006546,1.0,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.163439
workingday,-0.002482,-0.003394,0.009829,0.00278,-0.008126,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,0.011594
weather,-0.012548,0.012144,-0.00789,-0.02274,0.008879,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.128655
temp,0.061226,0.257589,0.015551,0.14543,0.258689,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.394454
atemp,0.05854,0.264173,0.011866,0.140343,0.264744,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.389784
humidity,-0.078606,0.204537,-0.011335,-0.278011,0.19061,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.317371


In [11]:
# for i in df.drop("count", axis=1).columns:
#     print(i)
#     plt.bar(df[i], df["count"])
#     plt.show()

In [12]:
def score_func(x, y, model):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=142)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    nmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    nmse_cv = cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=5)
    r2_cv = cross_val_score(model, x, y, scoring='r2', cv=5)

    return model, pd.DataFrame(
                            {
                                "r2": [r2, np.mean(r2_cv)],
                                "neg_mean_squared_error": [nmse, -np.mean(nmse_cv)]
                            },
                            index=["score", "cross_val_score"]
                        )


In [13]:
def poly_func(x, y, model):
    results = []
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=142)
    for degree in range(1, 5):
        model_poly = Pipeline(
            [
                ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
                ('Linear', model())
            ]
        )
        model_poly.fit(x_train, y_train)
        pred_poly = model_poly.predict(x_test)
        
        mse = mean_squared_error(y_test, pred_poly)
        rmse = np.sqrt(mse)

        r2 = r2_score(y_test, pred_poly)

        results.append(
            {
                'degree': degree,
                'MSE': mse,
                "RMSE": rmse,
                "r2": r2
            }
        )


    return pd.DataFrame(results)

In [14]:
def regulationCV_func(x, y, model):
    _df = score_func(x, y, model)
    print(model.alpha_)
    return _df

--------------
# 

## 전체를 넣었을 때
일단 전체 데이터를 넣고 돌려본다.

In [15]:
X = df.drop("count", axis=1).values
y = df["count"].values

poly_LinearRegression = poly_func(X, y, LinearRegression)
poly_LinearRegression

Unnamed: 0,degree,MSE,RMSE,r2
0,1,19819.5433,140.7819,0.389206
1,2,14907.147653,122.094831,0.540595
2,3,12200.415987,110.455493,0.624011
3,4,12759.801945,112.959293,0.606772


In [None]:
# poly_RandomForestRegressor = poly_func(X, y, RandomForestRegressor)
# poly_RandomForestRegressor

Unnamed: 0,degree,MSE,RMSE,r2
0,1,1919.055371,43.807024,0.940859
1,2,1933.609094,43.972822,0.940411
2,3,2001.948733,44.743142,0.938304
3,4,2125.636278,46.104623,0.934493


In [17]:
model, lr = score_func(X, y, LinearRegression())
lr

Unnamed: 0,r2,neg_mean_squared_error
score,0.389206,140.7819
cross_val_score,0.20804,21116.861906


In [None]:
# model, rf = score_func(X, y, RandomForestRegressor())
# rf

Unnamed: 0,r2,neg_mean_squared_error
score,0.940479,43.947504
cross_val_score,0.699059,5765.17325


In [19]:
model, el = score_func(X, y, ElasticNet())
el

Unnamed: 0,r2,neg_mean_squared_error
score,0.36383,143.6766
cross_val_score,0.110342,23532.079423


In [20]:
model, rd = score_func(X, y, Ridge())
rd

Unnamed: 0,r2,neg_mean_squared_error
score,0.389206,140.78193
cross_val_score,0.208085,21116.638581


In [21]:
model, ls = score_func(X, y, Lasso())
ls

Unnamed: 0,r2,neg_mean_squared_error
score,0.389073,140.797222
cross_val_score,0.215954,21031.130854


In [22]:
model, ridge_df = regulationCV_func(X, y, RidgeCV([0.001, 0.01, 0.1, 1, 10, 100], cv = 5))
ridge_df

10.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.389202,140.782392
cross_val_score,0.212122,21078.260358


In [23]:
model, lasso_df = regulationCV_func(X, y, LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv = 5))
lasso_df

0.1


Unnamed: 0,r2,neg_mean_squared_error
score,0.389249,140.776948
cross_val_score,0.214681,21052.282846


----------------------

# Month 만

In [24]:
X = df.drop(["count", "year", "day", "hour"], axis=1).values
y = df["count"].values

In [25]:
poly_lr_month = poly_func(X, y, LinearRegression)
poly_lr_month

Unnamed: 0,degree,MSE,RMSE,r2
0,1,24086.211419,155.197331,0.257717
1,2,22507.634572,150.025446,0.306365
2,3,21500.117798,146.629185,0.337415
3,4,21421.098426,146.359484,0.33985


In [26]:
# poly_Rf_month = poly_func(X, y, RandomForestRegressor)
# poly_Rf_month

Unnamed: 0,degree,MSE,RMSE,r2
0,1,21399.496997,146.285669,0.340516
1,2,21201.476467,145.607268,0.346618
2,3,20984.029682,144.858654,0.353319
3,4,20809.16705,144.253829,0.358708


In [27]:
model, month_lr = score_func(X, y, LinearRegression())
month_lr

Unnamed: 0,r2,neg_mean_squared_error
score,0.257717,155.197331
cross_val_score,-0.071653,28332.223398


In [28]:
model, month_rf = score_func(X, y, RandomForestRegressor())
month_rf

Unnamed: 0,r2,neg_mean_squared_error
score,0.338459,146.513634
cross_val_score,-0.251275,30967.406111


In [29]:
model, month_el = score_func(X, y, ElasticNet())
month_el

Unnamed: 0,r2,neg_mean_squared_error
score,0.257625,155.206938
cross_val_score,-0.056877,28057.680989


In [30]:
model, month_rd = score_func(X, y, Ridge())
month_rd

Unnamed: 0,r2,neg_mean_squared_error
score,0.257719,155.197121
cross_val_score,-0.071635,28331.893214


In [31]:
model, month_ls = score_func(X, y, Lasso())
month_ls

Unnamed: 0,r2,neg_mean_squared_error
score,0.257852,155.18325
cross_val_score,-0.06332,28176.993874


In [32]:
model, ridge_month_df = regulationCV_func(X, y, RidgeCV([0.001, 0.01, 0.1, 1, 10, 100], cv=5))
ridge_month_df

100.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.25786,155.182381
cross_val_score,-0.070089,28303.936564


In [33]:
model, lasso_month_df = regulationCV_func(X, y, LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=5))
lasso_month_df

0.1


Unnamed: 0,r2,neg_mean_squared_error
score,0.257803,155.188367
cross_val_score,-0.111547,28480.958315


--------------------
# day 만

In [34]:
X = df.drop(["count", "year", "month", "hour"], axis=1).values
y = df["count"].values

In [35]:
poly_lr_day = poly_func(X, y, LinearRegression)
poly_lr_day

Unnamed: 0,degree,MSE,RMSE,r2
0,1,24130.323374,155.339381,0.256358
1,2,22682.73952,150.6079,0.300969
2,3,22104.428112,148.67558,0.318791
3,4,22292.761782,149.307608,0.312987


In [36]:
# poly_rf_day = poly_func(X, y, RandomForestRegressor)
# poly_rf_day

Unnamed: 0,degree,MSE,RMSE,r2
0,1,20759.244883,144.080689,0.360247
1,2,20374.090258,142.737838,0.372116
2,3,20063.247077,141.644792,0.381696
3,4,19968.153509,141.308717,0.384626


In [37]:
model, day_lr = score_func(X, y, LinearRegression())
day_lr

Unnamed: 0,r2,neg_mean_squared_error
score,0.256358,155.339381
cross_val_score,-0.070309,28360.416476


In [38]:
day_rf = score_func(X, y, RandomForestRegressor())
day_rf

(RandomForestRegressor(),
                        r2  neg_mean_squared_error
 score            0.362407              143.837187
 cross_val_score -0.235888            31027.288464)

In [39]:
model, day_el = score_func(X, y, ElasticNet())
day_el

Unnamed: 0,r2,neg_mean_squared_error
score,0.25399,155.586511
cross_val_score,-0.055161,27932.907329


In [40]:
model, day_rd = score_func(X, y, Ridge())
day_rd

Unnamed: 0,r2,neg_mean_squared_error
score,0.256357,155.339401
cross_val_score,-0.0703,28360.197793


In [41]:
model, ridge_day_df = regulationCV_func(X, y, RidgeCV([0.001, 0.01, 0.1, 1, 10, 100], cv=5))
ridge_day_df

100.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.256336,155.341605
cross_val_score,-0.070368,28355.356789


In [42]:
model, lasso_day_df = regulationCV_func(X, y, LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=5))
lasso_day_df

1.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.256084,155.368008
cross_val_score,-0.113846,28583.021915


-----------
# hour 만

In [43]:
X = df.drop(["count", "year", "month", "day"], axis=1).values
y = df["count"].values

In [44]:
poly_lr_hour = poly_func(X, y, LinearRegression)
poly_lr_hour

Unnamed: 0,degree,MSE,RMSE,r2
0,1,21589.749241,146.934507,0.334652
1,2,16978.281727,130.300736,0.476767
2,3,14745.303883,121.430243,0.545583
3,4,14337.481465,119.739223,0.558151


In [None]:
# poly_rf_hour = poly_func(X, y, RandomForestRegressor)
# poly_rf_hour  

Unnamed: 0,degree,MSE,RMSE,r2
0,1,4971.228946,70.506943,0.846798
1,2,4836.745961,69.546718,0.850942
2,3,4865.861923,69.75573,0.850045
3,4,4868.028318,69.771257,0.849978


In [46]:
model, hour_lr = score_func(X, y, LinearRegression())
hour_lr

Unnamed: 0,r2,neg_mean_squared_error
score,0.334652,146.934507
cross_val_score,-0.009219,26021.950786


In [47]:
model, hour_rf = score_func(X, y, RandomForestRegressor())
hour_rf

Unnamed: 0,r2,neg_mean_squared_error
score,0.844848,70.954295
cross_val_score,0.380029,13072.948222


In [48]:
model, hour_el = score_func(X, y, ElasticNet())
hour_el

Unnamed: 0,r2,neg_mean_squared_error
score,0.333006,147.116203
cross_val_score,0.00374,25600.255589


In [49]:
model, hour_rd = score_func(X, y, Ridge())
hour_rd

Unnamed: 0,r2,neg_mean_squared_error
score,0.334652,146.934501
cross_val_score,-0.009212,26021.741226


In [None]:
model, ridge_hour_df = regulationCV_func(X, y, RidgeCV([0.001, 0.01, 0.1, 1, 10, 100, 1000], cv=5))
ridge_hour_df

100.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.334655,146.934212
cross_val_score,-0.009482,26022.113145


In [None]:
model, lasso_hour_df = regulationCV_func(X, y, LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000], cv=5))
lasso_hour_df

1.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.334586,146.9418
cross_val_score,-0.024856,25997.472589


-------------
# year 만 빼고

In [52]:
X = df.drop(["count", "year"], axis=1).values
y = df["count"].values

In [53]:
poly_lr_no_year = poly_func(X, y, LinearRegression)
poly_lr_no_year

Unnamed: 0,degree,MSE,RMSE,r2
0,1,21582.034931,146.908253,0.33489
1,2,16943.068475,130.165543,0.477853
2,3,14502.336989,120.425649,0.553071
3,4,14458.977132,120.245487,0.554407


In [None]:
# poly_rf_no_year = poly_func(X, y, RandomForestRegressor)
# poly_rf_no_year

In [None]:
model, no_year_lr = score_func(X, y, LinearRegression())
no_year_lr

Unnamed: 0,r2,neg_mean_squared_error
score,0.256358,155.339381
cross_val_score,-0.070309,28360.416476


In [None]:
no_year_rf = score_func(X, y, RandomForestRegressor())
no_year_rf

(RandomForestRegressor(),
                        r2  neg_mean_squared_error
 score            0.362407              143.837187
 cross_val_score -0.235888            31027.288464)

In [None]:
model, no_year_el = score_func(X, y, ElasticNet())
no_year_el

Unnamed: 0,r2,neg_mean_squared_error
score,0.25399,155.586511
cross_val_score,-0.055161,27932.907329


In [None]:
model, no_year_rd = score_func(X, y, Ridge())
no_year_rd

Unnamed: 0,r2,neg_mean_squared_error
score,0.256357,155.339401
cross_val_score,-0.0703,28360.197793


In [None]:
model, ridge_no_year_df = regulationCV_func(X, y, RidgeCV([0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], cv=5))
ridge_no_year_df

100.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.256336,155.341605
cross_val_score,-0.070368,28355.356789


In [None]:
model, lasso_no_year_df = regulationCV_func(X, y, LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=5))
lasso_no_year_df

1.0


Unnamed: 0,r2,neg_mean_squared_error
score,0.256084,155.368008
cross_val_score,-0.113846,28583.021915
