In [0]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

plt.style.use('ggplot')
mpl.rcParams['axes.unicode_minus'] = False
import pandas as pd

In [5]:
train = pd.read_csv('bike_train.csv', parse_dates = ['datetime'])
train.shape

(10886, 12)

In [7]:
test = pd.read_csv('bike_test.csv', parse_dates = ['datetime'])
test.shape

(6493, 9)

In [9]:
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.dayofweek
train.shape

(10886, 16)

In [10]:
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['hour'] = test['datetime'].dt.hour
test['dayofweek'] = test['datetime'].dt.dayofweek
test.shape

(6493, 13)

In [0]:
# 범주형 피처를 카테고리형으로 변경
categorical_feature_names = ['season', 'holiday', 'workingday' , 'weather', 'dayofweek', 'month', 'year', 'hour']
for var in categorical_feature_names:
    train[var] = train[var].astype('category')
    test[var] = test[var].astype('category')

In [15]:
feature_names = ['season', 'weather', 'temp', 'atemp', 'humidity',  'year', 'hour', 'dayofweek',
                            'holiday', 'workingday']
feature_names

['season',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'year',
 'hour',
 'dayofweek',
 'holiday',
 'workingday']

In [16]:
X_train = train[feature_names]
print(X_train.shape)
X_train.head()

(10886, 10)


Unnamed: 0,season,weather,temp,atemp,humidity,year,hour,dayofweek,holiday,workingday
0,1,1,9.84,14.395,81,2011,0,5,0,0
1,1,1,9.02,13.635,80,2011,1,5,0,0
2,1,1,9.02,13.635,80,2011,2,5,0,0
3,1,1,9.84,14.395,75,2011,3,5,0,0
4,1,1,9.84,14.395,75,2011,4,5,0,0


In [17]:
X_test = test[feature_names]
print(X_test.shape)
X_test.head()

(6493, 10)


Unnamed: 0,season,weather,temp,atemp,humidity,year,hour,dayofweek,holiday,workingday
0,1,1,10.66,11.365,56,2011,0,3,0,1
1,1,1,10.66,13.635,56,2011,1,3,0,1
2,1,1,10.66,13.635,56,2011,2,3,0,1
3,1,1,10.66,12.88,56,2011,3,3,0,1
4,1,1,10.66,12.88,56,2011,4,3,0,1


In [18]:
label_name = 'count'
y_train = train[label_name]
y_train.head()

0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

## RMSLE

In [19]:
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values):
    # 넘파이를 배열 형태로 변경
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값, 실제값에 1을 더하고 로그를 씌어줌
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    difference = log_predict - log_actual
    difference = np.square(difference)
    
    mean_difference = difference.mean()
    
    score = np.sqrt(mean_difference)
    
    return score

rmsle_scorer = make_scorer(rmsle)
rmsle_scorer

make_scorer(rmsle)

In [27]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
import numpy as np
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore', category = DeprecationWarning)
IModel = LinearRegression()
y_train_log = np.log1p(y_train)
IModel.fit(X_train, y_train_log)

preds = IModel.predict(X_train)
print('RMSLE Value For Linear Regression:', rmsle(np.exp(y_train_log), np.exp(preds)))

RMSLE Value For Linear Regression: 0.9803697923313522


## 릿지

In [0]:
ridge_m_ = Ridge()
ridge_params = {'max_iter': [3000], 'alpha': [0.01, 0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 