In [1]:
import numpy as np
import pandas as pd
import warnings

import calendar
from datetime import datetime

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / xgboost / lightgbm.
warnings.filterwarnings('ignore')

In [3]:
# Locations of the input CSV files.
train_csv_path = '/content/train.csv'
test_csv_path = '/content/test.csv'

raw_train = pd.read_csv(train_csv_path)  # training data (includes the 'count' target)
raw_test = pd.read_csv(test_csv_path)    # test data (no target columns)

In [4]:
# Work on copies so the frames as loaded from disk stay untouched.
train, test = raw_train.copy(), raw_test.copy()

In [5]:
# No missing values: every column reports the full 10886 non-null entries.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [6]:
# No missing values: every column reports the full 6493 non-null entries.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


In [7]:
# Parse the timestamp strings so the .dt accessor can be used below.
train['datetime'] = pd.to_datetime(train['datetime'])

# Calendar features: year, month, day and hour only.
# Minute and second are skipped because (per the author's note) they are always 0.
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour

# English weekday name ('Monday' ... 'Sunday') read directly from the timestamp;
# no temporary 'date' column or per-row lookup is needed.
train['weekday'] = train['datetime'].dt.day_name()

# Preview the first rows to confirm the new columns.
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,weekday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,Saturday
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,Saturday
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,Saturday
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,Saturday
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,Saturday


In [8]:
# Parse the timestamp strings so the .dt accessor can be used below.
test['datetime'] = pd.to_datetime(test['datetime'])

# Calendar features: year, month, day and hour only.
# Minute and second are skipped because (per the author's note) they are always 0.
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour

# English weekday name ('Monday' ... 'Sunday') read directly from the timestamp;
# no temporary 'date' column or per-row lookup is needed.
test['weekday'] = test['datetime'].dt.day_name()

# Preview the first rows to confirm the new columns.
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,weekday
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,0,Thursday
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,1,Thursday
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,2,Thursday
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,Thursday
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,4,Thursday


In [9]:
# One-hot encode the integer-coded 'season' and 'weather' columns.
# 'weekday' is intentionally not encoded here and stays a plain string column.
train = pd.get_dummies(data=train, columns=['season', 'weather'])

In [10]:
# One-hot encode the integer-coded 'season' and 'weather' columns.
# 'weekday' is intentionally not encoded here and stays a plain string column.
test = pd.get_dummies(data=test, columns=['season', 'weather'])

In [11]:
# Keep only training rows with a strictly positive windspeed reading,
# printing the frame shape before and after to show how many rows are dropped.
# NOTE(review): the test frame still contains windspeed == 0 rows and is not filtered.
print(train.shape)
has_wind = train['windspeed'] > 0
train = train.loc[has_wind]
print(train.shape)

(10886, 23)
(9573, 23)


In [13]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [14]:
def rmsle(y_true, y_pred, convertExp=True):
    """Root mean squared logarithmic error.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values.
        convertExp: If True, both inputs are taken to be on the log1p scale
            (the scale the target is trained on in this notebook) and are
            mapped back with expm1 before scoring. If False, the inputs are
            scored as given.

    Returns:
        float: sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2)).
    """
    # Accept lists / Series as well as arrays, and compare positionally.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Exact inverse of log1p; a plain exp() would leave every value off by one.
    if convertExp:
        y_true = np.expm1(y_true)
        y_pred = np.expm1(y_pred)

    # log1p is NaN for inputs below -1; such entries are replaced by 0.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [15]:
# Feature columns fed to the models.
# Currently disabled candidates: atemp, inverse_humidity, is_weekend, day,
# the raw season / weather codes, weekday and its one-hot columns.
training_cols = [
    # calendar flags
    'holiday',
    'workingday',
    # weather measurements
    'temp',
    'windspeed',
    'humidity',
    # time of day
    'hour',
    # one-hot season / weather indicators
    'season_2',
    'season_3',
    'season_4',
    'weather_1',
    'weather_2',
]

In [16]:
# Train and test feature matrices use the same column list so they line up.
X_train, X_test = train[training_cols], test[training_cols]

# Target: rental count, modelled on the log1p scale.
y_train = train['count']
log_y = np.log1p(y_train)

In [17]:
# The target is already on the log1p scale, so plain RMSE on it is the RMSLE of
# the raw counts; asking for 'rmsle' would apply the log transform a second time.
# The metric is set on the estimator because passing eval_metric to fit() is
# deprecated in XGBoost's scikit-learn API and rejected by newer releases.
# It is only reported when an eval_set is supplied, which this fit does not do.
XGB = XGBRegressor(eval_metric='rmse')
XGB.fit(X_train, log_y)

In [18]:
# The model was fit on log1p(count), so its raw output is on that scale.
log_pred = XGB.predict(X_test)

# Invert the log1p transform to get back to rental counts.
predict = np.expm1(log_pred)

# Two-column submission frame: timestamp and predicted count.
result = pd.DataFrame({'datetime': test['datetime'], 'count': predict})
result.to_csv('sampleSubmission.csv', index=False)

In [19]:
# Fit LightGBM on the same features and log1p target.
# 'rmse' on the log1p target corresponds to RMSLE on the raw counts; 'rmsle' is
# not among LightGBM's documented metric names. The metric is only reported when
# an eval_set is passed, so it does not change this fit.
lgbm_reg = LGBMRegressor()
lgbm_reg.fit(X_train, log_y, eval_metric='rmse')

# Predict on the log1p scale, then invert with expm1 to get counts.
predict = lgbm_reg.predict(X_test)
predict = np.expm1(predict)

# NOTE(review): same output path as the XGBoost cell, so that file is overwritten.
result = pd.DataFrame({
    'datetime': test['datetime'],
    'count': predict
})
result.to_csv('sampleSubmission.csv', index=False)