In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence warnings for the rest of the session.
# NOTE(review): this blanket filter hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Matplotlib setup for Korean text rendering.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,  # stop the minus sign from rendering as a broken glyph
})

In [8]:
# Read the bike-sharing-demand training CSV into `df`.
train_csv_path = "../data/bike-sharing-demand/train.csv"
df = pd.read_csv(train_csv_path)

In [23]:
# Basic DataFrame overview: index range, column names, non-null counts, dtypes.
# `DataFrame.info()` writes its report to stdout and returns None, so it is
# called directly rather than wrapped in print() (which would append "None").
# NOTE(review): the saved output below comes from execution count 23, i.e. after
# the later preprocessing cells had run; a top-to-bottom rerun will list the
# columns as loaded from the CSV instead.
df.info()

# Missing-value count per column.
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   holiday     10886 non-null  int64  
 1   workingday  10886 non-null  int64  
 2   temp        10886 non-null  float64
 3   atemp       10886 non-null  float64
 4   humidity    10886 non-null  int64  
 5   windspeed   10886 non-null  float64
 6   count       10886 non-null  int64  
 7   year        10886 non-null  int32  
 8   month       10886 non-null  int32  
 9   day         10886 non-null  int32  
 10  hour        10886 non-null  int32  
 11  weekday     10886 non-null  int32  
 12  season_2    10886 non-null  bool   
 13  season_3    10886 non-null  bool   
 14  season_4    10886 non-null  bool   
 15  weather_2   10886 non-null  bool   
 16  weather_3   10886 non-null  bool   
 17  weather_4   10886 non-null  bool   
dtypes: bool(6), float64(3), int32(5), int64(4)
memory usage: 871.9 KB


In [9]:
# 1. Convert the 'datetime' column to pandas datetime values.
df['datetime'] = pd.to_datetime(df['datetime'])

# Derive year, month, day, hour and weekday columns from the converted values.
# Each entry maps the new column name to the `.dt` accessor attribute it is read from.
calendar_fields = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'hour': 'hour',
    'weekday': 'dayofweek',  # Monday: 0, Sunday: 6
}
for feature_name, dt_attribute in calendar_fields.items():
    df[feature_name] = getattr(df['datetime'].dt, dt_attribute)

In [10]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, leaving one indicator per remaining level.
categorical_columns = ['season', 'weather']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [11]:
# Remove the 'datetime', 'casual' and 'registered' columns from the table.
# NOTE(review): presumably casual/registered are components of 'count' and
# would leak the target if kept as features — confirm against the data description.
columns_to_discard = ['datetime', 'casual', 'registered']
df = df.drop(columns=columns_to_discard)

In [13]:
# Target: the 'count' column. Features: every other column of df.
target_column = 'count'
y = df[target_column]
X = df.drop(columns=[target_column])

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns.
# The scaler is fitted on every row of X and then applied to that same X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing. The split is made on the unscaled
# features so that the scaler can be fitted on the training rows only;
# fitting it on all rows would let test-set statistics leak into training.
# random_state=42 keeps the partition reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows, then apply that same fitted transform
# to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [16]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings: fit on the training split,
# then predict the held-out rows. `fit` returns the estimator itself.
model_lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters.
# random_state is pinned (same value as the train/test split) because the
# forest is randomized; without it every rerun reports different metrics.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)

# Predictions for the held-out rows, consumed by the evaluation cell.
y_pred_rf = model_rf.predict(X_test)

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters.
# random_state is pinned (same value as the train/test split) so the seed
# handed to each tree is fixed and reruns reproduce the same model.
model_gbr = GradientBoostingRegressor(random_state=42)
model_gbr.fit(X_train, y_train)

# Predictions for the held-out rows, consumed by the evaluation cell.
y_pred_gbr = model_gbr.predict(X_test)

In [19]:
from xgboost import XGBRegressor

# XGBoost regressor with default settings: fit on the training split,
# then predict the held-out rows. `fit` returns the estimator itself.
model_xgb = XGBRegressor().fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

In [20]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default settings: fit on the training split,
# then predict the held-out rows. `fit` returns the estimator itself.
model_lgb = LGBMRegressor().fit(X_train, y_train)
y_pred_lgb = model_lgb.predict(X_test)

In [21]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate(y_test, y_pred, model_name):
    """Print RMSE and R² for one model's predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        model_name: Label used in the report header.

    Returns:
        None. The report is written to stdout, followed by a blank line.
    """
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_test, y_pred))
    r_squared = r2_score(y_test, y_pred)

    report_lines = (
        f"{model_name} Performance:",
        f"RMSE: {root_mse}",
        f"R² Score: {r_squared}\n",
    )
    print(*report_lines, sep="\n")

# Report every fitted model against the same held-out targets, in training order.
model_predictions = [
    ('Linear Regression', y_pred_lr),
    ('Random Forest', y_pred_rf),
    ('Gradient Boosting', y_pred_gbr),
    ('XGBoost', y_pred_xgb),
    ('LightGBM', y_pred_lgb),
]
for model_label, predictions in model_predictions:
    evaluate(y_test, predictions, model_label)

Linear Regression Performance:
RMSE: 140.63099804599423
R² Score: 0.4008205520112573

Random Forest Performance:
RMSE: 38.7429851981764
R² Score: 0.9545241367389056

Gradient Boosting Performance:
RMSE: 68.33733922654201
R² Score: 0.8585148299842553

XGBoost Performance:
RMSE: 37.82438764978134
R² Score: 0.956655040415417

LightGBM Performance:
RMSE: 36.97310004683724
R² Score: 0.9585841553828849

