In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
data = pd.read_csv('../data/df_charac.csv')

data.drop(columns='index_y' ,inplace= True)
# nan 데이터 드랍.   

# dummy iqr 로 이상치 처리

def iqr(df, columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    df_clipped = df.copy()
    
    for column in columns:
        df_clipped[column] = df[column].clip(lower=lower[column], upper=upper[column])
        
    return df_clipped

# 한글 column명 영어로 변경
data.rename(columns={'축제기간(일)' : "festive_period"}, inplace=True)
data['festive_period'].fillna(data['festive_period'].mean(), inplace=True)


data = data.dropna()
data = iqr(data, ['visitors'])

# str 데이터 one_hot_encoding으로 분리
data = pd.get_dummies(data, columns=['target'], drop_first=False)
data = pd.get_dummies(data, columns=['month'], drop_first=False)

X = data[['festive_period',
          'cost', 
          'target_family', 
          'target_old', 
          'target_youth',
          'Fe_festival_conc',
          'non_festival_conc',
          'non_local',
          'non_foreigner',
          'month_1', 
          'month_2',
          'month_3', 
          'month_4', 
          'month_5', 
          'month_6', 
          'month_7', 
          'month_8',
          'month_9', 
          'month_10', 
          'month_11', 
          'month_12'
          ]]
Y = data['visitors']
X_train, X_test, Y_train, Y_test = train_test_split(
X, Y, test_size=0.2, random_state=5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['festive_period'].fillna(data['festive_period'].mean(), inplace=True)


In [120]:
# 모델 평가 함수 제작. mse 와 R2score 사용.

from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, X_test, Y_test):
    # Visitors 모델 평가
    y_pred = best_visitors.predict(X_test)
    mse_visitors = mean_squared_error(Y_test, y_pred)
    r2_visitors = r2_score(Y_test, y_pred)


    # 결과 출력
    print(f"Visitors Model Performance:")
    print(f"  MSE: {mse_visitors}")
    print(f"  R²: {r2_visitors}")


# RandomForestRegressor

In [121]:
from sklearn.ensemble import RandomForestRegressor

def train_models(X_train, Y_train):
    rf_visitors = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_visitors.fit(X_train, Y_train)

    return rf_visitors
rf_visitors = train_models(X_train, Y_train)

evaluate_models(rf_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2465579100.7675934
  R²: 0.6132436932433979


In [123]:
from sklearn.model_selection import GridSearchCV
# 그리드 서치 사용한 하이퍼파라미터 조정

def tune_rf(X_train, Y_train):
    param_grid_rf = {
        'n_estimators': np.arange(40, 60, 10),
        'max_depth': [17,15,16],
        'min_samples_split':  [3,4,5]}

    rf_visitors = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid_search_rf_visitors = GridSearchCV(estimator=rf_visitors, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')
    grid_search_rf_visitors.fit(X_train, Y_train)
    
    print(f"Best visitors RF : {grid_search_rf_visitors.best_params_}")

    return grid_search_rf_visitors.best_estimator_


best_rf_visitors = tune_rf(X_train, Y_train)


Best visitors RF : {'max_depth': 15, 'min_samples_split': 4, 'n_estimators': 50}


In [124]:
# 모델 평가
evaluate_models(best_rf_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2544515907.380647
  R²: 0.6008614874632952


# Xgboost

In [125]:
import xgboost as xgb

def train_xgboost(X_train, Y_train):
    
    xgb_visitors = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_visitors.fit(X_train, Y_train)

    xgb_vicost = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_vicost.fit(X_train, Y_train)
    
    return xgb_visitors

xgb_visitors = train_xgboost(X_train, Y_train)

evaluate_models(xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 3118230966.5619373
  R²: 0.5108672474283589


In [126]:
from sklearn.model_selection import GridSearchCV
def tune_xgb(X_train, Y_train):
    param_grid_xgb = {
        'n_estimators': np.arange(90,110,10),
        'max_depth': [7,8, 9,10],
        'learning_rate': [0.03],
        'subsample': [0.75,0.8,0.85],
        'colsample_bytree': np.arange(0.5, 0.7, 0.1)}

    xgb_visitors = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    grid_search_xgb_visitors = GridSearchCV(estimator=xgb_visitors, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
    grid_search_xgb_visitors.fit(X_train, Y_train)
    
    print(f"Best xgb visitors: {grid_search_xgb_visitors.best_params_}")

    return grid_search_xgb_visitors.best_estimator_

best_xgb_visitors = tune_xgb(X_train, Y_train)

Best xgb visitors: {'colsample_bytree': 0.6, 'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}


In [127]:
evaluate_models(best_xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 2394817925.674364
  R²: 0.6243434509969799


In [128]:
from sklearn.model_selection import KFold, cross_val_score


kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(best_xgb_visitors, X, Y, cv=kf)

print("K-Fold Cross Validation Scores:", cv_scores)
print("Average K-Fold Score:", np.mean(cv_scores))


K-Fold Cross Validation Scores: [0.48906711 0.64378576 0.63624501 0.53677194 0.46643444 0.57923616
 0.62930938 0.55056228 0.40326712 0.57712856]
Average K-Fold Score: 0.5511807753691397


In [129]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_mse_scores = []
cv_r2_scores = []

for train_index, test_index in kf.split(X_train):

    X_train_fold, X_test_fold = X_train.iloc[train_index].values, X_train.iloc[test_index].values
    Y_train_fold, Y_test_fold = Y_train.iloc[train_index].values, Y_train.iloc[test_index].values
    
    # 모델을 학습시키고 예측하기 전에 이전 학습된 모델을 이어서 학습하도록 설정
    best_xgb_visitors.fit(X_train_fold, Y_train_fold, 
                          eval_set=[(X_test_fold, Y_test_fold)], 
                          verbose=False) 
    
    Y_pred_fold = best_xgb_visitors.predict(X_test_fold)
    
    # MSE 계산
    mse_fold = mean_squared_error(Y_test_fold, Y_pred_fold)
    cv_mse_scores.append(mse_fold)
    
    # R² 계산
    r2_fold = r2_score(Y_test_fold, Y_pred_fold)
    cv_r2_scores.append(r2_fold)
    
    print(f"Fold MSE: {mse_fold}, Fold R²: {r2_fold}")


evaluate_models(best_xgb_visitors, X_test, Y_test)
print(f"Average Cross-Validation MSE: {np.mean(cv_scores)}")
print(f"Average Cross-Validation R²: {np.mean(cv_r2_scores)}")

Fold MSE: 5275639017.519821, Fold R²: 0.46624749304310265
Fold MSE: 5094269651.532467, Fold R²: 0.49429385827207584
Fold MSE: 4498835673.15138, Fold R²: 0.5811488389089432
Fold MSE: 4657894355.041716, Fold R²: 0.5732180511948237
Fold MSE: 6385945568.520371, Fold R²: 0.46793532272756855
Visitors Model Performance:
  MSE: 2753022649.756929
  R²: 0.5681546488994076
Average Cross-Validation MSE: 0.5511807753691397
Average Cross-Validation R²: 0.5165687128293028


# GradientBoostingRegressor

In [130]:
from sklearn.ensemble import GradientBoostingRegressor

def train_gradient_boosting(X_train, Y_train):
    gb_visitors = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_visitors.fit(X_train, Y_train)

    return gb_visitors

gb_visitors = train_gradient_boosting(X_train, Y_train)

evaluate_models(gb_visitors, X_test, Y_test)


Visitors Model Performance:
  MSE: 3316822581.651091
  R²: 0.47971571812605507


In [131]:
from sklearn.model_selection import GridSearchCV
def tune_gb(X_train, Y_train):
    param_grid_gb = {
        'n_estimators': [190,200,210],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.04, 0.02, 0.03],
        'subsample': [0.7, 0.75, 0.8]}
    
    gb_visitors = GradientBoostingRegressor(random_state=42)
    grid_search_gb_visitors = GridSearchCV(estimator=gb_visitors, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error')
    grid_search_gb_visitors.fit(X_train, Y_train)

    print(f"Best gb visitors: {grid_search_gb_visitors.best_params_}")

    return grid_search_gb_visitors.best_estimator_

best_gb_visitors = tune_gb(X_train, Y_train)


Best gb visitors: {'learning_rate': 0.02, 'max_depth': 6, 'n_estimators': 210, 'subsample': 0.75}


In [101]:
evaluate_models(best_gb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 4227388971.0279307
  R²: 0.549926463541971


# RANSACRegressor

In [17]:
from sklearn.linear_model import RANSACRegressor

def train_ransac(X_train, Y_train):
    ransac_visitors = RANSACRegressor(random_state=42)
    ransac_visitors.fit(X_train, Y_train)
    
    return ransac_visitors

ransac_visitors = train_ransac(X_train, Y_train)

evaluate_models(ransac_visitors, X_test, Y_test)
    


Visitors Model Performance:
  MSE: 9101157440.954845
  R²: 0.031035435020349


In [19]:
def tune_rs(X_train, Y_train):
    param_grid_ransac = {
        'min_samples': [0.5, 0.7, 0.9],
        'residual_threshold': [5, 10, 20],
        'max_trials': [50, 100, 150],
    }
    ransac_visitors = RANSACRegressor(random_state=42)
    grid_search_ransac_visitors = GridSearchCV(estimator=ransac_visitors, param_grid=param_grid_ransac, cv=5, scoring='neg_mean_squared_error')
    grid_search_ransac_visitors.fit(X_train, Y_train)

    print(f"Best rs visitors: {grid_search_ransac_visitors.best_params_}")

    return grid_search_ransac_visitors.best_estimator_

best_rs_visitors = tune_rs(X_train, Y_train)

40 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 66, in inner_f
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "c:\anaconda3\Lib\site-packages\sklearn\linear_model\_ransac.py", line 572, in fit
    raise

Best rs visitors: {'max_trials': 150, 'min_samples': 0.7, 'residual_threshold': 10}




In [20]:
evaluate_models(best_rs_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 12637302227.055681
  R²: -0.34544404207896995
