In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# Read the CSV and discard every row that contains a missing value.
data = pd.read_csv('../data/df_charac.csv').dropna()

def iqr(df, columns, factor=1.5):
    """Clip values in the given columns to their IQR fences.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of that
    column in ``df``. Values outside the fences are replaced by the nearest
    fence; no rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : str or iterable of str
        Column name(s) to clip. A single name may be passed as a plain string.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns clipped.
    """
    # A bare string would otherwise be iterated character by character below.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    df_clipped = df.copy()
    if not columns:
        # Nothing to clip; avoid computing quantiles on an empty selection.
        return df_clipped

    q1 = df[columns].quantile(0.25)
    q3 = df[columns].quantile(0.75)
    spread = q3 - q1

    lower = q1 - factor * spread
    upper = q3 + factor * spread

    for column in columns:
        df_clipped[column] = df[column].clip(lower=lower[column], upper=upper[column])

    return df_clipped

# Clip `visitors` to its IQR fences, then one-hot encode the `target` column.
# NOTE(review): the fences are computed on the full dataset before the
# train/test split, and `visitors` is also the prediction target, so the test
# targets are clipped too — confirm this is intended.
data = pd.get_dummies(
    iqr(data, ['visitors']),
    columns=['target'],
    drop_first=False,
)

# Feature matrix: month, cost, the three `target_*` dummies and four other columns.
X = data[[
    'month',
    'cost',
    'target_family',
    'target_old',
    'target_youth',
    'Fe_festival_conc',
    'non_festival_conc',
    'non_local',
    'non_foreigner',
]]
# Regression target.
Y = data['visitors']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [2]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(best_visitors, X_test, Y_test):
    """Print the MSE and R² of a fitted visitors model on the given test data.

    Parameters
    ----------
    best_visitors : fitted estimator exposing ``predict``.
    X_test : features passed to ``predict``.
    Y_test : true target values compared against the predictions.

    Returns
    -------
    None. The metrics are only printed.
    """
    # Evaluate the visitors model.
    predictions = best_visitors.predict(X_test)
    r2 = r2_score(Y_test, predictions)
    mse = mean_squared_error(Y_test, predictions)

    # Print the results.
    print(f"Visitors Model Performance:")
    print(f"  MSE: {mse}")
    print(f"  R²: {r2}")


Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestRegressor

def train_models(X_train, Y_train):
    """Fit a baseline random forest regressor and return it.

    The forest uses 100 trees, a fixed seed of 42 and all available cores
    (``n_jobs=-1``).

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    )
    # scikit-learn's fit() returns the estimator itself.
    return forest.fit(X_train, Y_train)
# Fit the baseline random forest on the training split.
rf_visitors = train_models(X_train, Y_train)

# Print its MSE and R² on the held-out test split.
evaluate_models(rf_visitors, X_test, Y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

def tune_rf(X_train, y_train_visitors, y_train_vicost=None):
    """Grid-search random forest hyperparameters for the visitors target.

    Runs a 5-fold ``GridSearchCV`` scored by negative mean squared error,
    prints the best parameter combination and returns the best estimator.

    Parameters
    ----------
    X_train : training features.
    y_train_visitors : training target values.
    y_train_vicost : ignored. The function never reads it; it is kept as an
        optional argument only so that both two- and three-argument calls work.

    Returns
    -------
    ``best_estimator_`` of the fitted grid search.
    """
    param_grid_rf = {
        'n_estimators': np.arange(50, 90, 20),  # integer steps: 50, 70
        'max_depth': [3, 15, 17],
        'min_samples_split': [2, 7, 8],
    }

    rf_visitors = RandomForestRegressor(random_state=42, n_jobs=-1)
    grid_search_rf_visitors = GridSearchCV(
        estimator=rf_visitors,
        param_grid=param_grid_rf,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search_rf_visitors.fit(X_train, y_train_visitors)

    print(f"Best visitors RF : {grid_search_rf_visitors.best_params_}")

    return grid_search_rf_visitors.best_estimator_

# Grid-search the random forest on the training split and keep the best estimator.
best_rf_visitors = tune_rf(X_train, Y_train)


In [None]:
# Print MSE and R² of the tuned random forest on the held-out test split.
evaluate_models(best_rf_visitors, X_test, Y_test)

XGBoost (xgb)

In [3]:
import xgboost as xgb

def train_xgboost(X_train, Y_train):
    """Fit a baseline XGBoost regressor and return it.

    The model uses 100 boosting rounds and a fixed seed of 42. Only one model
    is trained: a second, identical fit whose result was never returned or
    used has been removed.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    The fitted ``xgb.XGBRegressor``.
    """
    xgb_visitors = xgb.XGBRegressor(n_estimators=100, random_state=42)
    xgb_visitors.fit(X_train, Y_train)

    return xgb_visitors

# Fit the baseline XGBoost model on the training split.
xgb_visitors = train_xgboost(X_train, Y_train)

# Print its MSE and R² on the held-out test split.
evaluate_models(xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 7799992862.926983
  R²: 0.16956532833284454


In [5]:
from sklearn.model_selection import GridSearchCV
def tune_xgb(X_train, Y_train):
    """Grid-search XGBoost hyperparameters for the visitors target.

    Runs a 5-fold ``GridSearchCV`` scored by negative mean squared error,
    prints the best parameter combination and returns the best estimator.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    ``best_estimator_`` of the fitted grid search.
    """
    # The candidates are spelled out instead of generated with float-step
    # np.arange, whose length depends on rounding: arange(0.6, 0.8, 0.1)
    # produces a third value, 0.7999999999999999. The lists below are the grid
    # that was actually being searched, written exactly.
    param_grid_xgb = {
        'n_estimators': [90, 100],
        'max_depth': [10, 11, 12],
        'learning_rate': [0.02, 0.03],
        'subsample': [0.65, 0.7, 0.75],
        'colsample_bytree': [0.6, 0.7, 0.8],
    }

    xgb_visitors = xgb.XGBRegressor(random_state=42, n_jobs=-1)
    grid_search_xgb_visitors = GridSearchCV(
        estimator=xgb_visitors,
        param_grid=param_grid_xgb,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid_search_xgb_visitors.fit(X_train, Y_train)

    print(f"Best xgb visitors: {grid_search_xgb_visitors.best_params_}")

    return grid_search_xgb_visitors.best_estimator_

# Grid-search XGBoost on the training split and keep the best estimator.
best_xgb_visitors = tune_xgb(X_train, Y_train)

Best xgb visitors: {'colsample_bytree': 0.7999999999999999, 'learning_rate': 0.03, 'max_depth': 11, 'n_estimators': 100, 'subsample': 0.7}


In [6]:
# Print MSE and R² of the tuned XGBoost model on the held-out test split.
evaluate_models(best_xgb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 5590753632.578279
  R²: 0.40477437110117


Gradient Boosting Regressor (GBR)

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

def train_gradient_boosting(X_train, Y_train):
    """Fit a baseline gradient boosting regressor and return it.

    The model uses 100 boosting stages and a fixed seed of 42.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    The fitted ``GradientBoostingRegressor``.
    """
    booster = GradientBoostingRegressor(
        n_estimators=100,
        random_state=42,
    )
    # scikit-learn's fit() returns the estimator itself.
    return booster.fit(X_train, Y_train)

# Fit the baseline gradient boosting model on the training split.
gb_visitors = train_gradient_boosting(X_train, Y_train)

# Print its MSE and R² on the held-out test split.
evaluate_models(gb_visitors, X_test, Y_test)


Visitors Model Performance:
  MSE: 6430057859.69753
  R²: 0.3154169393028179


In [15]:
from sklearn.model_selection import GridSearchCV
def tune_gb(X_train, Y_train):
    """Grid-search gradient boosting hyperparameters for the visitors target.

    Runs a 5-fold ``GridSearchCV`` scored by negative mean squared error,
    prints the best parameter combination and returns the best estimator.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    ``best_estimator_`` of the fitted grid search.
    """
    search_space = dict(
        n_estimators=[150, 170, 190],
        max_depth=[4, 5, 6],
        learning_rate=[0.015, 0.02, 0.025],
        subsample=[0.6],
    )

    search = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, Y_train)

    print(f"Best gb visitors: {search.best_params_}")

    return search.best_estimator_

# Grid-search gradient boosting on the training split and keep the best estimator.
best_gb_visitors = tune_gb(X_train, Y_train)


Best gb visitors: {'learning_rate': 0.02, 'max_depth': 5, 'n_estimators': 170, 'subsample': 0.6}


In [14]:
# Print MSE and R² of the tuned gradient boosting model on the held-out test split.
evaluate_models(best_gb_visitors, X_test, Y_test)

Visitors Model Performance:
  MSE: 5619698291.542319
  R²: 0.40169274669641364


RANSAC

In [None]:
from sklearn.linear_model import RANSACRegressor

def train_ransac(X_train, Y_train):
    """Fit a baseline RANSAC regressor and return it.

    The regressor is created with default settings apart from a fixed seed
    of 42.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    The fitted ``RANSACRegressor``.
    """
    ransac_visitors = RANSACRegressor(random_state=42)
    # Fit on the function's own argument; the name `y_train_visitors` is not
    # defined in this scope and raised NameError.
    ransac_visitors.fit(X_train, Y_train)

    return ransac_visitors

# Fit the baseline RANSAC regressor on the training split.
ransac_visitors = train_ransac(X_train, Y_train)

# Print its MSE and R² on the held-out test split.
evaluate_models(ransac_visitors, X_test, Y_test)
    


In [None]:
def tune_rs(X_train, Y_train):
    """Grid-search RANSAC hyperparameters for the visitors target.

    Runs a 5-fold ``GridSearchCV`` scored by negative mean squared error,
    prints the best parameter combination and returns the best estimator.

    Parameters
    ----------
    X_train : training features.
    Y_train : training target values.

    Returns
    -------
    ``best_estimator_`` of the fitted grid search.
    """
    param_grid_ransac = {
        'min_samples': [0.5, 0.7, 0.9],
        # NOTE(review): residual_threshold is expressed in units of the target.
        # The MSE values recorded in this notebook are in the billions, so
        # thresholds of 5-20 may be too tight to form a consensus set — confirm.
        'residual_threshold': [5, 10, 20],
        'max_trials': [50, 100, 150],
    }
    # RANSACRegressor does not accept n_jobs (passing it raises TypeError), so
    # the parallelism is requested from GridSearchCV instead.
    ransac_visitors = RANSACRegressor(random_state=42)
    grid_search_ransac_visitors = GridSearchCV(
        estimator=ransac_visitors,
        param_grid=param_grid_ransac,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search_ransac_visitors.fit(X_train, Y_train)

    print(f"Best rs visitors: {grid_search_ransac_visitors.best_params_}")

    return grid_search_ransac_visitors.best_estimator_

# Grid-search RANSAC on the training split and keep the best estimator.
best_rs_visitors = tune_rs(X_train, Y_train)

In [None]:
# Print MSE and R² of the tuned RANSAC regressor on the held-out test split.
evaluate_models(best_rs_visitors, X_test, Y_test)