# 생성한 모델을 비교 분석

In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import numpy as np
import glob
import joblib
from sklearn.svm import SVC

In [2]:
# Load the feature table that all models below score; the path is relative to the working directory.
data = pd.read_csv("../model_predict_data.csv")

# 10k 모델 불러오기

## CatBoost

In [3]:
# Create an empty classifier and populate it from the saved 10k CatBoost model file.
cb_10k = CatBoostClassifier()
# Left as the cell's final expression, so the notebook echoes the loaded model object.
cb_10k.load_model("catboost_model_10k.cbm")

<catboost.core.CatBoostClassifier at 0x22d21eef1d0>

In [4]:
# Feature names stored in the loaded CatBoost model, copied into a plain list;
# used later to select the model's input columns from `data`.
cb_10k_features = list(cb_10k.feature_names_)

범주형 변수 확인

In [5]:
# Translate the model's categorical feature indices into feature names.
cb_10k_cat_features = [cb_10k_features[i] for i in cb_10k.get_cat_feature_indices()]

In [6]:
# Display the categorical feature names (the recorded output is an empty list).
cb_10k_cat_features

[]

## LightGBM

In [7]:
# Load the saved 10k LightGBM booster from its text model file.
lgb_10k = lgb.Booster(model_file='lightgbm_model_10k.txt')

In [8]:
# Feature names recorded in the booster; used later to select its input columns from `data`.
lgb_10k_features = lgb_10k.feature_name()

lightgbm은 범주형 변수를 사용하지 않았음

In [9]:
# Show the booster's objective in lower case. Falls back to "" when `params`
# is None/empty or has no "objective" key, so the lookup never raises.
(lgb_10k.params or {}).get("objective", "").lower()

'binary'

# 5m 모델, 데이터 변환

In [10]:
# Load the 5m SVM model and the two PCA objects from disk, in that order.
# NOTE(review): joblib.load unpickles its input; only load files from a trusted source.
svm_5m, pca_1, pca_2 = (
    joblib.load(artifact_path)
    for artifact_path in ('svm_model_5m.joblib', 'pca_1.joblib', 'pca_2.joblib')
)

## 데이터 변환

사용 컬럼

In [11]:
# Columns pulled from `data` for the SVM pipeline, in their original order.
use_cols = [
    # Audience / show metrics (the same names appear in `log_cols`).
    'wk1_Audience',
    'wk1_AudiencePerShow',
    'wk2_Audience',
    'wk2_AudiencePerShow',
    'Show_Change',
    'opening_Ho_Retention',
    'wk1_Holiday_AudienceMean',
    'wk1_Holiday_ShowMean',
    'wk2_Holiday_AudienceMean',
    'wk2_Holiday_ShowMean',
    'opening_AudienceStd',
    # Distributor indicator columns.
    'dist_big_flop',
    'dist_big_hit',
    'dist_small_flop',
    'dist_small_hit',
    # Categorical columns that are one-hot encoded later.
    'Month',
    'Pandemic',
    'Grade',
    'Main_Country',
]

In [12]:
# Work on an independent copy so the SVM-specific transforms below leave `data` untouched.
data_svm = data[use_cols].copy()

In [13]:
# Columns that receive the log1p transform and are then passed through the scaler.
log_cols = [
    'wk1_Audience',
    'wk1_AudiencePerShow',
    'wk2_Audience',
    'wk2_AudiencePerShow',
    'Show_Change',
    'opening_Ho_Retention',
    'wk1_Holiday_AudienceMean',
    'wk1_Holiday_ShowMean',
    'wk2_Holiday_AudienceMean',
    'wk2_Holiday_ShowMean',
    'opening_AudienceStd',
]

In [14]:
# Replace the listed columns with log(1 + x), written back into the same columns.
# NOTE(review): log1p yields NaN for values below -1; Show_Change looks like it
# could be negative — confirm the value range matches what was used in training.
data_svm[log_cols] = np.log1p(data_svm[log_cols])

In [15]:
# Load the persisted scaler that is applied to the log-transformed columns below.
scaler = joblib.load('svm_scaler.joblib')

In [16]:
# Apply the loaded scaler to the log-transformed columns and write the result back.
# Assumes the scaler was fit on these columns in this order — TODO confirm.
data_svm[log_cols] = scaler.transform(data_svm[log_cols])

In [17]:
# Display summary statistics of data_svm after the log and scaling steps.
data_svm.describe()

Unnamed: 0,wk1_Audience,wk1_AudiencePerShow,wk2_Audience,wk2_AudiencePerShow,Show_Change,opening_Ho_Retention,wk1_Holiday_AudienceMean,wk1_Holiday_ShowMean,wk2_Holiday_AudienceMean,wk2_Holiday_ShowMean,opening_AudienceStd,dist_big_flop,dist_big_hit,dist_small_flop,dist_small_hit,Month,Pandemic
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0
mean,-0.12051,-0.195763,-0.078637,-0.174922,0.101448,0.308105,-0.184546,-0.09641,-0.013701,0.092381,-0.162943,0.37931,0.517241,0.034483,0.103448,7.586207,2.0
std,1.164085,0.673526,1.203301,0.743353,1.07013,1.092788,1.213417,1.264681,1.152806,1.14854,1.182112,0.621852,0.508548,0.185695,0.309934,1.15007,0.0
min,-2.302474,-1.2829,-2.205181,-1.456706,-1.608772,-1.446642,-2.375028,-2.445312,-2.217078,-1.956005,-2.464185,0.0,0.0,0.0,0.0,5.0,2.0
25%,-0.941096,-0.727653,-0.92985,-0.948639,-0.682911,-0.377843,-0.896739,-0.873041,-0.858157,-0.742743,-0.966764,0.0,0.0,0.0,0.0,7.0,2.0
50%,-0.20648,-0.22013,-0.234749,-0.070393,-0.120301,0.310756,-0.3061,-0.001927,-0.106304,-0.00454,-0.102888,0.0,1.0,0.0,0.0,8.0,2.0
75%,0.831401,0.119673,0.856119,0.418088,0.723581,0.865898,0.673619,0.989968,0.861609,0.960456,0.773659,1.0,1.0,0.0,0.0,8.0,2.0
max,1.921818,1.314558,1.920317,0.898289,2.801633,3.48714,1.896309,1.868067,1.797128,1.944598,1.897899,2.0,1.0,1.0,1.0,12.0,2.0


In [18]:
# List the distinct Month values present in the prediction data.
sorted(data_svm["Month"].unique())

[5, 6, 7, 8, 12]

In [19]:
# List the distinct Pandemic values present in the prediction data.
sorted(data_svm["Pandemic"].unique())

[2]

In [20]:
# List the distinct Grade values present in the prediction data.
sorted(data_svm["Grade"].unique())

['12세이상관람가', '15세이상관람가', '전체관람가', '청소년관람불가']

In [21]:
# List the distinct Main_Country values present in the prediction data.
sorted(data_svm["Main_Country"].unique())

['기타', '미국', '일본', '한국']

In [22]:
# One-hot encode the four categorical columns. Only levels that occur in this
# data get a dummy column, so levels seen only in training are still absent here.
data_svm = pd.get_dummies(data_svm, columns=['Month', 'Pandemic', 'Grade', 'Main_Country'])

In [23]:
# Inspect the column names produced by the one-hot encoding.
data_svm.columns

Index(['wk1_Audience', 'wk1_AudiencePerShow', 'wk2_Audience',
       'wk2_AudiencePerShow', 'Show_Change', 'opening_Ho_Retention',
       'wk1_Holiday_AudienceMean', 'wk1_Holiday_ShowMean',
       'wk2_Holiday_AudienceMean', 'wk2_Holiday_ShowMean',
       'opening_AudienceStd', 'dist_big_flop', 'dist_big_hit',
       'dist_small_flop', 'dist_small_hit', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_12', 'Pandemic_2', 'Grade_12세이상관람가', 'Grade_15세이상관람가',
       'Grade_전체관람가', 'Grade_청소년관람불가', 'Main_Country_기타', 'Main_Country_미국',
       'Main_Country_일본', 'Main_Country_한국'],
      dtype='object')

In [24]:
# Dummy columns that must exist in data_svm; any that are absent are added as zeros next.
result_onehot_cols = [
    # Month levels
    'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
    'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
    # Pandemic levels
    'Pandemic_1', 'Pandemic_2',
    # Grade levels
    'Grade_15세이상관람가', 'Grade_전체관람가', 'Grade_청소년관람불가',
    # Main_Country levels
    'Main_Country_미국', 'Main_Country_일본', 'Main_Country_한국',
]

In [25]:
# Expected dummy columns that this data did not produce, in the expected order.
missing_cols = [
    expected for expected in result_onehot_cols
    if expected not in data_svm.columns
]
# Append each absent dummy as an all-zero column; nothing to do when none are missing.
if missing_cols:
    data_svm = data_svm.assign(**dict.fromkeys(missing_cols, 0))

In [26]:
# Source categorical columns that were one-hot encoded earlier in the notebook.
onehot_cols = ['Month', 'Pandemic', 'Grade', 'Main_Country']
# Dummy columns are named "<source>_<level>". Match on the leading prefix only:
# a substring test would also pick up any column that merely contains e.g.
# "Month_" somewhere in the middle of its name.
onehot_prefixes = tuple(f"{source}_" for source in onehot_cols)
onehot_columns = [col for col in data_svm.columns if col.startswith(onehot_prefixes)]
# Cast every dummy column to int so all of them share a single integer dtype.
data_svm[onehot_columns] = data_svm[onehot_columns].astype(int)

In [27]:
# Input columns for the first PCA object; the order is kept as given.
pca_1_cols = [
    'wk2_AudiencePerShow',
    'wk1_AudiencePerShow',
]
# Input columns for the second PCA object; the order is kept as given.
pca_2_cols = [
    'wk1_Holiday_AudienceMean',
    'wk2_Holiday_AudienceMean',
    'wk1_Audience',
    'opening_AudienceStd',
    'wk2_Audience',
    'wk2_Holiday_ShowMean',
    'wk1_Holiday_ShowMean',
]

In [28]:
# Project each column group with its PCA object and wrap the result in a
# one-column frame that shares data_svm's index, so it can be joined back by row.
df_pca_1 = pd.DataFrame(
    data=pca_1.transform(data_svm[pca_1_cols]),
    index=data_svm.index,
    columns=["PC1"],
)
df_pca_2 = pd.DataFrame(
    data=pca_2.transform(data_svm[pca_2_cols]),
    index=data_svm.index,
    columns=["PC2"],
)

In [29]:
# Swap the raw PCA input columns for their components: drop the inputs, then
# append PC1 and PC2 as the last two columns.
data_svm = pd.concat(
    [data_svm.drop(columns=pca_1_cols + pca_2_cols), df_pca_1, df_pca_2],
    axis=1,
)

In [30]:
# Column names, in order, that data_svm is reduced to before it is passed to the SVM.
origin_train_cols = [
    # Scaled numeric columns that are not PCA inputs.
    'Show_Change', 'opening_Ho_Retention',
    # Distributor indicator columns.
    'dist_big_flop', 'dist_big_hit', 'dist_small_flop', 'dist_small_hit',
    # Month dummies.
    'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
    'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
    # Pandemic dummies.
    'Pandemic_1', 'Pandemic_2',
    # Grade dummies.
    'Grade_15세이상관람가', 'Grade_전체관람가', 'Grade_청소년관람불가',
    # Main_Country dummies.
    'Main_Country_미국', 'Main_Country_일본', 'Main_Country_한국',
    # PCA components.
    'PC1', 'PC2',
]

In [31]:
# Columns currently in data_svm that are not part of origin_train_cols.
set(data_svm.columns) - set(origin_train_cols)

{'Grade_12세이상관람가', 'Main_Country_기타'}

In [32]:
# Remove the two dummy columns that are not in origin_train_cols.
# NOTE(review): the names are hard-coded from the set difference shown for this
# data; a different input file could produce other extra columns.
data_svm.drop(columns=['Grade_12세이상관람가', 'Main_Country_기타'], inplace=True)

In [33]:
# Select the columns in origin_train_cols order; raises KeyError if any listed column is absent.
data_svm = data_svm[origin_train_cols]

# 예측 진행

In [34]:
# For each 10k model, take exactly the columns named by that model, in the model's own order.
data_cb_10k = data.loc[:, cb_10k_features]
data_lgb_10k = data.loc[:, lgb_10k_features]

## 10k

### CatBoost

In [35]:
# Wrap the CatBoost inputs in a Pool once and reuse it for both prediction calls.
cb_10k_pool = Pool(data=data_cb_10k)
# Predicted class labels.
cb_10k_pred = cb_10k.predict(cb_10k_pool)
# Second column of predict_proba, used as the positive-class probability.
cb_10k_proba = cb_10k.predict_proba(cb_10k_pool)[:, 1]

### LightGBM

In [36]:
# Raw booster output; with the binary objective this is presumably the
# positive-class probability for each row — confirm against the LightGBM docs.
lgbm_10k_proba = lgb_10k.predict(data_lgb_10k)
# Label a row 1 when its score reaches the 0.5 threshold, otherwise 0.
lgbm_10k_pred = np.where(lgbm_10k_proba >= 0.5, 1, 0)

### 결과

In [37]:
# Side-by-side table of the 10k predictions: one row per movie, with each
# model's label and its probability rounded to four decimals.
result_10k = pd.DataFrame(
    {
        'Movie_Title': data['Movie_Title'],
        'cb_10k_pred': cb_10k_pred,
        'cb_10k_proba': cb_10k_proba.round(4),
        'lgbm_10k_pred': lgbm_10k_pred,
        'lgbm_10k_proba': lgbm_10k_proba.round(4),
    }
)
# Final expression so the notebook renders the table.
result_10k

Unnamed: 0,Movie_Title,cb_10k_pred,cb_10k_proba,lgbm_10k_pred,lgbm_10k_proba
0,F1 더 무비,0,0.0,0,0.0012
1,강령: 귀신놀이,0,0.0,0,0.0007
2,극장판 귀멸의 칼날: 무한성편,0,0.0,0,0.0013
3,긴키 지방의 어느 장소에 대하여,0,0.0,0,0.0012
4,꼬마마법사 주니토니,0,0.0,0,0.0012
5,"나의 아픈, 사랑이야기",0,0.0182,0,0.0402
6,너는 나를 불태워,1,1.0,1,0.9984
7,노바디2,0,0.0,0,0.0009
8,노이즈,0,0.0,0,0.0009
9,더 폴: 디렉터스 컷,0,0.0,0,0.001


## 5m

In [38]:
# Predict labels with the loaded 5m SVM model on the transformed, column-aligned frame.
svm_5m_pred = svm_5m.predict(data_svm)

In [39]:
# Table of the 5m predictions: one row per movie with the SVM's predicted label.
# NOTE(review): the column key says 'cb_5m_pred' but the values come from the SVM
# model; consider renaming it (e.g. 'svm_5m_pred') and re-running the notebook so
# the recorded output matches.
result_5m = pd.DataFrame({
    'Movie_Title': data['Movie_Title'],
    'cb_5m_pred': svm_5m_pred
})

In [40]:
# Display the 5m prediction table.
result_5m

Unnamed: 0,Movie_Title,cb_5m_pred
0,F1 더 무비,0
1,강령: 귀신놀이,0
2,극장판 귀멸의 칼날: 무한성편,0
3,긴키 지방의 어느 장소에 대하여,0
4,꼬마마법사 주니토니,0
5,"나의 아픈, 사랑이야기",0
6,너는 나를 불태워,0
7,노바디2,0
8,노이즈,0
9,더 폴: 디렉터스 컷,0
